From bd61cc63b26b7c5ef84fce9a3d095bb6cc374757 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 12:30:23 +0100 Subject: [PATCH 001/194] recognize: try to self.resolve_resource model --- ocrd_cis/ocropy/recognize.py | 28 +++++++++++++++++----------- setup.py | 2 +- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 8d24b9d0..8ebddca3 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +import sys import os.path import numpy as np from PIL import Image @@ -102,19 +103,24 @@ def setup(self): x.allocate(5000) def get_model(self): - """Search for the model file. First checks if - parameter['model'] is a valid readeable file and returns it. - If not, it checks if the model can be found in the + """Search for the model file. First checks if parameter['model'] can + be resolved with OcrdResourceManager to a valid readeable file and + returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) - model = self.parameter['model'] - if canread(model): - return model - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', model) - if canread(path): - return path - return model + try: + model = self.resolve_resource(self.parameter['model']) + if canread(model): + return model + except SystemExit: + ocropydir = os.path.dirname(os.path.abspath(__file__)) + path = os.path.join(ocropydir, 'models', self.parameter['model']) + self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + if canread(path): + return path + self.logger.error("Could not find model %s. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", + self.parameter['model'], self.parameter['model']) + sys.exit(1) def process(self): """Recognize lines / words / glyphs of the workspace. diff --git a/setup.py b/setup.py index 11bbf0a6..b60d3f2f 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.13', + 'ocrd>=2.22.3', 'click', 'scipy', 'numpy>=1.17.0', From db584d8342beed51f785cbf857f1e3c6881b6116 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 11 Mar 2022 19:39:37 +0100 Subject: [PATCH 002/194] resegment: fix method=baseline --- ocrd_cis/ocropy/resegment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9d661b2..ee7f55b2 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -266,9 +266,10 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue line_polygon = baseline_of_segment(line, parent_coords) line_ltr = line_polygon[0,0] < line_polygon[-1,0] - line_polygon = make_valid(join_polygons(LineString(line_polygon).buffer( + line_polygon = make_valid(join_polygons([LineString(line_polygon).buffer( # left-hand side if left-to-right, and vice versa - scale * (-1) ** line_ltr, single_sided=True), loc=line.id)) + scale * (-1) ** line_ltr, single_sided=True)], + loc=line.id, scale=scale)) line_polygon = np.array(line_polygon.exterior, np.int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], From 56affe216a0cbdb559e642c05aec4bdee4ecc617 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 11 Mar 2022 19:41:10 +0100 Subject: [PATCH 003/194] resegment: join_polygons: allow non-contiguous input, too --- ocrd_cis/ocropy/resegment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index ee7f55b2..4456a8e9 100644 --- 
a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import os.path +from itertools import chain import numpy as np from skimage import draw from shapely.geometry import Polygon, asPolygon, LineString @@ -482,6 +483,10 @@ def join_polygons(polygons, loc=''): # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull LOG = getLogger('processor.OcropyResegment') + polygons = list(chain.from_iterable([ + poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])) if len(polygons) == 1: return polygons[0] # get equidistant list of points along hull From b856f5b75ad0b3c61e0e6acf06599da7460022ac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 11 Mar 2022 19:43:37 +0100 Subject: [PATCH 004/194] resegment: join_polygons: make equidistant points relative to estimated scale --- ocrd_cis/ocropy/resegment.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 4456a8e9..4bcc203a 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -385,7 +385,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], loc=line.id) + new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], + loc=line.id, scale=scale) line_polygons[j] = new_polygon # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], @@ -460,7 +461,8 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, else: # get alpha shape poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours], loc=line.id) 
+ for contour in contours], + loc=line.id, scale=scale) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) @@ -478,7 +480,7 @@ def diff_polygons(poly1, poly2): poly = make_valid(poly) return poly -def join_polygons(polygons, loc=''): +def join_polygons(polygons, loc='', scale=20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull @@ -493,14 +495,14 @@ def join_polygons(polygons, loc=''): # (otherwise alphashape will jump across the interior) points = [poly.exterior.interpolate(dist).coords[0] # .xy for poly in polygons - for dist in np.arange(0, poly.length, 5.0)] + for dist in np.arange(0, poly.length, scale / 2)] #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.05 + alpha = 0.03 jointp = alphashape.alphashape(points, alpha) tries = 0 # from descartes import PolygonPatch # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection']: + while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): # plt.figure() # plt.gca().scatter(*zip(*points)) # for geom in jointp.geoms: From d75e58da30cd681c0be37424f43c2859f64da220 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 25 Mar 2022 15:27:46 +0100 Subject: [PATCH 005/194] update to shapely 1.8 --- ocrd_cis/ocropy/common.py | 2 +- ocrd_cis/ocropy/resegment.py | 16 ++++++++-------- ocrd_cis/ocropy/segment.py | 11 ++++++----- setup.py | 2 +- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index d84e42b3..dc8ed20c 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1141,7 +1141,7 @@ def compute_segmentation(binary, LOG.debug('sorting labels by reading order') llabels = morph.reading_order(llabels,rl,bt)[llabels] DSAVE('llabels_ordered', llabels) - + #segmentation = 
llabels*binary #return segmentation return llabels, hlines, vlines, images, colseps, scale diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 4bcc203a..85da6c32 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -4,7 +4,7 @@ from itertools import chain import numpy as np from skimage import draw -from shapely.geometry import Polygon, asPolygon, LineString +from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union import alphashape @@ -209,7 +209,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] # draw.polygon: If any segment_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. 
The caller does not need @@ -224,7 +224,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = False @@ -271,7 +271,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # left-hand side if left-to-right, and vice versa scale * (-1) ** line_ltr, single_sided=True)], loc=line.id, scale=scale)) - line_polygon = np.array(line_polygon.exterior, np.int)[:-1] + line_polygon = np.array(line_polygon.exterior.coords, np.int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) @@ -284,8 +284,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.warning('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + LOG.error('Cannot line-segment %s "%s": %s', + tag, page_id if fullpage else parent.id, err) return LOG.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) @@ -476,7 +476,7 @@ def diff_polygons(poly1, poly2): if poly.type == 'MultiPolygon': poly = poly.convex_hull if poly.minimum_clearance < 1.0: - poly = asPolygon(np.round(poly.exterior.coords)) + poly = Polygon(np.round(poly.exterior.coords)) poly = make_valid(poly) return poly @@ -517,7 +517,7 @@ def join_polygons(polygons, loc='', scale=20): if jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so 
anticipate rounding here and then ensure validity - jointp = asPolygon(np.round(jointp.exterior.coords)) + jointp = Polygon(np.round(jointp.exterior.coords)) jointp = make_valid(jointp) return jointp diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b782fdde..eeaccf2d 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -5,7 +5,7 @@ from skimage import draw from skimage.morphology import convex_hull_image import cv2 -from shapely.geometry import Polygon, asPolygon +from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union @@ -125,15 +125,16 @@ def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): polygon = polygon.simplify(tolerance) if polygon.is_valid: break - polygon = polygon.exterior.coords[:-1] # keep open - if len(polygon) < 4: + poly = polygon.exterior.coords[:-1] # keep open + if len(poly) < 4: LOG.warning('Label %d contour %d has less than 4 points for %s', label, i, name) continue - results.append((label, polygon)) + results.append((label, poly)) result_labels[contour_labels == i+1] = len(results) return results, result_labels + class OcropySegment(Processor): def __init__(self, *args, **kwargs): @@ -761,7 +762,7 @@ def make_intersection(poly1, poly2): if interp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity - interp = asPolygon(np.round(interp.exterior.coords)) + interp = Polygon(np.round(interp.exterior.coords)) interp = make_valid(interp) return interp diff --git a/setup.py b/setup.py index 72e11280..a0c371ed 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ 'scipy', 'numpy>=1.17.0', 'pillow>=7.1.2', - 'shapely>=1.7.1,<1.8', + 'shapely>=1.7.1', 'scikit-image', 'alphashape', 'opencv-python-headless', From 6a06f36238589f8b35d43f7c9be5d707e97fb97a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Mar 2022 13:08:55 +0100 
Subject: [PATCH 006/194] fix Workspace.save_image_file args --- ocrd_cis/ocropy/binarize.py | 18 ++++++------------ ocrd_cis/ocropy/clip.py | 6 ++---- ocrd_cis/ocropy/denoise.py | 6 ++---- ocrd_cis/ocropy/deskew.py | 6 ++---- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/segment.py | 10 ++++------ 6 files changed, 18 insertions(+), 32 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 6092d3d5..872185c3 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -213,10 +213,8 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): file_id += '.IMG-BIN' features += ',binarized' file_path = self.workspace.save_image_file( - bin_image, - file_id, - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id, self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType( filename=file_path, @@ -263,10 +261,8 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ file_id += '.IMG-BIN' features += ',binarized' file_path = self.workspace.save_image_file( - bin_image, - file_id, - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id, self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): region.add_AlternativeImage(AlternativeImageType( filename=file_path, @@ -306,10 +302,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi file_id += '.IMG-BIN' features += ',binarized' file_path = self.workspace.save_image_file( - bin_image, - file_id, - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id, self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): line.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 8f84efe6..a305f09e 100644 --- a/ocrd_cis/ocropy/clip.py +++ 
b/ocrd_cis/ocropy/clip.py @@ -257,10 +257,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_image = crop_image(segment_image,box=segment_bbox) # update METS (add the image file): file_path = self.workspace.save_image_file( - segment_image, - file_id=file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + segment_image, file_id + '.IMG-CLIP', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 61a77141..cbbdf8cf 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -127,10 +127,8 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, f maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update METS (add the image file): file_path = self.workspace.save_image_file( - bin_image, - file_id + '.IMG-DESPECK', - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index aabbce3e..bb9904e0 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -142,10 +142,8 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p segment_coords['features'] += ',deskewed' # update METS (add the image file): file_path = self.workspace.save_image_file( - segment_image, - file_id + '.IMG-DESKEW', - page_id=page_id, - file_grp=self.output_file_grp) + segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/dewarp.py 
b/ocrd_cis/ocropy/dewarp.py index dc083eaf..7d3251bf 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -168,8 +168,8 @@ def process(self): file_path = self.workspace.save_image_file( dew_image, file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - page_id=input_file.pageId, - file_grp=self.output_file_grp) + self.output_file_grp, + page_id=input_file.pageId) # update PAGE (reference the image file): alternative_image = line.get_AlternativeImage() line.add_AlternativeImage(AlternativeImageType( diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index eeaccf2d..7e94f495 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -669,9 +669,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, + page_id=page_id) element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) else: @@ -708,9 +707,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) From 2cdfa7949dbf2e965aa8f44e91cb2353994d9464 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 24 Mar 2022 12:37:59 +0100 Subject: [PATCH 007/194] revert e673544 
(crashes OpenCV) --- ocrd_cis/ocropy/ocrolib/morph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index a0170c43..75d86b69 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -21,7 +21,8 @@ def label(image,**kw): """ # default connectivity in OpenCV: 8 (which is equivalent to...) # default connectivity in scikit-image: 2 - n, labels = cv2.connectedComponents(image.astype(uint8), connectivity=4) + # connectivity=4 crashes (segfaults) OpenCV#21366 + n, labels = cv2.connectedComponents(image.astype(uint8)) #n, labels = cv2.connectedComponentsWithAlgorithm(image.astype(uint8), connectivity=4, ltype=2, ccltype=cv2.CCL_DEFAULT) return labels, n-1 # try: return measurements.label(image,**kw) From 8f6cfc54ada9bfb6be3a10ff1bb98f996b14e9f0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 18 Mar 2022 15:25:58 +0100 Subject: [PATCH 008/194] segment: annotate baselines, too --- ocrd_cis/ocropy/common.py | 81 +++++++++++++++++++++++- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 118 ++++++++++++++++++++++++++--------- 3 files changed, 167 insertions(+), 34 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index dc8ed20c..7afb03af 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -996,7 +996,9 @@ def h_compatible(obj1, obj2, center1, center2): # (which must be split anyway) # - with tighter polygonal spread around foreground # - with spread of line labels against separator labels +# - with baseline extraction # - return bg line and sep labels intead of just fg line labels +# - return baseline coords, too @checks(ABINARY2) def compute_segmentation(binary, zoom=1.0, @@ -1046,6 +1048,7 @@ def compute_segmentation(binary, foreground may remain unlabelled for separators and other non-text like small noise, or large drop-capitals / images), + - list of Numpy arrays of baseline 
coordinates [y, x points in lr order] - Numpy array of horizontal foreground lines mask, - Numpy array of vertical foreground lines mask, - Numpy array of large/non-text foreground component mask, @@ -1144,7 +1147,81 @@ def compute_segmentation(binary, #segmentation = llabels*binary #return segmentation - return llabels, hlines, vlines, images, colseps, scale + blines = compute_baselines(bottom, top, llabels, scale) + return llabels, blines, hlines, vlines, images, colseps, scale + +@checks(AFLOAT2,AFLOAT2,SEGMENTATION,NUMBER) +def compute_baselines(bottom, top, linelabels, scale, method='bottom'): + """Get the coordinates of baselines running along each bottom gradient peak.""" + seeds = linelabels > 0 + # smooth bottom+top maps horizontally for centerline estimation + bot = filters.gaussian_filter(bottom, (scale*0.25,scale), mode='constant') + top = filters.gaussian_filter(top, (scale*0.25,scale), mode='constant') + # idea: center is where bottom and top gradient meet in the middle + # (but between top and bottom, not between bottom and top) + # - calculation via numpy == or isclose is too fragile numerically: + #clines = np.isclose(top, bottom, rtol=0.5) & (np.diff(top - bottom, axis=0, append=0) < 0) + # - calculation via zero crossing of bop-bottom is more robust, + # but needs post-processing for lines with much larger height than scale + if method == 'center': + blines = (np.diff(np.sign(top - bottom), axis=0, append=0) < 0) & seeds + #DSAVE('centerlines', blines) + # - calculation via peak gradient + elif method == 'bottom': + bot1d = np.diff(bot, axis=0, append=0) + bot1d = np.diff(np.sign(bot1d), axis=0, append=0) < 0 + bot1d &= bot > 0 + #DSAVE('bot1d', bot1d) + blines = bot1d + baselabels, nbaselabels = morph.label(blines) + baseslices = [(slice(0,0),slice(0,0))] + morph.find_objects(baselabels) + # if multiple labels per seed, ignore the ones above others + # (can happen due to mis-estimation of scale) + corrs = morph.correspondences(linelabels, 
baselabels).T + labelmap = {} + #DSAVE('baselines', baselabels) + def partitions(adj, starti, startpart=None): + for i in range(starti, len(adj)): + if startpart is None: + yield from partitions(adj, i + 1, [i]) + elif all(adj[i][j] for j in startpart): + yield from partitions(adj, i + 1, [i] + startpart) + if startpart is not None: + yield startpart + for line in np.unique(linelabels): + if not line: continue # ignore bg line + corrinds = corrs[:, 0] == line + corrinds[corrs[:, 1] == 0] = False # ignore bg baseline + if not np.any(corrinds): continue + corrinds = corrinds.nonzero()[0] + if len(corrinds) == 1: + labelmap.setdefault(line, list()).append(corrs[corrinds[0], 1]) + continue + nonoverlapping = ~np.eye(len(corrinds), dtype=np.bool) + for i, indi in enumerate(corrinds[:-1]): + baselabeli = corrs[indi, 1] + baseslicei = baseslices[baselabeli] + for j, indj in enumerate(corrinds[i + 1:], i + 1): + baselabelj = corrs[indj, 1] + baseslicej = baseslices[baselabelj] + if sl.xoverlaps(baseslicei, baseslicej): + nonoverlapping[i, j] = False + nonoverlapping[j, i] = False + def pathlen(path): + return sum(corrs[corrinds[pos], 2] for pos in path) + corrgroups = sorted(partitions(nonoverlapping, 0), key=pathlen) + # select longest path + corrinds = corrinds[corrgroups[-1]] + labelmap.setdefault(line, list()).extend(corrs[corrinds, 1]) + basepoints = [] + for line in np.unique(linelabels): + if line not in labelmap: continue + linemask = linelabels == line + points = [] + for label in labelmap[line]: + points.extend(list(zip(*np.where((baselabels == label) & linemask)))) + basepoints.append(points) + return basepoints # from ocropus-gpageseg, but # - on both foreground and background, @@ -1741,7 +1818,7 @@ def find_topological(): npartitions > len(gaps)+1 or # partitions without the cut still score better than after sum(map(sl.height if prefer_vertical else sl.width, - (morph.find_objects(partitions)))) > np.max( + morph.find_objects(partitions))) > np.max( 
partitionscores, initial=0))): # continue on each partition by suppressing the others, respectively order = morph.reading_order(partitions,rl,bt) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 85da6c32..81166432 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -280,7 +280,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l scale=scale, loc=parent.id, threshold=threshold) return try: - new_line_labels, _, _, _, _, scale = compute_segmentation( + new_line_labels, _, _, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 7e94f495..4f0e87e4 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -19,6 +19,7 @@ AlternativeImageType ) from ocrd_models.ocrd_page_generateds import ( + BaselineType, TableRegionType, ImageRegionType, RegionRefType, @@ -55,30 +56,38 @@ TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, + (optionally) a Numpy array of a scalar field ``baselines``, and a Numpy array of the foreground ``fg_bin``, iterate through all labels (except zero and those labels which do not correspond to any foreground at all) to find - their outer contours. Each contour part which is not too - small and gives a (simplified) polygon of at least 4 points - becomes a polygon. (Thus, labels can be split into multiple - polygons.) + their outer contours and inner baselines. + Each contour part which is not too small and gives a + (simplified) polygon of at least 4 points becomes a polygon. 
+ (Thus, labels can be split into multiple polygons.) Return a tuple: - - these polygons as a list of label, polygon tuples, and + - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. """ LOG = getLogger('processor.OcropySegment') + # find sharp baseline + if baselines is not None: + def getx(xy): + return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) + for line in baselines + if len(line) >= 2] results = list() result_labels = np.zeros_like(bg_labels, dtype=bg_labels.dtype) for label in np.unique(bg_labels): if not label: # ignore if background continue - bg_mask = np.array(bg_labels == label, np.uint8) + bg_mask = np.array(bg_labels == label, np.bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground LOG.debug('skipping label %d in %s due to empty fg', @@ -86,16 +95,16 @@ def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): continue # simplify to convex hull if simplify is not None: - hull = convex_hull_image(bg_mask).astype(np.uint8) - conflicts = np.setdiff1d((hull>0) * simplify, - (bg_mask>0) * simplify) + hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(np.bool) + conflicts = np.setdiff1d(hull * simplify, + bg_mask * simplify) if conflicts.any(): LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', label, str(conflicts)) else: bg_mask = hull # find outer contour (parts): - contours, _ = cv2.findContours(bg_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # determine areas of parts: areas = [cv2.contourArea(contour) for contour in contours] total_area = sum(areas) @@ -130,7 +139,49 @@ def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): LOG.warning('Label %d contour %d has less than 4 points for %s', label, i, name) continue - 
results.append((label, poly)) + # get baseline segments intersecting with this line mask + # and concatenate them from left to right + if baselines is not None: + base = [] + for baseline in baselines: + baseline = baseline.intersection(polygon) + # post-process + if (baseline.is_empty or + baseline.type in ['Point', 'MultiPoint']): + continue + base_x = [pt[0] for pt in base] + base_left = min(base_x, default=0) + base_right = max(base_x, default=0) + left = baseline.bounds[0] + right = baseline.bounds[2] + if (baseline.type == 'GeometryCollection' or + baseline.type.startswith('Multi')): + # heterogeneous result: filter point + for geom in baseline.geoms: + if geom.type == 'Point': + continue + left = geom.bounds[0] + right = geom.bounds[2] + if left > base_right: + base.extend(geom.coords) + base_right = right + elif right < base_left: + base = list(geom.coords) + base + base_left = left + else: + LOG.warning("baseline part component crosses existing x") + continue + elif left > base_right: + base.extend(baseline.coords) + elif right < base_left: + base = list(baseline.coords) + base + else: + LOG.warning("baseline part crosses existing x") + continue + assert all(p1[0] < p2[0] for p1, p2 in zip(base[:-1],base[1:])), base + else: + base = None + results.append((label, poly, base)) result_labels[contour_labels == i+1] = len(results) return results, result_labels @@ -472,7 +523,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, try: if report: raise Exception(report) - line_labels, hlines, vlines, images, colseps, scale = compute_segmentation( + line_labels, baselines, hlines, vlines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin+ignore_labels)>0, @@ -568,17 +619,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, seps=np.maximum(sepmask, 
colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, element_bin, + regions, _ = masks2polygons(region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, element_bin, + lines, _ = masks2polygons(region_line_labels, baselines, element_bin, 'region "%s"' % element_id, min_area=640/zoom/zoom) # create new lines in new regions (allocating by intersection) - line_polys = [Polygon(polygon) for _, polygon in lines] - for _, region_polygon in regions: + line_polys = [Polygon(polygon) for _, polygon, _ in lines] + for _, region_polygon, _ in regions: region_poly = prep(Polygon(region_polygon)) # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(region_polygon, image, coords) @@ -598,7 +649,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, for i, line_poly in enumerate(line_polys): if not region_poly.intersects(line_poly): # .contains continue - line_label, line_polygon = lines[i] + line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) @@ -610,9 +661,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_no += 1 line_id = region_id + "_line%04d" % line_no LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType( - id=line_id, Coords=CoordsType( - points=points_from_polygon(line_polygon))) + line = TextLineType(id=line_id, + Coords=CoordsType(points=points_from_polygon(line_polygon))) + if line_baseline: + line_baseline = coordinates_for_segment(line_baseline, image, coords) + 
line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) region.add_TextLine(line) # if the region has received text lines, keep it if region.get_TextLine(): @@ -627,9 +680,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, LOG.info('Found %d large non-text/image regions for %s "%s"', num_images, element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(image_labels, element_bin, + image_polygons, _ = masks2polygons(image_labels, None, element_bin, '%s "%s"' % (element_name, element_id)) - for image_label, polygon in image_polygons: + for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) @@ -648,11 +701,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines, num_vlines, element_name, element_id) # find contours around region labels (can be non-contiguous): - hline_polygons, _ = masks2polygons(hline_labels, element_bin, + hline_polygons, _ = masks2polygons(hline_labels, None, element_bin, '%s "%s"' % (element_name, element_id)) - vline_polygons, _ = masks2polygons(vline_labels, element_bin, + vline_polygons, _ = masks2polygons(vline_labels, None, element_bin, '%s "%s"' % (element_name, element_id)) - for _, polygon in hline_polygons + vline_polygons: + for _, polygon, _ in hline_polygons + vline_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) @@ -683,11 +736,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around 
labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, element_bin, + line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, 'region "%s"' % element_id, min_area=640/zoom/zoom) line_no = 0 - for line_label, polygon in line_polygons: + for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) @@ -698,9 +751,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # annotate result: line_no += 1 line_id = element_id + "_line%04d" % line_no - element.add_TextLine(TextLineType( - id=line_id, Coords=CoordsType( - points=points_from_polygon(line_polygon)))) + line = TextLineType(id=line_id, + Coords=CoordsType(points=points_from_polygon(line_polygon))) + if baseline: + line_baseline = coordinates_for_segment(baseline, image, coords) + line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) + element.add_TextLine(line) if not sep_bin.any(): return # no derived image # annotate a text/image-separated image From b50c51b3bd575e262d17289aca881300390792a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 6 Apr 2022 02:24:30 +0200 Subject: [PATCH 009/194] segment: fix lines2regions non-continguous partitions --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 7afb03af..b0d97594 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1818,7 +1818,7 @@ def find_topological(): npartitions > len(gaps)+1 or # partitions without the cut still score better than after sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(partitions))) > np.max( + filter(None, morph.find_objects(partitions)))) > np.max( partitionscores, initial=0))): # continue on each partition by suppressing the others, 
respectively order = morph.reading_order(partitions,rl,bt) From c4eaf3d44a649ed5e1ee93ad7c5a19020967f0c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 6 Apr 2022 22:38:05 +0200 Subject: [PATCH 010/194] =?UTF-8?q?re/segment:=20alpha=20shape:=20smaller?= =?UTF-8?q?=20=CE=B1=20to=20avoid=20holes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_cis/ocropy/resegment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 81166432..b0bd1d4e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -259,6 +259,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # use depth to flatten overlapping lines as seed labels new_labels = np.argmax(distances, axis=0) else: + # 'baseline' new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: @@ -497,7 +498,7 @@ def join_polygons(polygons, loc='', scale=20): for poly in polygons for dist in np.arange(0, poly.length, scale / 2)] #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.03 + alpha = 0.01 jointp = alphashape.alphashape(points, alpha) tries = 0 # from descartes import PolygonPatch From 8b0e7b87326463aff326e8be3162c0d925cce521 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 6 Apr 2022 22:39:04 +0200 Subject: [PATCH 011/194] resegment (ccomps/baseline): propagate/spread twice to catch diacritics/punctuation, too --- ocrd_cis/ocropy/resegment.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index b0bd1d4e..56840258 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -31,7 +31,7 @@ from .common import ( pil2array, odd, - # DSAVE, + DSAVE, # binarize, check_page, check_region, @@ -294,8 +294,8 @@ def _process_segment(self, parent, 
parent_image, parent_coords, page_id, zoom, l new_line_polygons, new_line_labels = masks2polygons( new_line_labels, parent_bin, '%s "%s"' % (tag, parent.id), min_area=640/zoom/zoom) - # DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) - # DSAVE('new_line_labels', [new_line_labels, parent_bin], disabled=False) + DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) + DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons = [make_valid(Polygon(line_poly)) for line_label, line_poly in new_line_polygons] # polygons for intersecting pairs @@ -421,12 +421,20 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, scale=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" LOG = getLogger('processor.OcropyResegment') + DSAVE('baseline-seeds', [new_labels, (components>0)]) # allocate to connected components consistently (by majority, # ignoring smallest components like punctuation) #new_labels = morph.propagate_labels_majority(binarized, new_labels) new_labels = morph.propagate_labels_majority(components > 0, new_labels) + DSAVE('majority-propagated', [new_labels, (components>0) & (new_labels==0)]) # dilate/grow labels from connected components against each other and bg + new_labels = morph.spread_labels(new_labels, maxdist=scale*2) + DSAVE('scale-spread', [new_labels, (components>0)]) + # now propagate again to catch smallest components like punctuation + new_labels = morph.propagate_labels_majority(components > 0, new_labels) + DSAVE('propagated-again', [new_labels, (components>0) & (new_labels==0)]) new_labels = morph.spread_labels(new_labels, maxdist=scale/2) + DSAVE('spread-again', [new_labels, (components>0)]) # find polygon hull and modify line coords for i, line in enumerate(lines): new_label = new_labels == i + 1 From 0f359d09fee3a3873ed10c44d7a7e243a6a566a7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 
7 Apr 2022 00:36:58 +0200 Subject: [PATCH 012/194] resegment: if method=lineest, then annotate baselines, too --- ocrd_cis/ocropy/resegment.py | 78 +++++--------------- ocrd_cis/ocropy/segment.py | 134 +++++++++++++++++++++++++---------- 2 files changed, 115 insertions(+), 97 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 56840258..997c68f0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,17 +1,15 @@ from __future__ import absolute_import import os.path -from itertools import chain import numpy as np from skimage import draw from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union -import alphashape from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, PageType + to_xml, PageType, BaselineType ) from ocrd import Processor from ocrd_utils import ( @@ -42,7 +40,10 @@ masks2polygons, polygon_for_parent, make_valid, - make_intersection + make_intersection, + join_baselines, + join_polygons, + diff_polygons ) TOOL = 'ocrd-cis-ocropy-resegment' @@ -281,7 +282,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l scale=scale, loc=parent.id, threshold=threshold) return try: - new_line_labels, _, _, _, _, _, scale = compute_segmentation( + new_line_labels, new_baselines, _, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: @@ -292,12 +293,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( - new_line_labels, parent_bin, '%s "%s"' % (tag, parent.id), + new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), min_area=640/zoom/zoom) DSAVE('line_labels', 
[np.mean(line_labels, axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons = [make_valid(Polygon(line_poly)) - for line_label, line_poly in new_line_polygons] + new_line_polygons, new_baselines = zip(*[(make_valid(Polygon(line_poly)), LineString(baseline)) + for _, line_poly, baseline in new_line_polygons]) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -386,9 +387,11 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], - loc=line.id, scale=scale) + new_polygon = join_polygons([intersections[(i, j)] + for i in new_lines], loc=line.id, scale=scale) line_polygons[j] = new_polygon + new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) + for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) @@ -398,6 +401,10 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) + if new_baseline is not None: + new_baseline = coordinates_for_segment(new_baseline.coords, + parent_image, parent_coords) + line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) # now also ensure the assigned lines do not overlap other existing lines for i in new_lines: for otherj in np.nonzero(fits_fg[i] > 0.1)[0]: @@ -480,58 +487,9 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue line.get_Coords().set_points(points_from_polygon(polygon)) -def diff_polygons(poly1, poly2): - poly = poly1.difference(poly2) - if 
poly.type == 'MultiPolygon': - poly = poly.convex_hull - if poly.minimum_clearance < 1.0: - poly = Polygon(np.round(poly.exterior.coords)) - poly = make_valid(poly) - return poly - -def join_polygons(polygons, loc='', scale=20): - """construct concave hull (alpha shape) from input polygons""" - # compoundp = unary_union(polygons) - # jointp = compoundp.convex_hull - LOG = getLogger('processor.OcropyResegment') - polygons = list(chain.from_iterable([ - poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] - else [poly] - for poly in polygons])) - if len(polygons) == 1: - return polygons[0] - # get equidistant list of points along hull - # (otherwise alphashape will jump across the interior) - points = [poly.exterior.interpolate(dist).coords[0] # .xy - for poly in polygons - for dist in np.arange(0, poly.length, scale / 2)] - #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.01 - jointp = alphashape.alphashape(points, alpha) - tries = 0 - # from descartes import PolygonPatch - # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): - # plt.figure() - # plt.gca().scatter(*zip(*points)) - # for geom in jointp.geoms: - # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) - # plt.show() - alpha *= 0.7 - tries += 1 - if tries > 10: - LOG.warning("cannot find alpha for concave hull on '%s'", loc) - alpha = 0 - jointp = alphashape.alphashape(points, alpha) - if jointp.minimum_clearance < 1.0: - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - jointp = Polygon(np.round(jointp.exterior.coords)) - jointp = make_valid(jointp) - return jointp - # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): line = np.array(polygon_from_points(segment.get_Baseline().points)) line = transform_coordinates(line, coords['transform']) return np.round(line).astype(np.int32) + diff --git a/ocrd_cis/ocropy/segment.py 
b/ocrd_cis/ocropy/segment.py index 4f0e87e4..11a018a5 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import os.path +from itertools import chain import numpy as np from skimage import draw from skimage.morphology import convex_hull_image @@ -8,6 +9,7 @@ from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union +import alphashape from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -142,43 +144,10 @@ def getx(xy): # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = [] - for baseline in baselines: - baseline = baseline.intersection(polygon) - # post-process - if (baseline.is_empty or - baseline.type in ['Point', 'MultiPoint']): - continue - base_x = [pt[0] for pt in base] - base_left = min(base_x, default=0) - base_right = max(base_x, default=0) - left = baseline.bounds[0] - right = baseline.bounds[2] - if (baseline.type == 'GeometryCollection' or - baseline.type.startswith('Multi')): - # heterogeneous result: filter point - for geom in baseline.geoms: - if geom.type == 'Point': - continue - left = geom.bounds[0] - right = geom.bounds[2] - if left > base_right: - base.extend(geom.coords) - base_right = right - elif right < base_left: - base = list(geom.coords) + base - base_left = left - else: - LOG.warning("baseline part component crosses existing x") - continue - elif left > base_right: - base.extend(baseline.coords) - elif right < base_left: - base = list(baseline.coords) + base - else: - LOG.warning("baseline part crosses existing x") - continue - assert all(p1[0] < p2[0] for p1, p2 in zip(base[:-1],base[1:])), base + base = join_baselines([baseline.intersection(polygon) + for baseline in baselines], name) + if base is not None: + base = base.coords else: base = None results.append((label, poly, base)) @@ -834,6 
+803,97 @@ def make_valid(polygon): polygon = polygon.simplify(tolerance) return polygon +def diff_polygons(poly1, poly2): + poly = poly1.difference(poly2) + if poly.type == 'MultiPolygon': + poly = poly.convex_hull + if poly.minimum_clearance < 1.0: + poly = Polygon(np.round(poly.exterior.coords)) + poly = make_valid(poly) + return poly + +def join_polygons(polygons, loc='', scale=20): + """construct concave hull (alpha shape) from input polygons""" + # compoundp = unary_union(polygons) + # jointp = compoundp.convex_hull + LOG = getLogger('processor.OcropyResegment') + polygons = list(chain.from_iterable([ + poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])) + if len(polygons) == 1: + return polygons[0] + # get equidistant list of points along hull + # (otherwise alphashape will jump across the interior) + points = [poly.exterior.interpolate(dist).coords[0] # .xy + for poly in polygons + for dist in np.arange(0, poly.length, scale / 2)] + #alpha = alphashape.optimizealpha(points) # too slow + alpha = 0.01 + jointp = alphashape.alphashape(points, alpha) + tries = 0 + # from descartes import PolygonPatch + # import matplotlib.pyplot as plt + while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): + # plt.figure() + # plt.gca().scatter(*zip(*points)) + # for geom in jointp.geoms: + # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) + # plt.show() + alpha *= 0.7 + tries += 1 + if tries > 10: + LOG.warning("cannot find alpha for concave hull on '%s'", loc) + alpha = 0 + jointp = alphashape.alphashape(points, alpha) + if jointp.minimum_clearance < 1.0: + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + jointp = Polygon(np.round(jointp.exterior.coords)) + jointp = make_valid(jointp) + return jointp + +def join_baselines(baselines, loc=''): + LOG = getLogger('processor.OcropyResegment') + result = [] + for baseline in 
baselines: + if (baseline.is_empty or + baseline.type in ['Point', 'MultiPoint']): + continue + base_x = [pt[0] for pt in result] + base_left = min(base_x, default=0) + base_right = max(base_x, default=0) + left = baseline.bounds[0] + right = baseline.bounds[2] + if (baseline.type == 'GeometryCollection' or + baseline.type.startswith('Multi')): + # heterogeneous result: filter point + for geom in baseline.geoms: + if geom.type == 'Point': + continue + left = geom.bounds[0] + right = geom.bounds[2] + if left > base_right: + result.extend(geom.coords) + base_right = right + elif right < base_left: + result = list(geom.coords) + result + base_left = left + else: + LOG.warning("baseline part component crosses existing x in %s", loc) + continue + elif left > base_right: + result.extend(baseline.coords) + elif right < base_left: + result = list(baseline.coords) + result + else: + LOG.warning("baseline part crosses existing x in %s", loc) + continue + assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result + if not len(result): + return None + return LineString(result) + def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. From b3018eb0b176b24eb09a86d73537d55169de2f43 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 7 Apr 2022 14:28:50 +0200 Subject: [PATCH 013/194] ocrd-tool.json: typo cr{,e}ate --- ocrd_cis/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 74c0d0c9..91a8722b 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -439,7 +439,7 @@ }, "model": { "type": "string", - "description": "load model or crate new one (e.g. fraktur.pyrnn)" + "description": "load model or create new one (e.g. 
fraktur.pyrnn)" }, "ntrain": { "type": "number", From 97af16c769483f91461c41815b17cc371ff061f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Apr 2022 09:34:36 +0200 Subject: [PATCH 014/194] =?UTF-8?q?segment:=20rewrite=20separator=20detect?= =?UTF-8?q?ion=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit instead of detecting hlines and vlines independently, and via costly horizontal/vertical morphology operations, analyse image by medial axis transform (skeleton and distance transform of all connected components); then filter components that are too compact (inner vs outer size), also filter by statistics of distance along the skeleton: filter if too wide on average or too variant; then apply morphological closing to reconnect broken segments, linking only those components that roughly extend each other in the same direction; finally, sort by size and filter components that are too small in inner (skeleton length) or outer size (bbox diagonal), selecting only the topmost candidates; propagate from skeleton to full component and then spread a little into the background --- ocrd_cis/ocropy/common.py | 187 ++++++++++++++++++++++++++++------- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 33 +++---- 3 files changed, 163 insertions(+), 59 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index b0d97594..3b8d0f60 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -7,6 +7,7 @@ from scipy.ndimage import measurements, filters, interpolation, morphology from scipy import stats, signal #from skimage.morphology import convex_hull_image +from skimage.morphology import medial_axis from PIL import Image from . import ocrolib @@ -450,18 +451,18 @@ def on_press(event): @checks(ABINARY2,NUMBER) def compute_images(binary, scale, maximages=5): - """Finds (and removes) large connected foreground components. 
+ """Detects large connected foreground components that could be images. Parameters: - ``binary``, a bool or int array of the page image, with 1=black - ``scale``, square root of average bbox area of characters - - ``maximages``, maximum number of large components to keep + - ``maximages``, maximum number of images to find (This could be drop-capitals, line drawings or photos.) - Returns a same-size bool array as a mask image. + Returns a same-size image label array. """ if maximages == 0: - return binary == -1 + return np.zeros_like(binary, np.int) images = binary # d0 = odd(max(2,scale/5)) # d1 = odd(max(2,scale/8)) @@ -473,7 +474,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) DSAVE('images1_large', images+0.6*binary) if not images.any(): - return images > 0 + return np.zeros_like(binary, np.int) # 2- open horizontally and vertically to suppress # v/h-lines; these will be detected separately, # and it is dangerous to combine them into one @@ -498,14 +499,130 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): - return images > 0 + return np.zeros_like(binary, np.int) # 6- dilate a little to get a smooth contour without gaps dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 + images, _ = morph.label(images) DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... - return images > 0 + return images + +@checks(ABINARY2,NUMBER) +def compute_seplines(binary, scale, maxseps=0): + """Detects thin connected foreground components that could be separators. 
+ + Parameters: + - ``binary``, a bool or int array of the page image, with 1=black + - ``scale``, square root of average bbox area of characters + - ``maxseps``, maximum number of separators to find + (This could be horizontal, vertical or oblique, even slightly warped and discontinuous lines.) + + Returns a same-size separator label array. + """ + if maxseps == 0: + return np.zeros_like(binary, np.int) + skel, dist = medial_axis(binary, return_distance=True) + DSAVE("medial-axis", [dist, skel]) + labels, nlabels = morph.label(skel) + slices = [None] + morph.find_objects(labels) + DSAVE("skel-labels", labels) + # determine those components which could be separators + # (filter by compactness, and by mean+variance of distances) + sepmap = np.zeros(nlabels + 1, np.int) + numsep = 0 + sepsizes = [0] + sepslices = [None] + for label in range(1, nlabels + 1): + labelslice = slices[label] + labelmask = labels == label + labelsize = np.count_nonzero(labelmask) # sum of skel pixels, i.e. "inner length" + labellength = np.hypot(*sl.dims(labelslice)) # length of bbox diagonal, i.e. "outer length" + #LOG.debug("skel label %d has inner size %d and outer size %d", label, labelsize, labellength) + if labelsize > 1.4 * labellength: + # not long / stretched out / too compact + # todo: maybe just check aspect ratio for orthogonal lines? + continue + distances = dist[labelmask] + mean_dist = np.mean(distances) + var_dist = np.var(distances) + #LOG.debug("skel label %d has dist %.1f±%.2f", label, mean_dist, np.sqrt(var_dist)) + # todo: empirical analysis of ideal thresholds + if mean_dist < scale / 4 and var_dist < 0.3: + numsep += 1 + sepmap[label] = numsep + sepsizes.append(labelsize) + sepslices.append(labelslice) + # todo: we could also use the mean+var as a criterion to split components + # where the distance exceeds the threshold; e.g. 
vlines that touch + # letters or images + sepsizes = np.array(sepsizes) + sepslices = np.array(sepslices) + LOG.debug("detected %d separator candidates", numsep) + DSAVE("seps-raw", sepmap[labels]) + d0 = odd(max(1,scale/2)) + d1 = odd(max(1,scale/4)) + closed = morph.rb_closing(sepmap[labels] > 0, (d0,d1)) + DSAVE("seps-closed", [dist, closed]) + labels2, nlabels2 = morph.label(closed) + corrs = morph.correspondences(sepmap[labels], labels2, return_counts=False).T + corrmap = np.arange(numsep + 1) + for sep2 in range(1, nlabels2 + 1): + corrinds = corrs[:, 1] == sep2 + corrinds[corrs[:, 0] == 0] = False # ignore bg + corrinds = corrinds.nonzero()[0] + if len(corrinds) == 1: + continue + for i, indi in enumerate(corrinds[:-1]): + sepi = corrs[indi, 0] + labeli = np.flatnonzero(sepmap == sepi)[0] + slicei = slices[labeli] + lengthi = np.hypot(*sl.dims(slicei)) + for j, indj in enumerate(corrinds[i + 1:], i + 1): + sepj = corrs[indj, 0] + labelj = np.flatnonzero(sepmap == sepj)[0] + slicej = slices[labelj] + lengthj = np.hypot(*sl.dims(slicej)) + #inter = sl.intersect(slicei, slicej) + union = sl.union(slicei, slicej) + length = np.hypot(*sl.dims(union)) + if length > 0.9 * (lengthi + lengthj): + #if sl.empty(inter) or sl.area(inter) / sl.area(union) < 0.2: + corrmap[sepj] = corrmap[sepi] + _, corrmap = np.unique(corrmap, return_inverse=True) # make contiguous + numsep = corrmap.max() + LOG.debug("linked to %d separator candidates", numsep) + def union(slices): + if len(slices) > 1: + return sl.union(slices[0], union(slices[1:])) + return slices[0] + for sep in range(1, numsep + 1): + sepsizes[sep] = sum(sepsizes[corrmap == sep]) + sepslices[sep] = union(sepslices[corrmap == sep]) + sepsizes = sepsizes[:numsep + 1] + sepslices = sepslices[:numsep + 1] + seplengths = np.array([np.hypot(*sl.dims(sepslice)) if sepslice else 0 + for sepslice in sepslices]) + sepmap = corrmap[sepmap] + DSAVE("seps-raw-linked", sepmap[labels]) + # order by size, filter minsize and filter 
top maxseps + order = np.argsort(sepsizes)[::-1] + # no more than maxseps and no smaller than scale + minsize = np.flatnonzero((sepsizes[order] < scale) | (seplengths[order] < 3 * scale)) + if np.any(minsize): + maxseps = min(maxseps, minsize[0]) + maxseps = min(maxseps, numsep) + ordermap = np.zeros(numsep + 1, np.int) + ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) + sepmap = ordermap[sepmap] + DSAVE("sep-top", sepmap[labels]) + sepseeds = morph.propagate_labels_simple(binary, sepmap[labels]) + DSAVE("seps-top-propagated", sepseeds) + # FIXME: perhaps hclose / vclose first? + seplabels = morph.spread_labels(sepseeds, maxdist=scale / 2) + DSAVE("seps-top-spread", seplabels) + return seplabels # from ocropus-gpageseg, but with horizontal opening @deprecated @@ -974,20 +1091,20 @@ def h_compatible(obj1, obj2, center1, center2): relabel[relabel == label2] = new_label # apply re-assignments: seeds = relabel[seeds] - DSAVE("hmerge5_connected", seeds) + # DSAVE("hmerge5_connected", seeds) return seeds # from ocropus-gpageseg, but: # - with fullpage switch -# (opt-in for h/v-line and column detection), +# (opt-in for separator line and column detection), # - with external separator mask -# (opt-in for h/v-line pass-through) +# (opt-in for separator line pass-through) # - with zoom parameter # (make fixed dimension params relative to pixel density, # instead of blind 300 DPI assumption) -# - with improved h/v-line and column detection -# - with v-line detection _before_ column detection -# - with h/v-line suppression _after_ large component filtering +# - with improved separator line and column detection +# - with separator detection _before_ column detection +# - with separator suppression _after_ large component filtering # - with more robust line seed estimation, # - with horizontal merge instead of blur, # - with component majority for foreground @@ -1005,10 +1122,9 @@ def compute_segmentation(binary, fullpage=False, seps=None, maxcolseps=2, + csminheight=4, 
maxseps=0, maximages=0, - csminheight=4, - hlminwidth=10, spread_dist=None, rl=False, bt=False): @@ -1026,13 +1142,10 @@ def compute_segmentation(binary, - for up to ``maxcolseps`` multi-line vertical whitespaces (as column separators, counted piece-wise) of at least ``csminheight`` multiples of ``scale``, - - for up to ``maxseps`` vertical black lines - (as column separators, counted piece-wise) of at least - ``csminheight`` multiples of ``scale``, and - - for any number of horizontal lines of at least - ``hlminwidth`` multiples of ``scale``, + - for up to ``maxseps`` black separator lines (horizontal, vertical + or oblique; counted piece-wise), - for anything in ``seps`` if given, - then suppress these separator components and return them separately. + then suppress these non-text components and return them separately. Labels will be projected ("spread") from the foreground to the surrounding background within ``spread_dist`` distance (or half @@ -1049,8 +1162,7 @@ def compute_segmentation(binary, separators and other non-text like small noise, or large drop-capitals / images), - list of Numpy arrays of baseline coordinates [y, x points in lr order] - - Numpy array of horizontal foreground lines mask, - - Numpy array of vertical foreground lines mask, + - Numpy array of foreground separator lines mask, - Numpy array of large/non-text foreground component mask, - Numpy array of vertical background separators mask, - the estimated scale (i.e. median sqrt bbox area of glyph components). 
@@ -1062,18 +1174,17 @@ def compute_segmentation(binary, LOG.debug('height: %d, zoom: %.2f, scale: %d', binary.shape[0], zoom, scale) if fullpage: - LOG.debug('computing images') + LOG.debug('detecting images') images = compute_images(binary, scale, maximages=maximages) - LOG.debug('computing horizontal/vertical line separators') - hlines = compute_hlines(binary, scale, hlminwidth=hlminwidth, images=images) - vlines = compute_separators_morph(binary, scale, csminheight=csminheight, maxseps=maxseps, images=images) - binary = np.minimum(binary,1-hlines) - binary = np.minimum(binary,1-vlines) - binary = np.minimum(binary,1-images) + LOG.debug('detecting separators') + #hlines = compute_hlines(binary, scale, hlminwidth=hlminwidth, images=images) + #vlines = compute_separators_morph(binary, scale, csminheight=csminheight, maxseps=maxseps, images=images) + slines = compute_seplines(binary, scale, maxseps=maxseps) + binary = np.minimum(binary, 1 - (slines > 0)) + binary = np.minimum(binary, 1 - (images > 0)) else: - hlines = np.zeros_like(binary, np.bool) - vlines = np.zeros_like(binary, np.bool) - images = np.zeros_like(binary, np.bool) + slines = np.zeros_like(binary, np.uint8) + images = np.zeros_like(binary, np.uint8) if seps is not None and not seps.all(): # suppress separators/images for line estimation # (unless it encompasses the full image for some reason) @@ -1092,8 +1203,7 @@ def compute_segmentation(binary, # get a larger (closed) mask of all separators # (both bg boundary and fg line seps, detected # and passed in) to separate line/column labels - sepmask = np.maximum(hlines, vlines) - sepmask = np.maximum(sepmask, images) + sepmask = np.maximum(slines > 0, images > 0) sepmask = np.maximum(sepmask, colseps) if seps is not None: sepmask = np.maximum(sepmask, seps) @@ -1148,7 +1258,7 @@ def compute_segmentation(binary, #segmentation = llabels*binary #return segmentation blines = compute_baselines(bottom, top, llabels, scale) - return llabels, blines, hlines, 
vlines, images, colseps, scale + return llabels, blines, slines, images, colseps, scale @checks(AFLOAT2,AFLOAT2,SEGMENTATION,NUMBER) def compute_baselines(bottom, top, linelabels, scale, method='bottom'): @@ -1180,6 +1290,9 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): corrs = morph.correspondences(linelabels, baselabels).T labelmap = {} #DSAVE('baselines', baselabels) + # FIXME: this is slow and should be replace by some graph clustering algorithm + # (we want a permutation matrix which maximizes triangles in the adjacency matrix, + # then pick the triangle-subgraph with the largest sum of pixels at its nodes) def partitions(adj, starti, startpart=None): for i in range(starti, len(adj)): if startpart is None: @@ -1530,7 +1643,7 @@ def find_topological(): seplabs, counts = np.unique(seplab * bin, return_counts=True) kept = np.in1d(seplab.ravel(), seplabs[counts > scale * min_line]) seplab = seplab * kept.reshape(*seplab.shape) - DSAVE('seplab', seplab) + #DSAVE('seplab', seplab) sepobj = morph.find_objects(seplab) if not len(sepobj): return diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 997c68f0..f96f2750 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -282,7 +282,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l scale=scale, loc=parent.id, threshold=threshold) return try: - new_line_labels, new_baselines, _, _, _, _, scale = compute_segmentation( + new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 11a018a5..b70997e5 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -191,7 +191,7 @@ def process(self): When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting - up to 
``maximages`` large foreground images, - - up to ``maxseps`` foreground h/v-line separators and + - up to ``maxseps`` foreground line separators and - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. @@ -492,7 +492,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, try: if report: raise Exception(report) - line_labels, baselines, hlines, vlines, images, colseps, scale = compute_segmentation( + line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin+ignore_labels)>0, @@ -502,8 +502,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], maximages=self.parameter['maximages'] if element_name != 'table' else 0, - csminheight=self.parameter['csminheight'], - hlminwidth=self.parameter['hlminwidth']) + csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): LOG.error('Cannot line-segment region "%s": %s', element_id, err) @@ -526,8 +525,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (these cannot be split or grouped together with other regions) line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices - sepmask = np.maximum(np.maximum(hlines, vlines), - np.maximum(sep_bin, images)) + sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, @@ -645,11 +643,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, index = page_add_to_reading_order(rogroup, region.id, 
index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - image_labels, num_images = morph.label(images) - LOG.info('Found %d large non-text/image regions for %s "%s"', - num_images, element_name, element_id) + LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(image_labels, None, element_bin, + image_polygons, _ = masks2polygons(images, None, element_bin, '%s "%s"' % (element_name, element_id)) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: @@ -664,22 +660,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) - # split rulers into separator regions: - hline_labels, num_hlines = morph.label(hlines) - vline_labels, num_vlines = morph.label(vlines) - LOG.info('Found %d/%d h/v-lines for %s "%s"', - num_hlines, num_vlines, element_name, element_id) + # split detected separator labels into separator regions: + LOG.info('Found %d separator lines for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - hline_polygons, _ = masks2polygons(hline_labels, None, element_bin, - '%s "%s"' % (element_name, element_id)) - vline_polygons, _ = masks2polygons(vline_labels, None, element_bin, - '%s "%s"' % (element_name, element_id)) - for _, polygon, _ in hline_polygons + vline_polygons: + sep_polygons, _ = masks2polygons(seplines, None, element_bin, + '%s "%s"' % (element_name, element_id)) + for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - 
LOG.warning('Ignoring extant region contour for separator') + LOG.warning('Ignoring extant region contour for separator %d', sep_label) continue # annotate result: region_no += 1 From c10b692fcbff544e8a2e485fd00c51cfc7eeed42 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Apr 2022 09:35:39 +0200 Subject: [PATCH 015/194] segment: for more robust bg separator detection, combine criteria of gradient maximum and percentile --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3b8d0f60..062d792e 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -802,7 +802,7 @@ def compute_colseps_conv(binary, scale=1.0, csminheight=10, maxcolseps=2): grad = filters.gaussian_filter(1.0*binary,(scale,scale*0.5),order=(0,1)) grad = filters.uniform_filter(grad,(10.0*scale,1)) # csminheight DSAVE("colwsseps2_grad-raw",grad) - grad = (grad>0.5*np.amax(grad)) + grad = grad > np.minimum(0.5 * np.amax(grad), np.percentile(grad, 99.5)) DSAVE("colwsseps2_grad",grad) # combine dilated edges and whitespace seps = np.minimum(thresh,filters.maximum_filter(grad,(odd(10*scale),odd(5*scale)))) From 789361500fd7926476033bcf1ffc882eb0e6fb71 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 28 Apr 2022 22:28:53 +0200 Subject: [PATCH 016/194] remove Calamari dependency (not used, only CLI callout) --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a0c371ed..fc98d00d 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,7 @@ 'scikit-image', 'alphashape', 'opencv-python-headless', - 'python-Levenshtein', - 'calamari_ocr == 0.3.5' + 'python-Levenshtein' ], extras_require={ 'debug': ['matplotlib>3.0.0'], From ca15800f86e6ece0e390bd41a6b2d66295f5bc74 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 28 Apr 2022 22:29:27 +0200 Subject: [PATCH 017/194] join_polygons (alpha shape): make more robust --- 
ocrd_cis/ocropy/segment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b70997e5..8320deb5 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -818,14 +818,14 @@ def join_polygons(polygons, loc='', scale=20): # (otherwise alphashape will jump across the interior) points = [poly.exterior.interpolate(dist).coords[0] # .xy for poly in polygons - for dist in np.arange(0, poly.length, scale / 2)] + for dist in np.arange(0, poly.length, min(scale / 2, poly.length / 4))] #alpha = alphashape.optimizealpha(points) # too slow alpha = 0.01 jointp = alphashape.alphashape(points, alpha) tries = 0 # from descartes import PolygonPatch # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): + while jointp.is_empty or jointp.area == 0.0 or jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): # plt.figure() # plt.gca().scatter(*zip(*points)) # for geom in jointp.geoms: From b490d3f8144cd78ce23c19722bef5f72d10c9872 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Jun 2022 23:49:20 +0200 Subject: [PATCH 018/194] re/segment: join_polygons directly instead of alphashape --- ocrd_cis/ocropy/segment.py | 50 +++++++++++++++++--------------------- setup.py | 1 - 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 8320deb5..8ed2042c 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -3,13 +3,14 @@ import os.path from itertools import chain import numpy as np +from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw from skimage.morphology import convex_hull_image import cv2 from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union -import alphashape +from shapely.ops import unary_union, 
nearest_points +from shapely.validation import explain_validity from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -803,40 +804,33 @@ def diff_polygons(poly1, poly2): poly = make_valid(poly) return poly -def join_polygons(polygons, loc='', scale=20): +def join_polygons(polygons, scale=20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull - LOG = getLogger('processor.OcropyResegment') polygons = list(chain.from_iterable([ poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) - if len(polygons) == 1: + npoly = len(polygons) + if npoly == 1: return polygons[0] - # get equidistant list of points along hull - # (otherwise alphashape will jump across the interior) - points = [poly.exterior.interpolate(dist).coords[0] # .xy - for poly in polygons - for dist in np.arange(0, poly.length, min(scale / 2, poly.length / 4))] - #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.01 - jointp = alphashape.alphashape(points, alpha) - tries = 0 - # from descartes import PolygonPatch - # import matplotlib.pyplot as plt - while jointp.is_empty or jointp.area == 0.0 or jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): - # plt.figure() - # plt.gca().scatter(*zip(*points)) - # for geom in jointp.geoms: - # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) - # plt.show() - alpha *= 0.7 - tries += 1 - if tries > 10: - LOG.warning("cannot find alpha for concave hull on '%s'", loc) - alpha = 0 - jointp = alphashape.alphashape(points, alpha) + # find min-dist path through all polygons (travelling salesman) + pairs = itertools.combinations(range(npoly), 2) + dists = np.eye(npoly, dtype=float) + for i, j in pairs: + dists[i, j] = polygons[i].distance(polygons[j]) + dists[j, i] = dists[i, j] + dists = minimum_spanning_tree(dists, overwrite=True) + # add bridge polygons (where necessary) + 
for prevp, nextp in zip(*dists.nonzero()): + prevp = polygons[prevp] + nextp = polygons[nextp] + nearest = nearest_points(prevp, nextp) + bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + polygons.append(bridgep) + jointp = unary_union(polygons) + assert jointp.type == 'Polygon', jointp.wkt if jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity diff --git a/setup.py b/setup.py index fc98d00d..4a37603e 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ 'pillow>=7.1.2', 'shapely>=1.7.1', 'scikit-image', - 'alphashape', 'opencv-python-headless', 'python-Levenshtein' ], From 2bc033cce570ef32b172df9fb7f4e309970669b1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Jun 2022 23:51:58 +0200 Subject: [PATCH 019/194] re/segment: sort text lines in reading order --- ocrd_cis/ocropy/segment.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 8ed2042c..3fc150ff 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -59,7 +59,7 @@ TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, reorder=True): """Convert label masks into polygon coordinates. 
Given a Numpy array of background labels ``bg_labels``, @@ -114,11 +114,15 @@ def getx(xy): if not total_area: # ignore if too small continue - # sort contours in reading order + # redraw label array contour_labels = np.zeros_like(bg_mask, np.uint8) for i, contour in enumerate(contours): - cv2.drawContours(contour_labels, contours[i:i+1], -1, i+1, cv2.FILLED) - order = np.argsort(morph.reading_order(contour_labels)[1:]) + cv2.drawContours(contour_labels, contours, i, i+1, cv2.FILLED) + if reorder: + # sort contours in reading order + order = np.argsort(morph.reading_order(contour_labels)[1:]) + else: + order = range(len(contours)) # convert to polygons for i in order: contour = contours[i] @@ -133,14 +137,13 @@ def getx(xy): polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) - for tolerance in range(2, int(area)): - polygon = polygon.simplify(tolerance) - if polygon.is_valid: - break + if not polygon.is_valid: + #LOG.debug(polygon.wkt) + LOG.debug(explain_validity(polygon)) + polygon = make_valid(polygon) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d has less than 4 points for %s', - label, i, name) + LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) continue # get baseline segments intersecting with this line mask # and concatenate them from left to right @@ -572,7 +575,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, "region label %d has both existing regions and new lines (%s)" % ( region_label, str(region_line_labels0)) region = ignore[region_line_labels0[0] - 1] - if rogroup and region.parent_object_ == element and not isinstance(region, SeparatorRegionType): + if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) LOG.debug('Region label %d is for ignored region "%s"', region_label, 
region.id) @@ -662,10 +665,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separator lines for %s "%s"', seplines.max(), element_name, element_id) + LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, - '%s "%s"' % (element_name, element_id)) + '%s "%s"' % (element_name, element_id), + reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -772,8 +776,7 @@ def make_intersection(poly1, poly2): interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) if interp.type == 'MultiPolygon': # homogeneous result: construct convex hull to connect - # FIXME: construct concave hull / alpha shape - interp = interp.convex_hull + interp = join_polygons(interp.geoms) if interp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity From 956f4a399d1ac3a135544261ce2c5dfb86fcbbe3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:25:14 +0200 Subject: [PATCH 020/194] baseline extraction: partition by finding largest cliques --- ocrd_cis/ocropy/common.py | 24 +++++++----------------- setup.py | 1 + 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 062d792e..3a51d6e5 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -8,6 +8,7 @@ from scipy import stats, signal #from skimage.morphology import convex_hull_image from skimage.morphology import medial_axis +import networkx as nx from PIL import Image from . 
import ocrolib @@ -1281,7 +1282,7 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): bot1d = np.diff(bot, axis=0, append=0) bot1d = np.diff(np.sign(bot1d), axis=0, append=0) < 0 bot1d &= bot > 0 - #DSAVE('bot1d', bot1d) + DSAVE('bot1d', bot1d) blines = bot1d baselabels, nbaselabels = morph.label(blines) baseslices = [(slice(0,0),slice(0,0))] + morph.find_objects(baselabels) @@ -1289,18 +1290,7 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): # (can happen due to mis-estimation of scale) corrs = morph.correspondences(linelabels, baselabels).T labelmap = {} - #DSAVE('baselines', baselabels) - # FIXME: this is slow and should be replace by some graph clustering algorithm - # (we want a permutation matrix which maximizes triangles in the adjacency matrix, - # then pick the triangle-subgraph with the largest sum of pixels at its nodes) - def partitions(adj, starti, startpart=None): - for i in range(starti, len(adj)): - if startpart is None: - yield from partitions(adj, i + 1, [i]) - elif all(adj[i][j] for j in startpart): - yield from partitions(adj, i + 1, [i] + startpart) - if startpart is not None: - yield startpart + DSAVE('baselines-raw', baselabels) for line in np.unique(linelabels): if not line: continue # ignore bg line corrinds = corrs[:, 0] == line @@ -1320,11 +1310,11 @@ def partitions(adj, starti, startpart=None): if sl.xoverlaps(baseslicei, baseslicej): nonoverlapping[i, j] = False nonoverlapping[j, i] = False + # find all maximal cliques in the graph (i.e. 
all fully connected subgraphs) + # and then pick the partition with the largest sum of pixels at its nodes def pathlen(path): - return sum(corrs[corrinds[pos], 2] for pos in path) - corrgroups = sorted(partitions(nonoverlapping, 0), key=pathlen) - # select longest path - corrinds = corrinds[corrgroups[-1]] + return sum(corrs[corrinds[path], 2]) + corrinds = corrinds[max(nx.find_cliques(nx.Graph(nonoverlapping)), key=pathlen)] labelmap.setdefault(line, list()).extend(corrs[corrinds, 1]) basepoints = [] for line in np.unique(linelabels): diff --git a/setup.py b/setup.py index 4a37603e..a5e19979 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'pillow>=7.1.2', 'shapely>=1.7.1', 'scikit-image', + 'networkx', 'opencv-python-headless', 'python-Levenshtein' ], From 62d0729779d771a57fdd67d8057485ae6c4f6176 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:30:14 +0200 Subject: [PATCH 021/194] sepline detection linking: partition by finding largest cliques --- ocrd_cis/ocropy/common.py | 131 ++++++++++++++++++++++++++++++++------ 1 file changed, 112 insertions(+), 19 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3a51d6e5..a195a4c1 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -535,33 +535,112 @@ def compute_seplines(binary, scale, maxseps=0): numsep = 0 sepsizes = [0] sepslices = [None] + sepdists = [0] for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label labelsize = np.count_nonzero(labelmask) # sum of skel pixels, i.e. "inner length" + labelarea = sl.area(labelslice) + labelaspect = sl.aspect(labelslice) + if labelaspect > 1: + labelaspect = 1 / labelaspect labellength = np.hypot(*sl.dims(labelslice)) # length of bbox diagonal, i.e. 
"outer length" #LOG.debug("skel label %d has inner size %d and outer size %d", label, labelsize, labellength) - if labelsize > 1.4 * labellength: - # not long / stretched out / too compact - # todo: maybe just check aspect ratio for orthogonal lines? + if labelsize > 1.5 * labellength and labelaspect >= 0.1 and labelsize < 15 * scale: #and labelsize > 0.1 * labelarea + # not long / straight, but very compact continue distances = dist[labelmask] - mean_dist = np.mean(distances) - var_dist = np.var(distances) - #LOG.debug("skel label %d has dist %.1f±%.2f", label, mean_dist, np.sqrt(var_dist)) + avg_dist = np.median(distances) #np.mean(distances) + std_dist = np.std(distances) # todo: empirical analysis of ideal thresholds - if mean_dist < scale / 4 and var_dist < 0.3: - numsep += 1 - sepmap[label] = numsep - sepsizes.append(labelsize) - sepslices.append(labelslice) - # todo: we could also use the mean+var as a criterion to split components - # where the distance exceeds the threshold; e.g. vlines that touch - # letters or images + if avg_dist > scale / 4 or std_dist/avg_dist > 0.7: + continue + #LOG.debug("skel label %d has dist %.1f±%.2f", label, avg_dist, std_dist) + numsep += 1 + sepmap[label] = numsep + sepsizes.append(labelsize) + sepslices.append(labelslice) + sepdists.append(avg_dist) + if labelsize > 10 * scale and avg_dist > 0 and std_dist / avg_dist > 0.2: + # try to split this large label up along neighbouring spans of similar distances: + # (e.g. vlines that touch letters or images) + # 1. 
get optimal (by variability) spans as bin intervals, then merge largest spans + disthist, distedges = np.histogram(distances, bins='scott', density=True) # stone + disthist *= np.diff(distedges) # get probability masses + disthistlarge = disthist > 0.1 + if np.count_nonzero(disthistlarge) < 2: + continue # only 1 large bin + disthistlarge[-1] = True # ensure full interval + distedges = distedges[1:][disthistlarge] + disthist = np.cumsum(disthist)[disthistlarge] + disthist = np.diff(disthist, prepend=0) + distbin = np.digitize(distances, distedges, right=True) + # 2. now find connected components within bins, but map all tiny components + # to a single label so they can be replaced by their neighbours later-on + sublabels = np.zeros_like(labels) + sublabels[labelmask] = distbin + 1 + DSAVE("sublabels", sublabels) + sublabels2 = np.zeros_like(labels) + sublabel = 1 + sublabelmap = [0, 1] + for bin in range(len(distedges)): + binmask = sublabels == bin + 1 + binlabels, nbinlabels = morph.label(binmask) + _, binlabelcounts = np.unique(binlabels, return_counts=True) + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] + sublabels2[binmask & smallmask] = 1 + if not np.any(binmask & largemask): + continue + sublabels2[binmask & largemask] = binlabels[binmask & largemask] + sublabel + sublabel += nbinlabels + sublabelmap.extend(nbinlabels*[bin + 1]) + if sublabel == 1: + continue # only tiny sublabels here + DSAVE("sublabels_connected", sublabels2) + sublabelmap = np.array(sublabelmap) + # 3. 
finally, replace tiny components by nearest components, + # and recombine survivors to bin labels + smallmask = sublabels2 == 1 + sublabels2[smallmask] = 0 + sublabels2[smallmask] = morph.spread_labels(sublabels2)[smallmask] + sublabels = sublabelmap[sublabels2] + DSAVE("sublabels_final", sublabels) + # now apply as multiple separators + numsep -= 1 + sepmap[label] = 0 + slices[label] = None + sepsizes = sepsizes[:-1] + sepslices = sepslices[:-1] + sepdists = sepdists[:-1] + for sublabel in np.unique(sublabels[labelmask]): + sublabelmask = sublabels == sublabel + sublabelsize = np.count_nonzero(sublabelmask) + sublabelslice = sublabelmask.nonzero() + sublabelslice = sl.box(sublabelslice[0].min(), + sublabelslice[0].max(), + sublabelslice[1].min(), + sublabelslice[1].max()) + subdistances = dist[sublabelmask] + nlabels += 1 + numsep += 1 + sepmap = np.append(sepmap, numsep) + labels[sublabelmask] = nlabels + slices.append(sublabelslice) + sepsizes.append(sublabelsize) + sepslices.append(sublabelslice) + sepdists.append(np.median(subdistances)) + #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) sepslices = np.array(sepslices) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) + # now dilate+erode to link neighbouring candidates, + # but allow only such links which + # - stay consistent regarding avg/std width + # - do not enclose large areas in between + # - do not "change direction" (roughly adds up their diagonals) + # then combine mutual neighbourships to largest allowed partitions d0 = odd(max(1,scale/2)) d1 = odd(max(1,scale/4)) closed = morph.rb_closing(sepmap[labels] > 0, (d0,d1)) @@ -574,23 +653,37 @@ def compute_seplines(binary, scale, maxseps=0): corrinds[corrs[:, 0] == 0] = False # ignore bg corrinds = corrinds.nonzero()[0] if len(corrinds) == 1: - continue + continue # nothing to link + nonoverlapping = np.zeros((len(corrinds), 
len(corrinds)), dtype=np.bool) for i, indi in enumerate(corrinds[:-1]): sepi = corrs[indi, 0] labeli = np.flatnonzero(sepmap == sepi)[0] slicei = slices[labeli] lengthi = np.hypot(*sl.dims(slicei)) + areai = sl.area(slicei) for j, indj in enumerate(corrinds[i + 1:], i + 1): sepj = corrs[indj, 0] labelj = np.flatnonzero(sepmap == sepj)[0] slicej = slices[labelj] lengthj = np.hypot(*sl.dims(slicej)) - #inter = sl.intersect(slicei, slicej) + areaj = sl.area(slicej) union = sl.union(slicei, slicej) length = np.hypot(*sl.dims(union)) - if length > 0.9 * (lengthi + lengthj): - #if sl.empty(inter) or sl.area(inter) / sl.area(union) < 0.2: - corrmap[sepj] = corrmap[sepi] + if length < 0.9 * (lengthi + lengthj): + continue + if sl.area(union) > 1.3 * (areai + areaj): + continue + if not (0.8 < sepdists[sepi] / sepdists[sepj] < 1.2): + continue + inter = sl.intersect(slicei, slicej) + if (sl.empty(inter) or + (sl.area(inter) / areai < 0.2 and + sl.area(inter) / areaj < 0.2)): + nonoverlapping[i, j] = True + nonoverlapping[j, i] = True + # find largest maximal clique (i.e. fully connected subgraphs) + corrinds = corrinds[max(nx.find_cliques(nx.Graph(nonoverlapping)), key=len)] + corrmap[corrs[corrinds, 0]] = corrs[corrinds[0], 0] _, corrmap = np.unique(corrmap, return_inverse=True) # make contiguous numsep = corrmap.max() LOG.debug("linked to %d separator candidates", numsep) From ff326ca273a00ebd391c0bd29dbf16c63cd54c0d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:32:40 +0200 Subject: [PATCH 022/194] sepline detection linking: filter results entirely composed by tiny components --- ocrd_cis/ocropy/common.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index a195a4c1..012d55fd 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -522,6 +522,14 @@ def compute_seplines(binary, scale, maxseps=0): Returns a same-size separator label array. 
""" + # tries to find a compromise for the following issues, + # potentially occurring in combination (or all at once): + # - non-congiguous or broken lines (due to thin ink or low contrast) + # - skewed, curved or warped lines (due to non-planar photography or irregular typography) + # - very close or overlapping text (due to show-through or bad binarization) + # - superimposed fg noise (due to bad binarization) that may connect text and non-text + # - intersecting vertical and horizontal lines, even closed shapes (enclosing text) + # - line-like glyphs (i.e. false positives) if maxseps == 0: return np.zeros_like(binary, np.int) skel, dist = medial_axis(binary, return_distance=True) @@ -692,7 +700,7 @@ def union(slices): return sl.union(slices[0], union(slices[1:])) return slices[0] for sep in range(1, numsep + 1): - sepsizes[sep] = sum(sepsizes[corrmap == sep]) + sepsizes[sep] = max(sepsizes[corrmap == sep]) # sum sepslices[sep] = union(sepslices[corrmap == sep]) sepsizes = sepsizes[:numsep + 1] sepslices = sepslices[:numsep + 1] From 65eee888d19f74dddf3c47bbb60138d5e7e26994 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:34:24 +0200 Subject: [PATCH 023/194] sepline detection masking: spread against bg and non-separator fg --- ocrd_cis/ocropy/common.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 012d55fd..6b6eab78 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -719,11 +719,17 @@ def union(slices): ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) sepmap = ordermap[sepmap] DSAVE("sep-top", sepmap[labels]) - sepseeds = morph.propagate_labels_simple(binary, sepmap[labels]) - DSAVE("seps-top-propagated", sepseeds) - # FIXME: perhaps hclose / vclose first? 
+ # spread into fg against other fg + sepseeds = sepmap[labels] + sepseeds = morph.spread_labels(sepseeds, maxdist=max(sepdists)) + sepseeds[~binary] = 0 + #labels = morph.propagate_labels_simple(binary, labels) + #DSAVE("seps-top-spread-fg", sepseeds) + # spread into bg against other fg + sepseeds[binary & (sepseeds == 0)] = maxseps + 1 seplabels = morph.spread_labels(sepseeds, maxdist=scale / 2) - DSAVE("seps-top-spread", seplabels) + seplabels[seplabels == maxseps + 1] = 0 + DSAVE("seps-top-spread-bg", seplabels) return seplabels # from ocropus-gpageseg, but with horizontal opening From 81276131abe8425a3bf67c13ce8d3fbed4604322 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:46:11 +0200 Subject: [PATCH 024/194] ocropy.lines2region: improve splitting by separators (fix 6d8c0d36) - when trying to partition slices by separators topologically, also treat pre-existing regions like separators, but prevent placing them into distinct partitions - when trying to partition slices by separators morphologically, also merge partitions that share any significant line labels --- ocrd_cis/ocropy/common.py | 52 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 6b6eab78..e98ab4a2 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1675,12 +1675,6 @@ def finalize(): sepm = sl.cut(sepmask, box) if isinstance(mask, np.ndarray): sepm = np.where(mask, sepm, 1) - if isinstance(rlabels, np.ndarray): - # treat existing regions like separators - rlab = sl.cut(rlabels, box) - if isinstance(mask, np.ndarray): - rlab = np.where(mask, rlab, 0) - sepm = np.where(rlab, 1, sepm) # provide `partitions` for next step partitions, npartitions = 1-sepm, 1 new_partition_type = None @@ -1690,31 +1684,35 @@ def finalize(): # try to apply in this cut like another separator partitions, npartitions = morph.label(1-sepm) if npartitions > 1: - # 
first, delete partitions that have no significant line labels - splitmap = np.zeros(len(objects)+1, dtype=np.int) - for label in range(1, npartitions+1): - linecounts = np.bincount(lbin[partitions==label], minlength=len(objects)) + # delete partitions that have no significant line labels, + # merge partitions that share any significant line labels + splitmap = np.zeros((len(objects), npartitions), dtype=np.bool) + for label in range(npartitions): + linecounts = np.bincount(lbin[partitions==label+1], minlength=len(objects)) linecounts[0] = 0 # without bg # get significant line labels for this partition # (but keep insignificant non-empty labels if complete) mincounts = np.minimum(min_line * scale, np.maximum(1, bincounts)) linelabels = np.nonzero(linecounts >= mincounts)[0] if linelabels.size: - splitmap[linelabels] = label - if debug: LOG.debug(' sepmask partition %d: %s', label, str(linelabels)) + splitmap[linelabels, label] = True + if debug: LOG.debug(' sepmask partition %d: %s', label+1, str(linelabels)) else: - partitions[partitions==label] = 0 - # second, merge partitions that share any significant line labels - for label1 in range(1, npartitions+1): - if not np.any(splitmap == label1): + partitions[partitions==label+1] = 0 + if isinstance(rlabels, np.ndarray): + # keep existing regions in distinct partitions if possible + rlab = sl.cut(rlabels, box) + if isinstance(mask, np.ndarray): + rlab = np.where(mask, rlab, 0) + splitmap[np.unique(lbin[rlab>0])] = False + mergemap = np.arange(npartitions + 1) + for line in splitmap: + if not np.any(line): continue - for label2 in range(label1+1, npartitions+1): - if not np.any(splitmap == label2): - continue - if np.any((splitmap == label1) & (splitmap == label2)): - splitmap[splitmap == label2] = label1 - partitions[partitions==label2] = label1 - npartitions = len(np.setdiff1d(np.unique(splitmap), [0])) + parts = np.flatnonzero(line)+1 + mergemap[parts] = mergemap[parts[0]] + partitions = mergemap[partitions] + 
npartitions = len(np.setdiff1d(np.unique(mergemap), [0])) new_partition_type = 'splitmask' if debug: LOG.debug(' %d sepmask partitions after filtering and merging', npartitions) if partition_type != 'topological': @@ -1722,10 +1720,16 @@ def finalize(): # get current slice's line labels def find_topological(): # run only if needed (no other partition/slicing possible) - nonlocal partitions, npartitions, new_partition_type + nonlocal sepm, partitions, npartitions, new_partition_type llab = sl.cut(llabels, box) if isinstance(mask, np.ndarray): llab = np.where(mask, llab, 0) + if isinstance(rlabels, np.ndarray): + # treat existing regions like separators + rlab = sl.cut(rlabels, box) + if isinstance(mask, np.ndarray): + rlab = np.where(mask, rlab, 0) + sepm = np.where(rlab, 1, sepm) obj = [sl.intersect(o, box) for o in objects] # get current slice's foreground bin = sl.cut(binary, box) From 2849464c14bf51b06cf01a36916a48e9424dc71a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:58:03 +0200 Subject: [PATCH 025/194] ocrolib.morph.all_neighbors: no diagonals --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 75d86b69..7d6ffc85 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -170,6 +170,20 @@ def rg_closing(image,size,origin=0): # image = r_dilation(image,size,origin=0) # return r_erosion(image,size,origin=-1) +@checks(GRAYSCALE,ABINARY2) +def rg_reconstruction(image,mask,step=1,maxsteps=None): + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2*step+1,2*step+1)) + dilated = image + while maxsteps is None or maxsteps > 0: + dilated = cv2.dilate(src=dilated, kernel=kernel) + dilated = np.where(mask, dilated, image) + # did result change? 
+ if (image == dilated).all(): + return dilated + if maxsteps: + maxsteps -= step + return dilated + @checks(SEGMENTATION) def showlabels(x,n=7): import matplotlib.pyplot as plt @@ -337,8 +351,8 @@ def all_neighbors(image, dist=1, bg=NaN): assert amin(image)>=0 u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) - l = unique(q*image+shift(image,(dist,dist),order=0,cval=bg)) - r = unique(q*image+shift(image,(-dist,dist),order=0,cval=bg)) + l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) + r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] From 4d7a0831e9072c56375e7cd9c2fc5ed107f565b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 08:27:29 +0200 Subject: [PATCH 026/194] sepline detection polygonization: cut inner holes open --- ocrd_cis/ocropy/segment.py | 90 +++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 3fc150ff..73b54fa8 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -59,7 +59,7 @@ TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, reorder=True): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. 
Given a Numpy array of background labels ``bg_labels``, @@ -106,8 +106,88 @@ def getx(xy): label, str(conflicts)) else: bg_mask = hull - # find outer contour (parts): - contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + if open_holes: + # def plot_poly(contour, color): + # import matplotlib.pyplot as plt + # from matplotlib.patches import Polygon as PolygonPatch + # plt.figure() + # plt.imshow(fg_bin) + # plt.gca().scatter(*zip(*contour[:,0])) + # plt.gca().add_patch(PolygonPatch(contour[:,0], alpha=0.5, color=color, closed=False)) + # plt.show() + # find outer contour (parts) plus direct holes (if any) + contours = [] + cont, hier = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + idx = 0 + while idx >= 0: + contour = cont[idx] + if len(contour) < 3: + idx = hier[0, idx, 0] + continue + #plot_poly(contour, 'red') + idx_hole = hier[0, idx, 2] + while idx_hole >= 0: + hole = cont[idx_hole] + if len(hole) < 3: + idx_hole = hier[0, idx_hole, 0] + continue + LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", + label, idx, len(contour), idx_hole, len(hole)) + #plot_poly(hole, 'blue') + # cut child from outside... 
+ # first get nearest point on child + hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0]), True) + for pt in hole]) + # now get nearest point on parent + # (we cannot use PolygonTest directly, because we must also interpolate + # to prevent crossing edges; at least each 10px) + contour = np.append(contour, contour[0:1], axis=0) + contour2 = np.diff(contour, axis=0) + contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(np.int)[:,0] // 10) + interpol = [] + for i, ntics in enumerate(contourtics): + interpol.extend(np.array(contour[i:i+1] + + contour2[i:i+1] * + np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], + np.int)) + interpol.append(contour[-1]) + interpol = np.array(interpol) + contourtics = np.insert(np.cumsum(contourtics), 0, 0) + assert np.all(contour == interpol[contourtics]) + interpol_idx = np.linalg.norm(interpol - hole[hole_idx], axis=2).argmin() + contour_idx = np.searchsorted(contourtics, interpol_idx) + if interpol_idx in contourtics: + contour_idx2 = contour_idx + 1 + else: + contour_idx2 = contour_idx + if contour_idx2 >= len(contour): + contour_idx2 = 0 + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + if interpol_idx == 0: + diff1 = (interpol[-1:] - cispoint1) // 5 + else: + diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + if interpol_idx + 1 >= len(interpol): + diff2 = (interpol[0:1] - cispoint2) // 5 + else: + diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + cispoint1 = cispoint1 + diff1 + cispoint2 = cispoint2 + diff2 + LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) + # (this works, because inner contours have inverse direction) + contour = np.concatenate([contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) + #plot_poly(contour, 'green') + idx_hole = hier[0, idx_hole, 0] + 
#plot_poly(contour, 'red') + LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + contours.append(contour) + idx = hier[0, idx, 0] + else: + # find outer contour (parts): + contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # determine areas of parts: areas = [cv2.contourArea(contour) for contour in contours] total_area = sum(areas) @@ -669,7 +749,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - reorder=False) + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -822,7 +902,7 @@ def join_polygons(polygons, scale=20): pairs = itertools.combinations(range(npoly), 2) dists = np.eye(npoly, dtype=float) for i, j in pairs: - dists[i, j] = polygons[i].distance(polygons[j]) + dists[i, j] = polygons[i].distance(polygons[j]) dists[j, i] = dists[i, j] dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) From c90b29f4c6f3369b5eecae1617903dada14a3553 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 10:34:57 +0200 Subject: [PATCH 027/194] re/segment: join_polygons: fix b490d3f8 imports --- ocrd_cis/ocropy/segment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 73b54fa8..f9948579 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import os.path -from itertools import chain +import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -887,11 +887,11 @@ def diff_polygons(poly1, poly2): poly = 
make_valid(poly) return poly -def join_polygons(polygons, scale=20): +def join_polygons(polygons, loc='', scale=20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull - polygons = list(chain.from_iterable([ + polygons = list(itertools.chain.from_iterable([ poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) From 2ec107eea4fa29f672641c5ea55616e958330437 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:34:23 +0100 Subject: [PATCH 028/194] re/segment: join_polygons: connect touching neighbours, too --- ocrd_cis/ocropy/segment.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index f9948579..43005088 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -902,8 +902,11 @@ def join_polygons(polygons, loc='', scale=20): pairs = itertools.combinations(range(npoly), 2) dists = np.eye(npoly, dtype=float) for i, j in pairs: - dists[i, j] = polygons[i].distance(polygons[j]) - dists[j, i] = dists[i, j] + dist = polygons[i].distance(polygons[j]) + if dist == 0: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) for prevp, nextp in zip(*dists.nonzero()): From b2aba78d072486edf8cc441b9a9dd6543fe91937 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:37:42 +0100 Subject: [PATCH 029/194] re/segment: join_baselines: for complex subtypes, apply recursively --- ocrd_cis/ocropy/segment.py | 39 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 43005088..6f7848ae 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -927,40 
+927,33 @@ def join_polygons(polygons, loc='', scale=20): def join_baselines(baselines, loc=''): LOG = getLogger('processor.OcropyResegment') result = [] - for baseline in baselines: - if (baseline.is_empty or - baseline.type in ['Point', 'MultiPoint']): - continue + def add_baseline(baseline): + nonlocal result base_x = [pt[0] for pt in result] base_left = min(base_x, default=0) base_right = max(base_x, default=0) left = baseline.bounds[0] right = baseline.bounds[2] - if (baseline.type == 'GeometryCollection' or - baseline.type.startswith('Multi')): - # heterogeneous result: filter point - for geom in baseline.geoms: - if geom.type == 'Point': - continue - left = geom.bounds[0] - right = geom.bounds[2] - if left > base_right: - result.extend(geom.coords) - base_right = right - elif right < base_left: - result = list(geom.coords) + result - base_left = left - else: - LOG.warning("baseline part component crosses existing x in %s", loc) - continue - elif left > base_right: + if baseline.coords[0][0] > baseline.coords[-1][0]: + baseline.coords = list(baseline.coords[::-1]) + if left > base_right: result.extend(baseline.coords) elif right < base_left: result = list(baseline.coords) + result else: LOG.warning("baseline part crosses existing x in %s", loc) - continue + return assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result + for baseline in baselines: + if (baseline.is_empty or + baseline.type in ['Point', 'MultiPoint']): + continue + if (baseline.type == 'GeometryCollection' or + baseline.type.startswith('Multi')): + for geom in baseline.geoms: + add_baseline(geom) + continue + add_baseline(baseline) if not len(result): return None return LineString(result) From 77d60ca5006caffd406a29c67acd964933558efc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:40:12 +0100 Subject: [PATCH 030/194] re/segment: join_baselines: skip lines outside of polygon --- ocrd_cis/ocropy/segment.py | 6 +++++- 1 file changed, 5 insertions(+), 
1 deletion(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6f7848ae..cfa99e62 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -221,6 +221,9 @@ def getx(xy): #LOG.debug(polygon.wkt) LOG.debug(explain_validity(polygon)) polygon = make_valid(polygon) + if not polygon.is_valid: + #LOG.debug(polygon.wkt) + LOG.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) @@ -229,7 +232,8 @@ def getx(xy): # and concatenate them from left to right if baselines is not None: base = join_baselines([baseline.intersection(polygon) - for baseline in baselines], name) + for baseline in baselines + if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: From 8c986be4246a9e7c2a3123e6b76862bfeed8ed7c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:41:43 +0100 Subject: [PATCH 031/194] re/segment: improve polygon simplification --- ocrd_cis/ocropy/segment.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index cfa99e62..e83dcacd 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -869,17 +869,18 @@ def make_intersection(poly1, poly2): return interp def make_valid(polygon): - for split in range(1, len(polygon.exterior.coords)-1): + points = list(polygon.exterior.coords) + for split in range(1, len(points)): if polygon.is_valid or polygon.simplify(polygon.area).is_valid: break # simplification may not be possible (at all) due to ordering # in that case, try another starting point - polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) - for tolerance in range(1, int(polygon.area)): + polygon = Polygon(points[-split:]+points[:-split]) + for tolerance in range(int(polygon.area)): if polygon.is_valid: break # 
simplification may require a larger tolerance - polygon = polygon.simplify(tolerance) + polygon = polygon.simplify(tolerance + 1) return polygon def diff_polygons(poly1, poly2): From 0acc6f292dfb69c4f91660b43a2d30c36484815e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:45:05 +0100 Subject: [PATCH 032/194] resegment: list instead of generator --- ocrd_cis/ocropy/resegment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index f96f2750..3bb270d0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -297,8 +297,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l min_area=640/zoom/zoom) DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = zip(*[(make_valid(Polygon(line_poly)), LineString(baseline)) - for _, line_poly, baseline in new_line_polygons]) + new_line_polygons, new_baselines = list(zip(*[ + (make_valid(Polygon(line_poly)), LineString(baseline)) + for _, line_poly, baseline in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line From 38206f0fc138077ecc1e3dd0d3409293e9807c85 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:46:56 +0100 Subject: [PATCH 033/194] adapt to Numpy 1.24 dtypes --- ocrd_cis/ocropy/common.py | 36 +++++++++++++++--------------- ocrd_cis/ocropy/ocrolib/lineest.py | 2 +- ocrd_cis/ocropy/resegment.py | 22 +++++++++--------- ocrd_cis/ocropy/segment.py | 18 +++++++-------- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index e98ab4a2..86372eeb 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -463,7 +463,7 @@ def compute_images(binary, scale, maximages=5): Returns a 
same-size image label array. """ if maximages == 0: - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) images = binary # d0 = odd(max(2,scale/5)) # d1 = odd(max(2,scale/8)) @@ -475,7 +475,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) DSAVE('images1_large', images+0.6*binary) if not images.any(): - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress # v/h-lines; these will be detected separately, # and it is dangerous to combine them into one @@ -500,7 +500,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) @@ -531,7 +531,7 @@ def compute_seplines(binary, scale, maxseps=0): # - intersecting vertical and horizontal lines, even closed shapes (enclosing text) # - line-like glyphs (i.e. 
false positives) if maxseps == 0: - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) skel, dist = medial_axis(binary, return_distance=True) DSAVE("medial-axis", [dist, skel]) labels, nlabels = morph.label(skel) @@ -539,7 +539,7 @@ def compute_seplines(binary, scale, maxseps=0): DSAVE("skel-labels", labels) # determine those components which could be separators # (filter by compactness, and by mean+variance of distances) - sepmap = np.zeros(nlabels + 1, np.int) + sepmap = np.zeros(nlabels + 1, int) numsep = 0 sepsizes = [0] sepslices = [None] @@ -662,7 +662,7 @@ def compute_seplines(binary, scale, maxseps=0): corrinds = corrinds.nonzero()[0] if len(corrinds) == 1: continue # nothing to link - nonoverlapping = np.zeros((len(corrinds), len(corrinds)), dtype=np.bool) + nonoverlapping = np.zeros((len(corrinds), len(corrinds)), dtype=bool) for i, indi in enumerate(corrinds[:-1]): sepi = corrs[indi, 0] labeli = np.flatnonzero(sepmap == sepi)[0] @@ -715,7 +715,7 @@ def union(slices): if np.any(minsize): maxseps = min(maxseps, minsize[0]) maxseps = min(maxseps, numsep) - ordermap = np.zeros(numsep + 1, np.int) + ordermap = np.zeros(numsep + 1, int) ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) sepmap = ordermap[sepmap] DSAVE("sep-top", sepmap[labels]) @@ -1163,7 +1163,7 @@ def h_compatible(obj1, obj2, center1, center2): label1_y, label1_x = np.where(seeds == label) label2_y, label2_x = np.where(seed2) shared_y = np.intersect1d(label1_y, label2_y) - gap = np.zeros_like(seed2, np.bool) + gap = np.zeros_like(seed2, bool) for y in shared_y: can_x_min = label2_x[label2_y == y][0] can_x_max = label2_x[label2_y == y][-1] @@ -1407,7 +1407,7 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): if len(corrinds) == 1: labelmap.setdefault(line, list()).append(corrs[corrinds[0], 1]) continue - nonoverlapping = ~np.eye(len(corrinds), dtype=np.bool) + nonoverlapping = ~np.eye(len(corrinds), dtype=bool) for i, indi in 
enumerate(corrinds[:-1]): baselabeli = corrs[indi, 1] baseslicei = baseslices[baselabeli] @@ -1577,7 +1577,7 @@ def lines2regions(binary, llabels, bincounts = np.bincount(lbinary.flatten()) LOG.debug('combining lines to regions') - relabel = np.zeros(np.amax(llabels)+1, np.int) + relabel = np.zeros(np.amax(llabels)+1, int) num_regions = 0 def recursive_x_y_cut(box, mask=None, partition_type=None, debug=False): """Split lbinary at horizontal or vertical gaps recursively. @@ -1624,7 +1624,7 @@ def finalize(): llab = sl.cut(llabels, box) if isinstance(mask, np.ndarray): llab = np.where(mask, llab, 0) - linelabels0 = np.zeros(llabels.max()+1, dtype=np.bool) + linelabels0 = np.zeros(llabels.max()+1, dtype=bool) linelabels0[linelabels] = True llab *= linelabels0[llab] newregion = rlab.max()+1 @@ -1686,7 +1686,7 @@ def finalize(): if npartitions > 1: # delete partitions that have no significant line labels, # merge partitions that share any significant line labels - splitmap = np.zeros((len(objects), npartitions), dtype=np.bool) + splitmap = np.zeros((len(objects), npartitions), dtype=bool) for label in range(npartitions): linecounts = np.bincount(lbin[partitions==label+1], minlength=len(objects)) linecounts[0] = 0 # without bg @@ -1753,8 +1753,8 @@ def find_topological(): linelabels = np.setdiff1d(np.unique(lbin), [0]) nlines = linelabels.max() + 1 # find pairs of lines above each other with a separator next to them - leftseps = np.zeros((nlines, nseps), np.bool) - rghtseps = np.zeros((nlines, nseps), np.bool) + leftseps = np.zeros((nlines, nseps), bool) + rghtseps = np.zeros((nlines, nseps), bool) for line in linelabels: for i, sep in enumerate(sepobj): if sep is None: @@ -1775,7 +1775,7 @@ def find_topological(): if not np.any(trueseps): return if debug: LOG.debug("trueseps: %s", str(trueseps)) - neighbours = np.zeros((nlines, nlines), np.bool) + neighbours = np.zeros((nlines, nlines), bool) for i in linelabels: for j in linelabels[i+1:]: if sl.yoverlap_rel(obj[i], 
obj[j]) > 0.5: @@ -1791,7 +1791,7 @@ def find_topological(): # group neighbours by adjacency (i.e. put any contiguous pairs # of such line labels into the same group) nlabels = llab.max() + 1 - splitmap = np.zeros(nlabels, dtype=np.int) + splitmap = np.zeros(nlabels, dtype=int) for i, j in zip(*neighbours.nonzero()): if splitmap[i] > 0: splitmap[j] = splitmap[i] @@ -1879,8 +1879,8 @@ def find_topological(): if not gaps.shape[0]: continue for start, stop, height in sorted(zip( - props['left_ips'].astype(np.int), - props['right_ips'].astype(np.int), + props['left_ips'].astype(int), + props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 669b12ca..42ef2237 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -31,7 +31,7 @@ def check(self,line, max_ignore=0.02): #DSAVE('lineest check 1 dilated', smoothed + 0.5*line) smoothed = filters.gaussian_filter(smoothed, (1, h//10), mode='constant') # 2 #DSAVE('lineest check 2 smoothed', smoothed + 0.5*line) - smoothed = np.array(smoothed > np.median(smoothed), dtype=np.float) # 3 # or 0.05 instead of median? + smoothed = np.array(smoothed > np.median(smoothed), dtype=float) # 3 # or 0.05 instead of median? 
#DSAVE('lineest check 3 thresholded', smoothed + 0.5*line) smoothed = filters.minimum_filter(smoothed, (2, h//5)) # 4: undo 1/2 #DSAVE('lineest check 4 eroded', smoothed + 0.5*line) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 3bb270d0..5b5c37b4 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -188,8 +188,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw - parent_bin = np.array(parent_array <= midrange(parent_array), np.bool) - ignore_bin = np.ones_like(parent_bin, np.bool) + parent_bin = np.array(parent_array <= midrange(parent_array), bool) + ignore_bin = np.ones_like(parent_bin, bool) if isinstance(parent, PageType): tag = 'page' fullpage = True @@ -203,14 +203,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l page_id if fullpage else parent.id, report) return # get existing line labels: - line_labels = np.zeros_like(parent_bin, np.bool) + line_labels = np.zeros_like(parent_bin, bool) line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] for i, segment in enumerate(lines): segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] # draw.polygon: If any segment_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. 
The caller does not need @@ -225,7 +225,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = False @@ -273,7 +273,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # left-hand side if left-to-right, and vice versa scale * (-1) ** line_ltr, single_sided=True)], loc=line.id, scale=scale)) - line_polygon = np.array(line_polygon.exterior.coords, np.int)[:-1] + line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) @@ -303,11 +303,11 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line - fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) - fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) + fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)), float) + fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)), float) # ratio of overlap between intersection and existing line - covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) - covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) + covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)), float) + covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)), float) # compare segmentations, calculating ratios of overlapping fore/background area for i, new_line_poly 
in enumerate(new_line_polygons): for j, line_poly in enumerate(line_polygons): @@ -333,7 +333,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # fits_bg[i,j]*100, covers_bg[i,j]*100, # fits_fg[i,j]*100, covers_fg[i,j]*100) # assign new lines to existing lines, if possible - assignments = np.ones(len(new_line_polygons), np.int) * -1 + assignments = np.ones(len(new_line_polygons), int) * -1 for i, new_line_poly in enumerate(new_line_polygons): if not fits_bg[i].any(): LOG.debug("new line %d fits no existing line's background", i) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e83dcacd..34182f20 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -90,7 +90,7 @@ def getx(xy): if not label: # ignore if background continue - bg_mask = np.array(bg_labels == label, np.bool) + bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground LOG.debug('skipping label %d in %s due to empty fg', @@ -98,7 +98,7 @@ def getx(xy): continue # simplify to convex hull if simplify is not None: - hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(np.bool) + hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): @@ -136,20 +136,20 @@ def getx(xy): #plot_poly(hole, 'blue') # cut child from outside... 
# first get nearest point on child - hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0]), True) + hole_idx = np.argmin([cv2.pointPolygonTest(contour, pt[0].tolist(), True) for pt in hole]) # now get nearest point on parent # (we cannot use PolygonTest directly, because we must also interpolate # to prevent crossing edges; at least each 10px) contour = np.append(contour, contour[0:1], axis=0) contour2 = np.diff(contour, axis=0) - contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(np.int)[:,0] // 10) + contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): interpol.extend(np.array(contour[i:i+1] + contour2[i:i+1] * np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - np.int)) + int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -537,9 +537,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, LOG.warning("Skipping '%s' with zero size", element_id) return element_array = pil2array(image) - element_bin = np.array(element_array <= midrange(element_array), np.bool) - sep_bin = np.zeros_like(element_bin, np.bool) - ignore_labels = np.zeros_like(element_bin, np.int) + element_bin = np.array(element_array <= midrange(element_array), bool) + sep_bin = np.zeros_like(element_bin, bool) + ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): LOG.debug('masking foreground of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, element_id) @@ -778,7 +778,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, else: # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) - region_mask = np.zeros_like(element_bin, np.bool) + region_mask = np.zeros_like(element_bin, bool) region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True 
From 6eed14cb5ca71c16b56c6c5985ca853680f9fa9e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:47:17 +0100 Subject: [PATCH 034/194] adapt to Shapely 2.0 deprecations --- ocrd_cis/ocropy/segment.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 34182f20..478d1c05 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -855,10 +855,10 @@ def make_intersection(poly1, poly2): # post-process if interp.is_empty or interp.area == 0.0: return None - if interp.type == 'GeometryCollection': + if interp.geom_type == 'GeometryCollection': # heterogeneous result: filter zero-area shapes (LineString, Point) interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) - if interp.type == 'MultiPolygon': + if interp.geom_type == 'MultiPolygon': # homogeneous result: construct convex hull to connect interp = join_polygons(interp.geoms) if interp.minimum_clearance < 1.0: @@ -885,7 +885,7 @@ def make_valid(polygon): def diff_polygons(poly1, poly2): poly = poly1.difference(poly2) - if poly.type == 'MultiPolygon': + if poly.geom_type == 'MultiPolygon': poly = poly.convex_hull if poly.minimum_clearance < 1.0: poly = Polygon(np.round(poly.exterior.coords)) @@ -897,7 +897,7 @@ def join_polygons(polygons, loc='', scale=20): # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull polygons = list(itertools.chain.from_iterable([ - poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + poly.geoms if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) npoly = len(polygons) @@ -921,7 +921,7 @@ def join_polygons(polygons, loc='', scale=20): bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) - assert jointp.type == 'Polygon', jointp.wkt + assert jointp.geom_type == 'Polygon', jointp.wkt if 
jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity @@ -951,10 +951,10 @@ def add_baseline(baseline): assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result for baseline in baselines: if (baseline.is_empty or - baseline.type in ['Point', 'MultiPoint']): + baseline.geom_type in ['Point', 'MultiPoint']): continue - if (baseline.type == 'GeometryCollection' or - baseline.type.startswith('Multi')): + if (baseline.geom_type == 'GeometryCollection' or + baseline.geom_type.startswith('Multi')): for geom in baseline.geoms: add_baseline(geom) continue From 2bf18e0e786683d857ca60a00ced26195a1811d4 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 14 Apr 2023 10:04:36 +0200 Subject: [PATCH 035/194] check_page: double max page size to 20k by 20k pixels --- ocrd_cis/ocropy/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index d84e42b3..728fabb1 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -370,9 +370,9 @@ def check_page(binary, zoom=1.0): if np.mean(binary)10000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>10000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) From cd08aab3d1e3352e5c838f0c0b203de9e27e8f8b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 May 2023 21:24:36 +0200 Subject: [PATCH 036/194] check_page/region/line: skip assumptions on number of components --- ocrd_cis/ocropy/common.py | 3 +++ 
1 file changed, 3 insertions(+) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 86372eeb..189c0db5 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -319,6 +319,7 @@ def check_line(binary, zoom=1.0): ##if w<1.5*h: return "line too short %s"%(binary.shape,) if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) @@ -348,6 +349,7 @@ def check_region(binary, zoom=1.0): if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) @@ -375,6 +377,7 @@ def check_page(binary, zoom=1.0): if h>10000/zoom: return "image too tall for a page image %s"%(binary.shape,) if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) if w>10000/zoom: return "image too wide for a page image %s"%(binary.shape,) + return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) From 70b21919f2f62dd8152992ba4d73d9d0f16f575b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 May 2023 21:25:55 +0200 Subject: [PATCH 037/194] resegment: add param baseline_only --- ocrd_cis/ocrd-tool.json | 7 ++++++- ocrd_cis/ocropy/resegment.py | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 74c0d0c9..e82a1c75 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -191,7 +191,7 
@@ "output_file_grp": [ "OCR-D-SEG-LINE" ], - "description": "Resegment text lines", + "description": "Improve coordinates of text lines", "parameters": { "level-of-operation": { "type": "string", @@ -205,6 +205,11 @@ "description": "source for new line polygon candidates ('lineest' for line estimation, i.e. how Ocropy would have segmented text lines; 'baseline' tries to re-polygonize from the baseline annotation; 'ccomps' avoids crossing connected components by majority rule)", "default": "lineest" }, + "baseline_only": { + "type": "boolean", + "description": "ignore existing textline coords completely and use baseline as input if possible", + "default": false + }, "dpi": { "type": "number", "format": "float", diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5b5c37b4..9242773d 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -207,8 +207,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] for i, segment in enumerate(lines): - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) + if self.parameter['baseline_only'] and segment.Baseline: + segment_baseline = baseline_of_segment(segment, parent_coords) + segment_polygon = polygon_from_baseline(segment_baseline, 30/zoom) + else: + segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) + segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) line_polygons.append(prep(segment_polygon)) segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] # draw.polygon: If any segment_polygon lies outside of parent @@ -267,12 +271,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l LOG.warning("Skipping '%s' without baseline", line.id) new_labels[line_labels[i]] = i + 1 
continue - line_polygon = baseline_of_segment(line, parent_coords) - line_ltr = line_polygon[0,0] < line_polygon[-1,0] - line_polygon = make_valid(join_polygons([LineString(line_polygon).buffer( - # left-hand side if left-to-right, and vice versa - scale * (-1) ** line_ltr, single_sided=True)], - loc=line.id, scale=scale)) + line_baseline = baseline_of_segment(line, parent_coords) + line_polygon = polygon_from_baseline(line_baseline, scale) line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], @@ -460,7 +460,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binarizy-empty line '%s'", line.id) + LOG.warning("skipping binary-empty line '%s'", line.id) continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: @@ -494,3 +494,11 @@ def baseline_of_segment(segment, coords): line = transform_coordinates(line, coords['transform']) return np.round(line).astype(np.int32) +# zzz should go into core ocrd_utils +def polygon_from_baseline(baseline, scale): + ltr = baseline[0,0] < baseline[-1,0] + # left-hand side if left-to-right, and vice versa + polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, + single_sided=True)], + scale=scale)) + return polygon From 35227a9bdc1977af7b2e51940f47516bcd1e10f0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 May 2023 21:26:38 +0200 Subject: [PATCH 038/194] resegment (baseline/ccomps): improve handling of fg conflicts --- ocrd_cis/ocropy/resegment.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 9242773d..ad05792e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -2,7 +2,7 @@ import os.path import numpy as np -from 
skimage import draw +from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union @@ -429,20 +429,22 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, scale=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" LOG = getLogger('processor.OcropyResegment') - DSAVE('baseline-seeds', [new_labels, (components>0)]) - # allocate to connected components consistently (by majority, - # ignoring smallest components like punctuation) - #new_labels = morph.propagate_labels_majority(binarized, new_labels) - new_labels = morph.propagate_labels_majority(components > 0, new_labels) - DSAVE('majority-propagated', [new_labels, (components>0) & (new_labels==0)]) + DSAVE('seeds', [new_labels, (components>0)]) + # allocate to connected components consistently + # (ignoring smallest components like punctuation) + # but when there are conflicts, meet in the middle via watershed + new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) + new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) + DSAVE('propagated', new_labels2) # dilate/grow labels from connected components against each other and bg - new_labels = morph.spread_labels(new_labels, maxdist=scale*2) - DSAVE('scale-spread', [new_labels, (components>0)]) + new_labels = morph.spread_labels(new_labels2, maxdist=scale*2) + DSAVE('spread', new_labels) # now propagate again to catch smallest components like punctuation - new_labels = morph.propagate_labels_majority(components > 0, new_labels) - DSAVE('propagated-again', [new_labels, (components>0) & (new_labels==0)]) - new_labels = morph.spread_labels(new_labels, maxdist=scale/2) - DSAVE('spread-again', [new_labels, (components>0)]) + new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) + new_labels2 = 
segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) + DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) + new_labels = morph.spread_labels(new_labels2, maxdist=scale/2) + DSAVE('spread-again', [new_labels, binarized]) # find polygon hull and modify line coords for i, line in enumerate(lines): new_label = new_labels == i + 1 From 1abc3b7b617b1c342908e6b69f6e706a14fc666f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 Jun 2023 03:50:12 +0200 Subject: [PATCH 039/194] segment: adapt to OpenCV changes --- ocrd_cis/ocropy/segment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 478d1c05..9e2a6ee3 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -136,7 +136,7 @@ def getx(xy): #plot_poly(hole, 'blue') # cut child from outside... # first get nearest point on child - hole_idx = np.argmin([cv2.pointPolygonTest(contour, pt[0].tolist(), True) + hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0].tolist()), True) for pt in hole]) # now get nearest point on parent # (we cannot use PolygonTest directly, because we must also interpolate From 4c5542208fe2a65b5529e629c9a84d94c74fe705 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Jun 2023 23:09:41 +0200 Subject: [PATCH 040/194] ocrd-tool: rm old ocrd-cis-ocropy-rec (gone in 9e20991) --- ocrd_cis/ocrd-tool.json | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index e4224263..f518c051 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -302,26 +302,6 @@ } } }, - "ocrd-cis-ocropy-rec": { - "executable": "ocrd-cis-ocropy-rec", - "categories": [ - "Text recognition and optimization" - ], - "steps": [ - "recognition/text-recognition" - ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "Recognize text snippets", - 
"parameters": { - "model": { - "type": "string", - "description": "ocropy model to apply (e.g. fraktur.pyrnn)" - } - } - }, "ocrd-cis-ocropy-segment": { "executable": "ocrd-cis-ocropy-segment", "categories": [ From 4c9ad27d1ae088f6df3c08fe126bc462977e7fd9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jun 2023 01:55:14 +0200 Subject: [PATCH 041/194] =?UTF-8?q?ocropy-train:=20improve/update=20OCR-D?= =?UTF-8?q?=20wrapper=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - discern constructor vs. processing ctxt (initLogging, setup) - resolve start model via resmgr - get all text regions (recursively) - use true tempdir for extracted files - use CWD for model output paths (instead of dist dir) - use binarized derived image, if possible - use ocropy nlbin (instead of OpenCV thresholding), otherwise - skip segment if no/empty text transcription - simplify/deduplify hierarchy levels - improve logging and docs --- ocrd_cis/ocrd-tool.json | 10 +- ocrd_cis/ocropy/train.py | 245 ++++++++++++++------------------------- 2 files changed, 97 insertions(+), 158 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index f518c051..953ea1f8 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -298,7 +298,9 @@ }, "model": { "type": "string", - "description": "ocropy model to apply (e.g. fraktur.pyrnn)" + "format": "uri", + "content-type": "application/gzip", + "description": "ocropy model to apply (e.g. fraktur.pyrnn.gz)" } } }, @@ -418,13 +420,15 @@ "parameters": { "textequiv_level": { "type": "string", - "description": "PAGE XML hierarchy level granularity", + "description": "hierarchy level to extract GT pairs from", "enum": ["line", "word", "glyph"], "default": "line" }, "model": { "type": "string", - "description": "load model or create new one (e.g. fraktur.pyrnn)" + "format": "uri", + "content-type": "application/gzip", + "description": "load model (e.g. 
'fraktur.pyrnn.gz') to init weights, or none to train from scratch" }, "ntrain": { "type": "number", diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 4427d47c..ceb26d21 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,25 +1,16 @@ from __future__ import absolute_import -import sys, os.path, cv2 +import sys +import os +import tempfile + from ocrd_modelfactory import page_from_file from ocrd import Processor from ocrd_utils import getLogger from ocrd_cis import get_ocrd_tool -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - - from .ocropus_rtrain import * - -np.seterr(divide='raise',over='raise',invalid='raise',under='ignore') - - - - -def bounding_box(coord_points): - point_list = [[int(p) for p in pair.split(',')] for pair in coord_points.split(' ')] - x_coordinates, y_coordinates = zip(*point_list) - return (min(x_coordinates), min(y_coordinates), max(x_coordinates), max(y_coordinates)) +from .binarize import binarize def deletefiles(filelist): @@ -36,172 +27,116 @@ def resize_keep_ratio(image, baseheight=48): return image -def binarize(pil_image): - # Convert RGB to OpenCV - img = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2GRAY) - - # global thresholding - #ret1,th1 = cv2.threshold(img,127,255,cv2.THRESH_BINARY) - - # Otsu's thresholding - #ret2,th2 = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - # Otsu's thresholding after Gaussian filtering - blur = cv2.GaussianBlur(img,(5,5),0) - ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - bin_img = Image.fromarray(th3) - return bin_img - - - class OcropyTrain(Processor): def __init__(self, *args, **kwargs): - self.log = getLogger('OcropyTrain') + self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] kwargs['version'] = ocrd_tool['version'] super(OcropyTrain, self).__init__(*args, **kwargs) + if hasattr(self, 'input_file_grp'): + # processing 
context + self.setup() - - def process(self): - """ - Performs the training - """ + def setup(self): + self.log = getLogger('processor.OcropyTrain') #print(self.parameter) - if self.parameter['textequiv_level'] not in ['line', 'word', 'glyph']: - raise Exception("currently only implemented at the line/glyph level") - - filepath = os.path.dirname(os.path.abspath(__file__)) - - - - if 'model' in self.parameter: model = self.parameter['model'] - modelpath = filepath + '/models/' + model + '.gz' - outputpath = filepath + '/output/' + model + try: + modelpath = self.resolve_resource(model) + except SystemExit: + ocropydir = os.path.dirname(os.path.abspath(__file__)) + modelpath = os.path.join(ocropydir, 'models', model) + self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + if not os.path.isfile(modelpath): + self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", + model, model) + sys.exit(1) + outputpath = os.path.join(self.oldcwd, 'output', model) if 'outputpath' in self.parameter: - outputpath = self.parameter + '/' + model - if os.path.isfile(modelpath) == False: - raise Exception("configured model " + model + " is not in models folder") + outputpath = os.path.join(self.parameter, model) else: modelpath = None - outputpath = filepath + '/output/' + 'lstm' + outputpath = os.path.join(self.oldcwd, 'output', 'lstm') if 'outputpath' in self.parameter: - outputpath = self.parameter + '/' +'lstm' - - if 'ntrain' in self.parameter: - ntrain = self.parameter['ntrain'] - - + outputpath = os.path.join(self.parameter, 'lstm') + os.makedirs(os.path.dirname(outputpath)) + self.modelpath = modelpath + self.outputpath = outputpath + def process(self): + """ + Trains a new model on the text lines from the input fileGrp, + extracted as temporary image-text file pairs. 
+ """ filelist = [] - + filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') #self.log.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): #self.log.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) - pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - + page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.log.info("page %s", pcgts) - for region in pcgts.get_Page().get_TextRegion(): + self.log.info("Extracting from page '%s'", page_id) + for region in page.get_AllRegions(classes=['Text']): textlines = region.get_TextLine() - self.log.info("About to extract %i lines in region '%s'", len(textlines), region.id) + self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: - if self.parameter['textequiv_level'] == 'line': - self.log.debug("Extracting line '%s'", line.id) - - #get box from points - box = bounding_box(line.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = line.get_TextEquiv()[0].Unicode.strip() - gtpath = path + '.gt.txt' - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - - - if self.parameter['textequiv_level'] == 'word' or 'glyph': - for word in line.get_Word(): - - if self.parameter['textequiv_level'] == 'word': - self.log.debug("Extracting word '%s'", word.id) - - #get box from 
points - box = bounding_box(word.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id) + str(word.id)) - imgpath = path + '.png' - final_img.save(imgpath) - + path = os.path.join(filepath, page_id + region.id + line.id) + imgpath = self.extract_segment(path, line, page_image, page_coords) + if imgpath: + filelist.append(imgpath) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = os.path.join(filepath, page_id + region.id + line.id + word.id) + imgpath = self.extract_segment(path, word, page_image, page_coords) + if imgpath: + filelist.append(imgpath) + continue + for glyph in word.get_Glyph(): + path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) + imgpath = self.extract_segment(path, glyph, page_image, page_coords) + if imgpath: filelist.append(imgpath) - #ground truth - gt = word.get_TextEquiv()[0].Unicode.strip() - gtpath = path + '.gt.txt' - - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - else: - for glyph in word.get_Glyph(): - self.log.debug("Extracting glyph '%s'", glyph.id) - - #get box from points - box = bounding_box(glyph.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id) + str(word.id) + str(glyph.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = 
glyph.get_TextEquiv()[0].Unicode.strip() - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - - rtrain(filelist, modelpath, outputpath, ntrain) + self.log.info("Training %s from %s on %i file pairs", + self.outputpath, + self.modelpath or 'scratch', + len(filelist)) + rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) deletefiles(filelist) + + def extract_segment(self, path, segment, page_image, page_coords): + #ground truth + gt = segment.TextEquiv + if not gt: + return None + gt = gt[0].Unicode + if not gt or not gt.strip(): + return None + gt = gt.strip() + gtpath = path + '.gt.txt' + with open(gtpath, "w", encoding='utf-8') as f: + f.write(gt) + + self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) + + if 'binarized' not in coords['features'].split(','): + # binarize with nlbin + image, _ = binarize(image, maxskew=0) + + # resize image to 48 pixel height + image = resize_keep_ratio(image) + + #save temp image + imgpath = path + '.png' + image.save(imgpath) + + return imgpath From 43a356a03221c6dd95b68c9bc0cb7b563f2b4870 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jun 2023 02:48:09 +0200 Subject: [PATCH 042/194] =?UTF-8?q?postcorrect:=20improve/update=20OCR-D?= =?UTF-8?q?=20wrapper=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - discern constructor vs. 
processing ctxt (initLogging) - pass effective log level (instead of global CLI override) - use mets_target instead of fixed `mets.xml` - simplify --- ocrd_cis/postcorrect/cli.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index a5125b8d..42bedc04 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -5,19 +5,14 @@ from ocrd import Processor from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import getLogger +from ocrd_utils import getLogger, getLevelName from ocrd_models.ocrd_mets import OcrdMets from ocrd_cis import JavaPostCorrector from ocrd_cis import get_ocrd_tool -LOG_LEVEL = 'INFO' - @click.command() @ocrd_cli_options def ocrd_cis_postcorrect(*args, **kwargs): - if 'log_level' in kwargs and kwargs['log_level']: - global LOG_LEVEL - LOG_LEVEL = kwargs['log_level'] return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): @@ -26,21 +21,22 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] kwargs['version'] = ocrd_tool['version'] super(PostCorrector, self).__init__(*args, **kwargs) - self.log = getLogger('cis.Processor.PostCorrector') def process(self): - ifgs = self.input_file_grp.split(",") # input file groups - ofg = self.output_file_grp + self.log = getLogger('processor.CISPostCorrector') profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True self.parameter["profiler"] = profiler self.parameter["runDM"] = True - metspath = os.path.join(self.workspace.directory, "mets.xml") - print(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(metspath, ",".join(ifgs), ofg, self.parameter, LOG_LEVEL) + self.log.debug(json.dumps(self.parameter, indent=4)) + p = 
JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.parameter, + getLevelName(self.log.getEffectiveLevel())) p.exe() # reload the mets file to prevent it from overriding the # updated version from the java process - self.workspace.mets = OcrdMets(filename=metspath) + self.reload_mets() From 2783f615359bf6464d77084b80983b72b3461bae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Jul 2023 17:01:15 +0200 Subject: [PATCH 043/194] segment: fix baseline extraction --- ocrd_cis/ocropy/deskew.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index bb9904e0..4ed04218 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs): super(OcropyDeskew, self).__init__(*args, **kwargs) def process(self): - """Deskew the regions of the workspace. + """Deskew the pages or regions of the workspace. Open and deserialise PAGE input files and their respective images, then iterate over the element hierarchy down to the TextRegion level. 
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9e2a6ee3..ac25a1fb 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -959,7 +959,7 @@ def add_baseline(baseline): add_baseline(geom) continue add_baseline(baseline) - if not len(result): + if len(result) < 2: return None return LineString(result) From fcc02fddedb66c47472eddeb16d437bdca32172d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Aug 2023 18:30:07 +0200 Subject: [PATCH 044/194] adapt to Numpy and Pillow deprecations --- ocrd_cis/div/cutter.py | 2 +- ocrd_cis/ocropy/ocrolib/time_morphology.py | 4 ++-- ocrd_cis/ocropy/ocropus_rtrain.py | 2 +- ocrd_cis/ocropy/recognize.py | 2 +- ocrd_cis/ocropy/train.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/div/cutter.py b/ocrd_cis/div/cutter.py index ee187a1b..6dc6a9a9 100644 --- a/ocrd_cis/div/cutter.py +++ b/ocrd_cis/div/cutter.py @@ -26,7 +26,7 @@ def bounding_box(coord_points): def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image diff --git a/ocrd_cis/ocropy/ocrolib/time_morphology.py b/ocrd_cis/ocropy/ocrolib/time_morphology.py index 51a8e406..2e241d94 100644 --- a/ocrd_cis/ocropy/ocrolib/time_morphology.py +++ b/ocrd_cis/ocropy/ocrolib/time_morphology.py @@ -29,10 +29,10 @@ def cv_contours(bin): return zip((contour[:,0,::-1], cv2.contourArea(contour)) for contour in contours) def rb_opening(bin, size): - return filters.uniform_filter(filters.uniform_filter(bin, size, np.float, mode='constant', cval=1) == 1, size, np.float, origin=-1) > 1e-7 + return filters.uniform_filter(filters.uniform_filter(bin, size, float, mode='constant', cval=1) == 1, size, float, origin=-1) > 1e-7 def rb_closing(bin, size): - return 
filters.uniform_filter(filters.uniform_filter(bin, size, np.float) > 1e-7, size, mode='constant', cval=1, origin=-1) == 1 + return filters.uniform_filter(filters.uniform_filter(bin, size, float) > 1e-7, size, mode='constant', cval=1, origin=-1) == 1 def r_closing(bin, size): return filters.minimum_filter(filters.maximum_filter(bin, size), size, origin=-1) diff --git a/ocrd_cis/ocropy/ocropus_rtrain.py b/ocrd_cis/ocropy/ocropus_rtrain.py index fc34ad20..b1469e42 100644 --- a/ocrd_cis/ocropy/ocropus_rtrain.py +++ b/ocrd_cis/ocropy/ocropus_rtrain.py @@ -45,7 +45,7 @@ def resize_keep_ratio(image, baseheight): baseheight = 48 hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image # make sure an output file has been set diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 07c2ebd4..e9259c6e 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -35,7 +35,7 @@ def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image, scale # from ocropus-rpred process1, but without input files and without lineest/dewarping diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index ceb26d21..d257a61f 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -23,7 +23,7 @@ def deletefiles(filelist): def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image From 1512c81de109d44bd8ff105d52552d958bc5fdbb Mon Sep 17 00:00:00 2001 From: 
joschrew Date: Thu, 1 Feb 2024 09:12:16 +0100 Subject: [PATCH 045/194] Remove testing from Dockerfile --- Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f58112b8..cb81e78e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,6 +53,8 @@ RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ && cd /build \ && make install \ - && make test \ + # test always fail, resources not available for download. Resources should be made available + # somewhere else, e.g. github.com/OCR-D/assets + # && make test \ && cd / \ && rm -rf /build From 67905d73675b7b1ce2ea80caefb2fdf36d1f3ee1 Mon Sep 17 00:00:00 2001 From: joschrew Date: Fri, 9 Feb 2024 15:52:29 +0100 Subject: [PATCH 046/194] Add metadata to Dockerfile --- Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Dockerfile b/Dockerfile index cb81e78e..71e8b09f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,12 @@ FROM ocrd/core:latest AS base +ARG VCS_REF +ARG BUILD_DATE +LABEL \ + maintainer="https://github.com/OCR-D/ocrd_cis/issues" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_cis" \ + org.label-schema.build-date=$BUILD_DATE + ENV VERSION="Di 12. Mai 13:26:35 CEST 2020" ENV GITURL="https://github.com/cisocrgroup" ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" From d5e81876e43cb016a0f148c315f0a292bf30fff1 Mon Sep 17 00:00:00 2001 From: joschrew Date: Mon, 12 Feb 2024 11:26:35 +0100 Subject: [PATCH 047/194] Set docker metadata with makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 730ba3f4..eebe029a 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,10 @@ uninstall: ${PIP} uninstall ${PKG} docker-build: Dockerfile - docker build -t flobar/ocrd_cis:latest . 
+ docker build \ + --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ + --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + -t flobar/ocrd_cis:latest . docker-push: docker-build docker push flobar/ocrd_cis:latest From 320d5fd69d2b6efbacaee9b70275cf5e66c9794e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Feb 2024 15:44:19 +0100 Subject: [PATCH 048/194] try to fix tests by adapting URLs --- tests/test_lib.bash | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 5d38f482..8cfb0018 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,11 +4,13 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data" +# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets +# this is merely the path for blumenbach_anatomie_1805.ocrd.zip +data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$tmpdir/download" - wget -P "$tmpdir/download" "$url" + wget -nc -P "$tmpdir/download" "$url" } function ocrd_cis_init_ws() { @@ -19,33 +21,16 @@ function ocrd_cis_init_ws() { function ocrd_cis_align() { # download ocr models - wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" - wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur2-00062000.pyrnn.gz" + ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz + ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr - ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "OCR-D-GT-SEG-LINE" \ - --output-file-grp OCR-D-CIS-OCR-1 \ - --mets "$tmpws/mets.xml" \ - --parameter <(cat < 
Date: Mon, 12 Feb 2024 15:51:52 +0100 Subject: [PATCH 049/194] add CircleCI config --- .circleci/config.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..87ec16ba --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,19 @@ +version: 2.1 +jobs: + + test-python3: + docker: + - image: ocrd/core + environment: + PIP: pip3 + PYTHON: python3 + steps: + - checkout + - run: make install + - run: make test + +workflows: + version: 2 + build-and-test: + jobs: + - test-python3 From a4dc20f0de12cebd8ba1b54b3478d6a065d52f35 Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 13 Feb 2024 12:56:23 +0100 Subject: [PATCH 050/194] Update CircleCI config This is just a dummy commit to try to trigger circleci --- .circleci/config.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 87ec16ba..9f1b1685 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,5 @@ version: 2.1 jobs: - test-python3: docker: - image: ocrd/core @@ -11,7 +10,6 @@ jobs: - checkout - run: make install - run: make test - workflows: version: 2 build-and-test: From c4f0724f97dc5880a64835d40ed265a540c4b2fd Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 13 Feb 2024 14:27:05 +0100 Subject: [PATCH 051/194] Another try to fix tests --- tests/test_lib.bash | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 8cfb0018..7e560fbe 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,13 +4,12 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets -# this is merely the path for blumenbach_anatomie_1805.ocrd.zip 
-data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data/" +data_url="http://hdl.handle.net/21.11156/6B119B3C-A24A-424C-AC3C-27E64B051780" function ocrd_cis_download_bagit() { - local url="$data_url/$1" - mkdir -p "$tmpdir/download" - wget -nc -P "$tmpdir/download" "$url" + local destdir="$tmpdir/download" + mkdir -p "$destdir" + local dest="$destdir/$1" + wget -nc -O $dest $data_url } function ocrd_cis_init_ws() { @@ -32,5 +31,5 @@ function ocrd_cis_align() { -P textequiv_level word -P model fraktur-jze.pyrnn.gz ocrd-cis-align -l DEBUG -m $tmpws/mets.xml \ -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ - -O OCR-D-CIS-ALIGN + -O OCR-D-CIS-ALIGN } From 71e4e50965d4794b77c2f006b984ed3b299385de Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 13 Feb 2024 15:11:12 +0100 Subject: [PATCH 052/194] Debug circleci tests --- .circleci/config.yml | 2 +- tests/test_lib.bash | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9f1b1685..1f709dd4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,7 +9,7 @@ jobs: steps: - checkout - run: make install - - run: make test + - run: make test V="" workflows: version: 2 build-and-test: diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 7e560fbe..d4f5162c 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,6 +4,8 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" +# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets +# this is merely the path for blumenbach_anatomie_1805.ocrd.zip data_url="http://hdl.handle.net/21.11156/6B119B3C-A24A-424C-AC3C-27E64B051780" function ocrd_cis_download_bagit() { local destdir="$tmpdir/download" From 1b5029532fd61dbe41cdc4455876b6fe2921c7d0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Feb 2024 22:39:09 +0100 Subject: [PATCH 053/194] tests: use 
proper new OCR-D GT URL --- tests/test_lib.bash | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index d4f5162c..0603929c 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,14 +4,11 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets -# this is merely the path for blumenbach_anatomie_1805.ocrd.zip -data_url="http://hdl.handle.net/21.11156/6B119B3C-A24A-424C-AC3C-27E64B051780" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.20/" function ocrd_cis_download_bagit() { - local destdir="$tmpdir/download" - mkdir -p "$destdir" - local dest="$destdir/$1" - wget -nc -O $dest $data_url + local url="$data_url/$1" + mkdir -p "$tmpdir/download" + wget -nc -P "$tmpdir/download" "$url" } function ocrd_cis_init_ws() { From f214c5970628ffcf4e053ba26b811f19bfd6590d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 14 Feb 2024 02:19:55 +0100 Subject: [PATCH 054/194] tests: reuse downloaded bag files --- tests/test_lib.bash | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 0603929c..0ae12d56 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -7,13 +7,13 @@ OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.20/" function ocrd_cis_download_bagit() { local url="$data_url/$1" - mkdir -p "$tmpdir/download" - wget -nc -P "$tmpdir/download" "$url" + mkdir -p "$PWD/download" + wget -nc -P "$PWD/download" "$url" } function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" - ocrd zip spill -d "$tmpdir" "$tmpdir/download/$1" + ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" } From 68dcb9035cb4e17b98f5d7b206eeb72dc2bee381 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky 
Date: Wed, 14 Feb 2024 02:20:47 +0100 Subject: [PATCH 055/194] ocrolib.common.load_object: find ocrolib in sys.path --- ocrd_cis/ocropy/ocrolib/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd_cis/ocropy/ocrolib/common.py b/ocrd_cis/ocropy/ocrolib/common.py index 1c0c3208..6741a676 100644 --- a/ocrd_cis/ocropy/ocrolib/common.py +++ b/ocrd_cis/ocropy/ocrolib/common.py @@ -445,6 +445,9 @@ class names that have changed.""" LOG.info("# loading object '%s'", fname) if zip==0 and fname.endswith(".gz"): zip = 1 + # most models will have been pickled with ocrolib at top level + # we therefore need to add ocrd_cis.ocropy to the search path + sys.path.append(os.path.dirname(os.path.dirname(__file__))) if zip>0: with gzip.GzipFile(fname,"rb") as stream: #with os.popen("gunzip < '%s'"%fname,"rb") as stream: From 4f3ea7c94c01e295a64c3150c8efb0b1749d1953 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:36:24 +0100 Subject: [PATCH 056/194] postcorrect: adapt processor to new OCR-D (mets:file with @LOCTYPE and only relative paths) --- ocrd_cis/postcorrect/cli.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 42bedc04..dc3ee48e 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -37,6 +37,12 @@ def process(self): self.parameter, getLevelName(self.log.getEffectiveLevel())) p.exe() - # reload the mets file to prevent it from overriding the - # updated version from the java process - self.reload_mets() + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() + # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): + for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + 
flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) From 84e3acf16bfbd1ed0e6f2d02cfb36e2560126888 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:39:05 +0100 Subject: [PATCH 057/194] test_lib: update to fixed GT repo URL, don't remove workspace on failure --- tests/test_lib.bash | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 0ae12d56..e9df9985 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -1,10 +1,11 @@ #/bin/bash tmpdir=$(mktemp -d) +trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.20/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.19/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 5e0660392434d7331afbf406d164831420d7626f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:41:27 +0100 Subject: [PATCH 058/194] tests: use downloadable recognizer models throughout, simplify scripts --- tests/run_add_zip_test.bash | 20 ++-------- tests/run_alignment_test.bash | 22 +++-------- tests/run_image_preprocessing_test.bash | 49 +++++-------------------- tests/run_ocr_test.bash | 31 +++++----------- tests/run_postcorrection_test.bash | 43 +++++++--------------- tests/run_training_test.bash | 17 ++------- tests/test_lib.bash | 19 +++++++--- 7 files changed, 58 insertions(+), 143 deletions(-) diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash index 003c5e86..02de2db2 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -7,30 +7,18 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip pushd "$tmpws" found_files=0 for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! 
-f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd # test if there are 3 gt files pushd "$tmpws" found_files=0 for file in $(ocrd workspace find -G OCR-D-IMG); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index 1e9e3ea0..e8a3c79a 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,17 +6,11 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd ocrd_cis_align @@ -24,14 +18,8 @@ ocrd_cis_align pushd $tmpws found_files=0 for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do - if [[ ! 
-f "$file" ]]; then - echo "cannot find aligned file group workspace" - exit 1 - fi + [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash index 4fd028e4..f80fc636 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,45 +7,16 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" + +ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd - -ocrd-cis-ocropy-binarize --log-level DEBUG \ - --input-file-grp OCR-D-GT-SEG-LINE \ - --output-file-grp OCR-D-CIS-IMG-BIN \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-clip --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-BIN \ - --output-file-grp OCR-D-CIS-IMG-CLIP \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-denoise --log-level 
DEBUG \ - --input-file-grp OCR-D-CIS-IMG-CLIP \ - --output-file-grp OCR-D-CIS-IMG-DEN \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-deskew --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DEN \ - --output-file-grp OCR-D-CIS-IMG-DES \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-dewarp --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DES \ - --output-file-grp OCR-D-CIS-IMG-DEW \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-segment --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DEW \ - --output-file-grp OCR-D-CIS-IMG-SEG \ - --mets "$tmpws/mets.xml" diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index 6de88a7b..b10f6f6d 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,31 +6,18 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi -popd +(( $found_files == 3 )) || fail "invalid number of files: $found_files" # download ocr model -wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" +ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "OCR-D-GT-SEG-LINE" \ - --output-file-grp OCR-D-CIS-OCR \ - --mets "$tmpws/mets.xml" \ - --parameter <(cat < /dev/null echo '{}' EOF chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect --log-level DEBUG \ - -I OCR-D-CIS-ALIGN \ - -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - --parameter <(cat <&2 "$@" + false } From 05895c9ff1855674dd08cbc72b4ca13646c3bda4 Mon Sep 17 00:00:00 2001 From: Robert 
Sachunsky Date: Thu, 15 Feb 2024 00:58:29 +0100 Subject: [PATCH 059/194] tests: deactivate training (broken) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eebe029a..22f07508 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ docker-build: Dockerfile docker-push: docker-build docker push flobar/ocrd_cis:latest -TEST_SCRIPTS=$(sort $(wildcard tests/run_*.bash)) +TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): bash $@ $V From 29e6e3a6eea681c5aaa9b4f365e1f84c9e3dd250 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:59:05 +0100 Subject: [PATCH 060/194] makefile: allow setting different tag for docker build --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 22f07508..a040cf9d 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ PY ?= python3 PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis +TAG = flobar/ocrd_cis install: ${PIP} install --upgrade pip . @@ -14,14 +15,14 @@ docker-build: Dockerfile docker build \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - -t flobar/ocrd_cis:latest . + -t $(TAG):latest . 
docker-push: docker-build - docker push flobar/ocrd_cis:latest + docker push $(TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): bash $@ $V test: $(TEST_SCRIPTS) - echo $^ -.PHONY: install test + @echo $^ +.PHONY: install install-devel uninstall test docker-build docker-push From 07662f22a1d26639dea77a60d9854d2a949eb20a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:59:31 +0100 Subject: [PATCH 061/194] CI: add CD --- .circleci/config.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1f709dd4..470197da 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,8 +10,32 @@ jobs: - checkout - run: make install - run: make test V="" + + deploy-docker: + docker: + - image: circleci/buildpack-deps:stretch + environment: + DOCKER_TAG: ocrd/cis + steps: + - checkout + - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ + docker_layer_caching: true + - run: make docker TAG=$DOCKER_TAG + - run: + name: Login to Docker Hub + command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin + - run: docker push $DOCKER_TAG + workflows: version: 2 build-and-test: jobs: - test-python3 + deploy: + jobs: + - deploy-docker: + filters: + branches: + only: + - master + - fix-alpha-shape From 4673d9b342a25200313ed2ea31ab2b5796b4d4f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 14:47:07 +0200 Subject: [PATCH 062/194] re/segment join_polygons: fix rare case of adjacent rings --- ocrd_cis/ocropy/segment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index ac25a1fb..077363e1 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -908,7 +908,7 @@ def join_polygons(polygons, loc='', scale=20): dists = 
np.eye(npoly, dtype=float) for i, j in pairs: dist = polygons[i].distance(polygons[j]) - if dist == 0: + if dist < 1e-5: dist = 1e-5 # if pair merely touches, we still need to get an edge dists[i, j] = dist dists[j, i] = dist From 338b840e46e378e5427e37a2a95777d64a861332 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 14:52:27 +0200 Subject: [PATCH 063/194] re/segment join_baselines: adapt to Shapely, improve --- ocrd_cis/ocropy/segment.py | 119 +++++++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 25 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 077363e1..49cb6776 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -11,6 +11,7 @@ from shapely.prepared import prep from shapely.ops import unary_union, nearest_points from shapely.validation import explain_validity +from shapely import set_precision from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -931,37 +932,105 @@ def join_polygons(polygons, loc='', scale=20): def join_baselines(baselines, loc=''): LOG = getLogger('processor.OcropyResegment') - result = [] - def add_baseline(baseline): - nonlocal result - base_x = [pt[0] for pt in result] - base_left = min(base_x, default=0) - base_right = max(base_x, default=0) - left = baseline.bounds[0] - right = baseline.bounds[2] - if baseline.coords[0][0] > baseline.coords[-1][0]: - baseline.coords = list(baseline.coords[::-1]) - if left > base_right: - result.extend(baseline.coords) - elif right < base_left: - result = list(baseline.coords) + result - else: - LOG.warning("baseline part crosses existing x in %s", loc) - return - assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result + lines = [] for baseline in baselines: if (baseline.is_empty or baseline.geom_type in ['Point', 'MultiPoint']): continue - if (baseline.geom_type == 'GeometryCollection' or - baseline.geom_type.startswith('Multi')): + elif 
baseline.geom_type == 'MultiLineString': + lines.extend(baseline.geoms) + elif baseline.geom_type == 'LineString': + lines.append(baseline) + elif baseline.geom_type == 'GeometryCollection': for geom in baseline.geoms: - add_baseline(geom) - continue - add_baseline(baseline) - if len(result) < 2: + if geom.geom_type == 'LineString': + lines.append(geom) + elif geom.geom_type == 'MultiLineString': + lines.extend(geom) + else: + LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + else: + LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + nlines = len(lines) + if nlines == 0: + return None + elif nlines == 1: + return lines[0] + # Shapely cannot reorder: + #result = line_merge(MultiLineString([line.normalize() for line in lines])) + # find min-dist path through all lines (travelling salesman) + pairs = itertools.combinations(range(nlines), 2) + dists = np.eye(nlines, dtype=float) + for i, j in pairs: + dist = lines[i].distance(lines[j]) + if dist < 1e-5: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist + dists = minimum_spanning_tree(dists, overwrite=True) + assert dists.nonzero()[0].size, dists + # get path + chains = [] + for prevl, nextl in zip(*dists.nonzero()): + foundchains = [] + for chain in chains: + if chain[0] == prevl: + found = chain, 0, nextl + elif chain[0] == nextl: + found = chain, 0, prevl + elif chain[-1] == prevl: + found = chain, -1, nextl + elif chain[-1] == nextl: + found = chain, -1, prevl + else: + continue + foundchains.append(found) + if len(foundchains): + assert len(foundchains) <= 2, foundchains + chain, pos, node = foundchains.pop() + if len(foundchains): + otherchain, otherpos, othernode = foundchains.pop() + assert node != othernode + assert chain[pos] == othernode + assert otherchain[otherpos] == node + if pos < 0 and otherpos < 0: + chain.extend(reversed(otherchain)) + chains.remove(otherchain) + elif pos < 0 and otherpos == 0: + 
chain.extend(otherchain) + chains.remove(otherchain) + elif pos == 0 and otherpos == 0: + otherchain.extend(reversed(chain)) + chains.remove(chain) + elif pos == 0 and otherpos < 0: + otherchain.extend(chain) + chains.remove(chain) + elif pos < 0: + chain.append(node) + else: + chain.insert(0, node) + else: + chains.append([prevl, nextl]) + if len(chains) > 1: + LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + return None + assert len(chains) == 1, chains + assert len(chains[0]) == nlines, chains[0] + path = chains[0] + # get points + coords = [] + for node in path: + line = lines[node] + coords.extend(line.normalize().coords) + result = LineString(coords) + if result.is_empty: + LOG.warning("baseline merge is empty in %s", loc) return None - return LineString(result) + assert result.geom_type == 'LineString', result.wkt + result = set_precision(result, 1.0) + if result.geom_type != 'LineString' or not result.is_valid: + result = LineString(np.round(line.coords)) + return result def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. 
From d2a52794789e7810b02de12f50b15b188f08c616 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 15:09:48 +0200 Subject: [PATCH 064/194] resegment (lineest): fix/improve matching --- ocrd_cis/ocropy/resegment.py | 106 ++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index ad05792e..929edc3a 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -312,52 +312,68 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, new_line_poly in enumerate(new_line_polygons): for j, line_poly in enumerate(line_polygons): # too strict: .contains - if line_poly.intersects(new_line_poly): - inter = make_intersection(line_poly.context, new_line_poly) - if not inter: - continue - new_line_mask = (new_line_labels == i+1) & parent_bin - line_mask = line_labels[j] & parent_bin - inter_mask = new_line_mask & line_mask - if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): - continue - intersections[(i, j)] = inter - fits_bg[i, j] = inter.area / new_line_poly.area - covers_bg[i, j] = inter.area / line_poly.context.area - fits_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(new_line_mask) - covers_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(line_mask) - # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg", - # i, j, lines[j].id, - # fits_bg[i,j]*100, covers_bg[i,j]*100, - # fits_fg[i,j]*100, covers_fg[i,j]*100) - # assign new lines to existing lines, if possible - assignments = np.ones(len(new_line_polygons), int) * -1 - for i, new_line_poly in enumerate(new_line_polygons): - if not fits_bg[i].any(): - LOG.debug("new line %d fits no existing line's background", i) - continue - if not fits_fg[i].any(): - LOG.debug("new line %d fits no existing line's foreground", i) - continue - fits = (fits_bg[i] 
> 0.6) & (fits_fg[i] > 0.9) - if not fits.any(): - j = np.argmax(fits_bg[i] * fits_fg[i]) - LOG.debug("best fit '%s' for new line %d fits only %.1f%% bg / %.1f%% fg", - lines[j].id, i, fits_bg[i,j] * 100, fits_fg[i,j] * 100) + if not line_poly.intersects(new_line_poly): + continue + inter = make_intersection(line_poly.context, new_line_poly) + if not inter: + continue + new_line_mask = (new_line_labels == i+1) & parent_bin + line_mask = line_labels[j] & parent_bin + inter_mask = new_line_mask & line_mask + if (not np.count_nonzero(inter_mask) or + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): + continue + intersections[(i, j)] = inter + fits_bg[i, j] = inter.area / new_line_poly.area + covers_bg[i, j] = inter.area / line_poly.context.area + fits_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(new_line_mask) + covers_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(line_mask) + # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg", + # i, j, lines[j].id, + # fits_bg[i,j]*100, covers_bg[i,j]*100, + # fits_fg[i,j]*100, covers_fg[i,j]*100) + # assign existing lines to new lines (1:n), if possible + # start from best matches (forced alignment) + dim1 = len(new_line_polygons) + dim2 = len(line_polygons) + idx1 = np.arange(dim1) + idx2 = np.arange(dim2) + keep1 = np.ones(dim1, bool) + keep2 = np.ones(dim2, bool) + assignments = -1 * np.ones(dim1, int) + for _ in range(dim1): + fit_bg_view = fits_bg[np.ix_(keep1, keep2)] + if not fit_bg_view.size: + break + cov_bg_view = covers_bg[np.ix_(keep1, keep2)] + fit_fg_view = fits_fg[np.ix_(keep1, keep2)] + cov_fg_view = covers_fg[np.ix_(keep1, keep2)] + priority = cov_fg_view * cov_bg_view + ind1, ind2 = np.unravel_index(np.argmax(priority, axis=None), priority.shape) + fit_fg = fit_fg_view[ind1, ind2] + fit_bg = fit_bg_view[ind1, ind2] + cov_fg = cov_fg_view[ind1, ind2] + cov_bg = cov_bg_view[ind1, ind2] + # return to full view and assign next + ind1 = 
idx1[keep1][ind1] + ind2 = idx2[keep2][ind2] + #new_poly = new_line_polygons[ind1] + #poly = line_polygons[ind2] + # assignment must be new + assert assignments[ind1] < 0 + assert keep1[ind1] + assert keep2[ind2] + # minimum threshold + if not (fit_bg > 0.6 and fit_fg > 0.7): + # skip next time + # LOG.debug("match for %s too large: %d%%fg / %d%%bg", lines[ind2].id, fit_fg*100, fit_bg*100) + covers_bg[ind1, ind2] = 0 + covers_fg[ind1, ind2] = 0 continue - covers = covers_bg[i] * covers_fg[i] * fits - j = np.argmax(covers) - line = lines[j] - inter_polygon = intersections[(i,j)] - new_line_polygon = new_line_polygons[i] - new_center = inter_polygon.centroid - center = new_line_polygon.centroid - # FIXME: apply reasonable threshold for centroid distance - LOG.debug("new line for '%s' has centroid distance %.2f", - line.id, center.distance(new_center)) - assignments[i] = j + assignments[ind1] = ind2 + keep1[ind1] = False + #keep2[ind2] = False # validate assignments retain enough area and do not loose unassigned matches line_polygons = [poly.context.buffer(-margin) for poly in line_polygons] for j, line in enumerate(lines): From 8a71d8e84af01ce769a2c44e3f02d6a702efedb7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 15:12:41 +0200 Subject: [PATCH 065/194] resegment (lineest): use new polygons instead of intersections but ignore extend_margins --- ocrd_cis/ocrd-tool.json | 2 +- ocrd_cis/ocropy/resegment.py | 57 ++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 953ea1f8..be763142 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -225,7 +225,7 @@ "extend_margins": { "type": "number", "format": "integer", - "description": "number of pixels to extend the input polygons in all directions", + "description": "(ignored)", "default": 3 } } diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 
929edc3a..ddb8fcb5 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -183,7 +183,6 @@ def process(self): def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] - margin = self.parameter['extend_margins'] method = self.parameter['method'] # prepare line segmentation parent_array = pil2array(parent_image) @@ -206,32 +205,34 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels = np.zeros_like(parent_bin, bool) line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] - for i, segment in enumerate(lines): - if self.parameter['baseline_only'] and segment.Baseline: - segment_baseline = baseline_of_segment(segment, parent_coords) - segment_polygon = polygon_from_baseline(segment_baseline, 30/zoom) + for i, line in enumerate(lines): + if self.parameter['baseline_only'] and line.Baseline: + line_base = baseline_of_segment(line, parent_coords) + line_poly = polygon_from_baseline(line_base, 30/zoom) else: - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] - # draw.polygon: If any segment_polygon lies outside of parent + line_poly = coordinates_of_segment(line, parent_image, parent_coords) + line_poly = make_valid(Polygon(line_poly)) + line_polygons.append(line_poly) + line_polygons = list(map(prep, line_polygons)) + for i, line_polygon in enumerate(line_polygons): + polygon = np.array(line_polygon.context.exterior.coords, int)[:-1] + # draw.polygon: If any line_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. 
The caller does not need # to concern herself with this. - segment_y, segment_x = draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape) - line_labels[i, segment_y, segment_x] = True + line_y, line_x = draw.polygon(polygon[:, 1], + polygon[:, 0], + parent_bin.shape) + line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines - for i, segment in enumerate(set(line.parent_object_ for line in lines)): + for i, region in enumerate(set(line.parent_object_ for line in lines)): LOG.debug('unmasking area of text region "%s" for "%s"', - segment.id, page_id if fullpage else parent.id) - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], + region.id, page_id if fullpage else parent.id) + region_polygon = coordinates_of_segment(region, parent_image, parent_coords) + region_polygon = make_valid(Polygon(region_polygon)) + region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] + ignore_bin[draw.polygon(region_polygon[:, 1], + region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): @@ -295,11 +296,10 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_line_polygons, new_line_labels = masks2polygons( new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), min_area=640/zoom/zoom) - DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) + DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[ - (make_valid(Polygon(line_poly)), LineString(baseline)) - for _, line_poly, baseline in new_line_polygons])) or 
([], []) + new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) + for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -375,7 +375,6 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l keep1[ind1] = False #keep2[ind2] = False # validate assignments retain enough area and do not loose unassigned matches - line_polygons = [poly.context.buffer(-margin) for poly in line_polygons] for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): @@ -404,9 +403,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) - line_polygons[j] = new_polygon new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: @@ -422,6 +420,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) + line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines for i in new_lines: for otherj in np.nonzero(fits_fg[i] > 0.1)[0]: @@ -429,7 +428,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue otherline = lines[otherj] LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) - other_polygon = diff_polygons(line_polygons[otherj], 
new_polygon) + other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: From 6e95b3847ec5532c039062062129ce3c1c1a6bf7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 May 2024 13:15:25 +0200 Subject: [PATCH 066/194] tests: update data_url (after force-push upstream) --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 7e2824f2..199e2a7b 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.19/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.4.3/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From b6c89572f5b5e78b181e5e28660597fef055ae3b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 29 May 2024 17:08:13 +0200 Subject: [PATCH 067/194] resegment: expose parameter spread (analogous to segment) --- ocrd_cis/ocrd-tool.json | 6 ++++++ ocrd_cis/ocropy/resegment.py | 17 +++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index be763142..7e5203c1 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -222,6 +222,12 @@ "description": "share of foreground pixels that must be retained by the output polygons", "default": 0.75 }, + "spread": { + "type": "number", + "format": "float", + "description": "distance in points (pt) from the foreground to project textline labels into the background for polygonal contours; if zero, project half a scale/capheight", + "default": 2.4 + }, "extend_margins": { "type": "number", "format": "integer", diff --git a/ocrd_cis/ocropy/resegment.py 
b/ocrd_cis/ocropy/resegment.py index ddb8fcb5..2495cadd 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -184,6 +184,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] + maxdist = self.parameter['spread']/zoom*300/72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -273,19 +274,19 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) - line_polygon = polygon_from_baseline(line_baseline, scale) + line_polygon = polygon_from_baseline(line_baseline, maxdist or scale/2) line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - scale=scale, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) return try: new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, - maxseps=0, maxcolseps=len(ignore), maximages=0) + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: LOG.error('Cannot line-segment %s "%s": %s', tag, page_id if fullpage else parent.id, err) @@ -441,7 +442,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l otherline.get_Coords().set_points(points_from_polygon(other_polygon)) def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - 
scale=43, loc='', threshold=0.9): + maxdist=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" LOG = getLogger('processor.OcropyResegment') DSAVE('seeds', [new_labels, (components>0)]) @@ -452,13 +453,13 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) DSAVE('propagated', new_labels2) # dilate/grow labels from connected components against each other and bg - new_labels = morph.spread_labels(new_labels2, maxdist=scale*2) + new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) # now propagate again to catch smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) - new_labels = morph.spread_labels(new_labels2, maxdist=scale/2) + new_labels = morph.spread_labels(new_labels2, maxdist=maxdist/4) DSAVE('spread-again', [new_labels, binarized]) # find polygon hull and modify line coords for i, line in enumerate(lines): @@ -496,7 +497,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, # get alpha shape poly = join_polygons([make_valid(Polygon(contour)) for contour in contours], - loc=line.id, scale=scale) + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) From 3346b4e2b5181398676b5a476e18c3866dbb6306 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 30 May 2024 01:14:33 +0200 Subject: [PATCH 068/194] test assets: workaround for core#1189 / gt_structure_text#2 --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/test_lib.bash b/tests/test_lib.bash index 199e2a7b..f28acb1e 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.4.3/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 38ce45bf016546b748cce65031cad3fe24a35c0d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 30 May 2024 08:23:40 +0200 Subject: [PATCH 069/194] CircleCI: install JRE --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 470197da..35f0a966 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,6 +8,7 @@ jobs: PYTHON: python3 steps: - checkout + - run: apt-get update && apt-get -y install default-jre-headless - run: make install - run: make test V="" @@ -20,7 +21,7 @@ jobs: - checkout - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ docker_layer_caching: true - - run: make docker TAG=$DOCKER_TAG + - run: make docker-build TAG=$DOCKER_TAG - run: name: Login to Docker Hub command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin From 8d65708cc8ee6f42d00796d9dc1ed441b7cd7474 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 13:26:38 +0200 Subject: [PATCH 070/194] resegment: fix 2 edge cases --- ocrd_cis/ocropy/resegment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2495cadd..a337b5e0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -262,7 +262,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, label in 
enumerate(labels): distances[i] = morph.dist_labels(label.astype(np.uint8)) # normalize the distances of all lines so larger ones do not displace smaller ones - distances[i] = distances[i] / distances[i].max() * 255 + if distances[i].any(): + distances[i] = distances[i] / distances[i].max() * 255 # use depth to flatten overlapping lines as seed labels new_labels = np.argmax(distances, axis=0) else: @@ -496,7 +497,8 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, else: # get alpha shape poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours], + for contour in contours + if len(contour) >= 4], loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) From eb4efe1e7bd21e9a1cf5d7c18dcca4d868a92f66 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 13:33:30 +0200 Subject: [PATCH 071/194] ocrd-tool.json: add Ocropy default model resources --- ocrd_cis/ocrd-tool.json | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 7e5203c1..a93917da 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -308,7 +308,33 @@ "content-type": "application/gzip", "description": "ocropy model to apply (e.g. 
fraktur.pyrnn.gz)" } - } + }, + "resources": [ + { + "url": "https://github.com/zuphilip/ocropy-models/raw/master/en-default.pyrnn.gz", + "name": "en-default.pyrnn.gz", + "description": "Default ocropy model for English", + "size": 83826134 + }, + { + "url": "https://github.com/zuphilip/ocropy-models/raw/master/fraktur.pyrnn.gz", + "name": "fraktur.pyrnn.gz", + "description": "Default ocropy fraktur model", + "size": 43882365 + }, + { + "url": "https://github.com/jze/ocropus-model_fraktur/raw/master/fraktur.pyrnn.gz", + "name": "fraktur-jze.pyrnn.gz", + "description": "ocropy fraktur model by github.com/jze", + "size": 2961298 + }, + { + "url": "https://github.com/chreul/OCR_Testdata_EarlyPrintedBooks/raw/master/LatinHist-98000.pyrnn.gz", + "name": "LatinHist.pyrnn.gz", + "description": "ocropy historical latin model by github.com/chreul", + "size": 16989864 + } + ] }, "ocrd-cis-ocropy-segment": { "executable": "ocrd-cis-ocropy-segment", From 842b4c25e5cd1529aaa533dc0b5f552c16c53c1a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 13:47:03 +0200 Subject: [PATCH 072/194] docker: adapt to core using /build already --- Dockerfile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 71e8b09f..efffa9d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,14 +24,14 @@ RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ FROM base AS profiler RUN apt-get update \ && apt-get -y install --no-install-recommends cmake g++ libcppunit-dev libxerces-c-dev \ - && git clone ${GITURL}/Profiler --branch devel --single-branch /build \ - && cd /build \ + && git clone ${GITURL}/Profiler --branch devel --single-branch /build/Profiler \ + && pushd /build/Profiler \ && cmake -DCMAKE_BUILD_TYPE=release . 
\ && make compileFBDic trainFrequencyList runDictSearch profiler \ && mkdir /apps \ && cp bin/compileFBDic bin/trainFrequencyList bin/profiler bin/runDictSearch /apps/ \ - && cd / \ - && rm -rf /build + && popd \ + && rm -rf /build/Profiler FROM profiler AS languagemodel # install the profiler's language backend @@ -40,13 +40,13 @@ COPY --from=profiler /apps/trainFrequencyList /apps/ COPY --from=profiler /apps/runDictSearch /apps/ RUN apt-get update \ && apt-get -y install --no-install-recommends icu-devtools \ - && git clone ${GITURL}/Resources --branch master --single-branch /build \ - && cd /build/lexica \ + && git clone ${GITURL}/Resources --branch master --single-branch /build/Resources \ + && pushd /build/Resources/lexica \ && PATH=$PATH:/apps make \ && PATH=$PATH:/apps make test \ && PATH=$PATH:/apps make install \ - && cd / \ - && rm -rf /build + && popd \ + && rm -rf /build/Resources FROM base AS postcorrection # install ocrd_cis (python) @@ -56,13 +56,13 @@ COPY --from=profiler /apps/profiler /apps/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ -COPY . /build +COPY . /build/ocrd_cis RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ - && cd /build \ + && pushd /build/ocrd_cis \ && make install \ # test always fail, resources not available for download. Resources should be made available # somewhere else, e.g. 
github.com/OCR-D/assets # && make test \ - && cd / \ - && rm -rf /build + && popd \ + && rm -rf /build/ocrd_cis From 53ae7d69fac017100bcdae2573d643a28c6a8f84 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 17:28:25 +0200 Subject: [PATCH 073/194] use importlib instead of pkg_resources via ocrd_utils --- ocrd_cis/ocrd_tool.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py index 8cd184fb..0f06e55f 100644 --- a/ocrd_cis/ocrd_tool.py +++ b/ocrd_cis/ocrd_tool.py @@ -1,5 +1,5 @@ import json -from pkg_resources import resource_string +from ocrd_utils import resource_string def get_ocrd_tool(): diff --git a/setup.py b/setup.py index a5e19979..fcdf0a44 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.30', + 'ocrd>=2.47', 'click', 'scipy', 'numpy>=1.17.0', From fed84da7c731c7e2ed3840122df7f5345c465534 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 31 Jul 2024 15:59:17 +0200 Subject: [PATCH 074/194] fix 53ae7d69 (already str not bytes) --- ocrd_cis/ocrd_tool.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py index 0f06e55f..36cb9d7e 100644 --- a/ocrd_cis/ocrd_tool.py +++ b/ocrd_cis/ocrd_tool.py @@ -3,5 +3,4 @@ def get_ocrd_tool(): - return json.loads( - resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + return json.loads(resource_string(__name__, 'ocrd-tool.json')) From 5282092997ad6b2b53a3cef8c3c96fbb27066682 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 31 Jul 2024 16:03:48 +0200 Subject: [PATCH 075/194] recognize: replace python-levenshtein with rapidfuzz --- ocrd_cis/ocropy/recognize.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index e9259c6e..74d858ab 100644 --- 
a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -5,7 +5,7 @@ import numpy as np from PIL import Image -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd_utils import ( getLogger, diff --git a/setup.py b/setup.py index fcdf0a44..6df9445c 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ 'scikit-image', 'networkx', 'opencv-python-headless', - 'python-Levenshtein' + 'rapidfuzz' ], extras_require={ 'debug': ['matplotlib>3.0.0'], From a382d6fbcb64f2890ba4f22a38b8d1484b88e3df Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 31 Jul 2024 16:21:34 +0200 Subject: [PATCH 076/194] fix+update dockerfile --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index efffa9d9..e7b2249a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ocrd/core:latest AS base +FROM ocrd/core:v2.67.2 AS base ARG VCS_REF ARG BUILD_DATE LABEL \ @@ -7,10 +7,11 @@ LABEL \ org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_cis" \ org.label-schema.build-date=$BUILD_DATE -ENV VERSION="Di 12. 
Mai 13:26:35 CEST 2020" ENV GITURL="https://github.com/cisocrgroup" ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" +SHELL ["/bin/bash", "-c"] + # deps RUN apt-get update \ && apt-get -y install --no-install-recommends locales From 2ed2c4f89ab4611d24e0a9328479124f88750ca1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:41:03 +0200 Subject: [PATCH 077/194] add executable property --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 8 +++++--- ocrd_cis/ocropy/denoise.py | 8 +++++--- ocrd_cis/ocropy/deskew.py | 6 +++++- ocrd_cis/ocropy/dewarp.py | 10 ++++++---- ocrd_cis/ocropy/recognize.py | 10 ++++++---- ocrd_cis/ocropy/resegment.py | 8 +++++--- ocrd_cis/ocropy/segment.py | 8 +++++--- ocrd_cis/ocropy/train.py | 6 +++++- 9 files changed, 48 insertions(+), 26 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 872185c3..7429d14a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -28,8 +28,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-binarize' - def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) @@ -71,13 +69,17 @@ class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyBinarize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-binarize' + def setup(self): self.logger = getLogger('processor.OcropyBinarize') if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': diff --git 
a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a305f09e..919b26b0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -31,16 +31,18 @@ pil2array, array2pil ) -TOOL = 'ocrd-cis-ocropy-clip' - class OcropyClip(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyClip, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-clip' + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cbbdf8cf..ac3c4dc5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,16 +19,18 @@ # binarize, remove_noise) -TOOL = 'ocrd-cis-ocropy-denoise' - class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDenoise, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-denoise' + def process(self): """Despeckle the pages / regions / lines of the workspace. 
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 4ed04218..fe61fce3 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -34,10 +34,14 @@ class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyDeskew, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-deskew' + def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 7d3251bf..1bc4a805 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -24,8 +24,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-dewarp' - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -72,13 +70,17 @@ class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDewarp, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-dewarp' + def setup(self): # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 74d858ab..5734aa92 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -30,8 +30,6 @@ check_line ) -TOOL = 'ocrd-cis-ocropy-recognize' - def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) @@ -85,13 +83,17 @@ def __init__(self, *args, **kwargs): self.ocrd_tool = 
get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyRecognize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-recognize' + def setup(self): self.logger = getLogger('processor.OcropyRecognize') # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index a337b5e0..2b1f73c3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -46,16 +46,18 @@ diff_polygons ) -TOOL = 'ocrd-cis-ocropy-resegment' - class OcropyResegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super().__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-resegment' + def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 49cb6776..1624597e 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,8 +58,6 @@ lines2regions ) -TOOL = 'ocrd-cis-ocropy-segment' - def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. 
@@ -248,10 +246,14 @@ class OcropySegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropySegment, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-segment' + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index d257a61f..46e9d258 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -32,13 +32,17 @@ class OcropyTrain(Processor): def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyTrain, self).__init__(*args, **kwargs) if hasattr(self, 'input_file_grp'): # processing context self.setup() + @property + def executable(self): + return 'ocrd-cis-ocropy-train' + def setup(self): self.log = getLogger('processor.OcropyTrain') #print(self.parameter) From 61e6caf06ff479d4e6a8c59d85254d5a25fa79e4 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:54:46 +0200 Subject: [PATCH 078/194] add setup method if missing --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 5 +++++ ocrd_cis/ocropy/denoise.py | 5 +++++ ocrd_cis/ocropy/deskew.py | 5 +++++ ocrd_cis/ocropy/dewarp.py | 4 +++- ocrd_cis/ocropy/recognize.py | 4 +++- ocrd_cis/ocropy/resegment.py | 5 +++++ ocrd_cis/ocropy/segment.py | 5 +++++ ocrd_cis/ocropy/train.py | 2 +- 9 files changed, 38 insertions(+), 7 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7429d14a..f42ff2bd 100644 --- a/ocrd_cis/ocropy/binarize.py +++ 
b/ocrd_cis/ocropy/binarize.py @@ -68,6 +68,7 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyBinarize') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -81,10 +82,11 @@ def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') - if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': - self.logger.critical('requested method %s does not support grayscale normalized output', - self.parameter['method']) + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + method = self.parameter['method'] + if self.parameter['grayscale'] and method != 'ocropy': + self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') def process(self): diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 919b26b0..d11b8eae 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -34,6 +34,7 @@ class OcropyClip(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -43,6 +44,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-clip' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. 
diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index ac3c4dc5..fc1b582e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -22,6 +22,7 @@ class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -31,6 +32,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-denoise' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fe61fce3..1ffaec62 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -33,6 +33,7 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -42,6 +43,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-deskew' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Deskew the pages or regions of the workspace. 
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 1bc4a805..89a62e11 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -69,6 +69,7 @@ def padvert(image, range_): class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -82,6 +83,8 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], @@ -91,7 +94,6 @@ def setup(self): # dependency between smoothness # and extra params) 0.3)) - self.logger = getLogger('processor.OcropyDewarp') def process(self): """Dewarp the lines of the workspace. diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5734aa92..fdeaed27 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -80,6 +80,7 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -95,7 +96,8 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2b1f73c3..d9a92390 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -49,6 
+49,7 @@ class OcropyResegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -58,6 +59,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-resegment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 1624597e..7488eefe 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -245,6 +245,7 @@ def getx(xy): class OcropySegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropySegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -254,6 +255,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-segment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 46e9d258..25317c4d 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -30,6 +30,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): def __init__(self, *args, **kwargs): + self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -44,7 +45,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From a0965c2aa7d6315f001606bc1c6043a020095ef9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 14:02:55 +0200 Subject: [PATCH 079/194] add self.logger wherever missing --- ocrd_cis/ocropy/clip.py | 20 +++--- ocrd_cis/ocropy/denoise.py | 16 ++--- ocrd_cis/ocropy/deskew.py | 14 ++-- ocrd_cis/ocropy/resegment.py | 74 +++++++++---------- ocrd_cis/ocropy/segment.py | 136 ++++++++++++++++++----------------- 5 files changed, 129 insertions(+), 131 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d11b8eae..4c0eebea 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -83,13 +83,12 @@ def process(self): # too. However, region-level clipping _must_ be run before region-level # deskewing, because that would make segments incomensurable with their # neighbours. 
- LOG = getLogger('processor.OcropyClip') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -105,7 +104,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -127,7 +126,7 @@ def process(self): page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -158,7 +157,7 @@ def process(self): if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" already contains image data: skipping', page_id, region.id) continue shape = prep(shapes[i]) @@ -176,7 +175,7 @@ def process(self): # level == 'line': lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -194,7 +193,7 @@ def process(self): for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', page_id, region.id, line.id) continue shape = prep(shapes[j]) @@ -219,13 +218,12 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id): - LOG = getLogger('processor.OcropyClip') # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -237,7 +235,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on 
page "%s"', + self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', neighbour.id, segment.id, page_id) continue # find connected components that (only) belong to the neighbour: @@ -247,7 +245,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', + self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', segment.id, neighbour.id, num_intruders, num_foreground, page_id) # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fc1b582e..d6a4f7ff 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -57,13 +57,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDenoise') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -80,7 +79,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -91,7 +90,7 @@ def process(self): else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -102,7 +101,7 @@ def process(self): continue lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -121,15 +120,14 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): - LOG = getLogger('processor.OcropyDenoise') if not segment_image.width or not 
segment_image.height: - LOG.warning("Skipping '%s' with zero size", file_id) + self.logger.warning("Skipping '%s' with zero size", file_id) return - LOG.info("About to despeckle '%s'", file_id) + self.logger.info("About to despeckle '%s'", file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update METS (add the image file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 1ffaec62..63bb6b97 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -65,13 +65,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyDeskew') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -95,7 +94,7 @@ def process(self): else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: # process region: region_image, region_coords = self.workspace.image_from_segment( @@ -118,23 +117,22 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): - LOG = getLogger('processor.OcropyDeskew') if not 
segment_image.width or not segment_image.height: - LOG.warning("Skipping %s with zero size", segment_id) + self.logger.warning("Skipping %s with zero size", segment_id) return angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - LOG.info("About to deskew %s", segment_id) + self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - LOG.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info("Found angle for %s: %.1f", segment_id, angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9a92390..2261cf3e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -105,7 +105,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyResegment') # This makes best sense for bad/coarse line segmentation, like current GT # or as postprocessing for bbox-only steps like Tesseract. 
# Most notably, it can convert rectangles to polygons (polygonalization), @@ -120,7 +119,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -136,7 +135,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -156,14 +155,14 @@ def process(self): page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning('Page "%s" contains no text regions with lines', page_id) else: for region in regions: lines = region.get_TextLine() @@ -172,7 +171,7 @@ def process(self): region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): file_path = os.path.join(self.output_file_grp, file_id + '.xml') @@ -184,11 +183,10 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, 
content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): - LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] maxdist = self.parameter['spread']/zoom*300/72 # in pt @@ -206,7 +204,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - LOG.warning('Invalid %s "%s": %s', tag, + self.logger.warning('Invalid %s "%s": %s', tag, page_id if fullpage else parent.id, report) return # get existing line labels: @@ -234,7 +232,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - LOG.debug('unmasking area of text region "%s" for "%s"', + self.logger.debug('unmasking area of text region "%s" for "%s"', region.id, page_id if fullpage else parent.id) region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) @@ -244,14 +242,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - LOG.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], + self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - 
LOG.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -260,7 +258,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 15/zoom)[components] - LOG.debug("estimated scale: %d", scale) + self.logger.debug("estimated scale: %d", scale) else: scale = 43 if method == 'ccomps': @@ -278,7 +276,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - LOG.warning("Skipping '%s' without baseline", line.id) + self.logger.warning("Skipping '%s' without baseline", line.id) new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -289,22 +287,23 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) return try: + # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.error('Cannot line-segment %s "%s": %s', + self.logger.error('Cannot line-segment %s "%s": %s', tag, page_id if fullpage else parent.id, err) return - LOG.info("Found %d new line 
labels for %d existing lines on %s '%s'", + self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -387,41 +386,41 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - LOG.debug("no lines for '%s' match or fit", line.id) + self.logger.debug("no lines for '%s' match or fit", line.id) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - LOG.debug("new lines for '%s' only cover %.1f%% bg", + self.logger.debug("new lines for '%s' only cover %.1f%% bg", line.id, covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - LOG.debug("new lines for '%s' only cover %.1f%% fg", + self.logger.debug("new lines for '%s' only cover %.1f%% fg", line.id, covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - LOG.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", + self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", line.id, np.count_nonzero(looses), covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + self.logger.debug('Black pixels before/after resegment of 
line "%s": %d/%d', line.id, line_count, new_count) # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + for i in new_lines], loc=line.id, logger=self.logger) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -436,7 +435,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -445,14 +444,15 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) def spread_dist(lines, old_labels, 
new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + maxdist=43, loc='', threshold=0.9, logger = None): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - LOG = getLogger('processor.OcropyResegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) @@ -477,29 +477,29 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label) if not count: - LOG.warning("skipping zero-area line '%s'", line.id) + logger.warning("skipping zero-area line '%s'", line.id) continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - LOG.debug("new line for '%s' only covers %.1f%% bg", + logger.debug("new line for '%s' only covers %.1f%% bg", line.id, covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binary-empty line '%s'", line.id) + logger.warning("skipping binary-empty line '%s'", line.id) continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - LOG.debug("new line for '%s' only covers %.1f%% fg", + logger.debug("new line for '%s' only covers %.1f%% fg", line.id, covers * 100) continue - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + logger.debug('Black pixels before/after resegment of line "%s": %d/%d', line.id, count, covers * count) contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - LOG.warning("no contours for %s - keeping", line.id) + logger.warning("no contours for %s - keeping", line.id) continue else: # get alpha shape @@ -511,7 +511,7 @@ def spread_dist(lines, old_labels, 
new_labels, binarized, components, coords, polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - LOG.warning("Ignoring extant line for %s", line.id) + logger.warning("Ignoring extant line for %s", line.id) continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 7488eefe..35f309b6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -75,7 +75,8 @@ def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=N - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. 
""" - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -92,7 +93,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - LOG.debug('skipping label %d in %s due to empty fg', + logger.debug('skipping label %d in %s due to empty fg', label, name) continue # simplify to convex hull @@ -101,7 +102,7 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', + logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', label, str(conflicts)) else: bg_mask = hull @@ -130,7 +131,7 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", + logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", label, idx, len(contour), idx_hole, len(hole)) #plot_poly(hole, 'blue') # cut child from outside... 
@@ -172,7 +173,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -181,7 +182,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) contours.append(contour) idx = hier[0, idx, 0] else: @@ -207,7 +208,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - LOG.warning('Label %d contour %d is too small (%d/%d) in %s', + logger.warning('Label %d contour %d is too small (%d/%d) in %s', label, i, area, total_area, name) continue # simplify shape: @@ -217,22 +218,22 @@ def getx(xy): # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: - #LOG.debug(polygon.wkt) - LOG.debug(explain_validity(polygon)) + #logger.debug(polygon.wkt) + logger.debug(explain_validity(polygon)) polygon = make_valid(polygon) if not polygon.is_valid: #LOG.debug(polygon.wkt) - LOG.warning(explain_validity(polygon)) + logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: base = 
join_baselines([baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name) + if baseline.intersects(polygon)], name, logger) if base is not None: base = base.coords else: @@ -324,7 +325,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropySegment') # FIXME: allow passing a-priori info on reading order / textline order # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture # of different scripts; also, vertical writing needs internal rotation @@ -339,7 +339,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -356,7 +356,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -393,7 +393,7 @@ def process(self): if regions: # page is already region-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in page "%s"', page_id) + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -401,7 +401,7 @@ def process(self): page.set_ReadingOrder(None) ro = None else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -425,20 +425,20 @@ 
def process(self): ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - LOG.warning('Page "%s" contains no table regions', page_id) + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem) + reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) else: - LOG.warning('skipping table "%s" with existing TextRegions', region.id) + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -449,24 +449,24 @@ def process(self): # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -488,14 +488,14 @@ def process(self): region.add_TextRegion(subregion) regions.append(subregion) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: if region.get_TextLine(): if 
overwrite_lines: - LOG.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - LOG.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -519,7 +519,7 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): @@ -540,16 +540,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, in full page/table mode, then combine all separators among them with the newly detected separators to guide region segmentation. """ - LOG = getLogger('processor.OcropySegment') if not image.width or not image.height: - LOG.warning("Skipping '%s' with zero size", element_id) + self.logger.warning("Skipping '%s' with zero size", element_id) return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - LOG.debug('masking foreground of %s "%s" for "%s"', + self.logger.debug('masking foreground of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, element_id) # mark these segments (e.g. 
separator regions, tables, images) # for workflows where they have been detected already; @@ -583,7 +582,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - LOG.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -601,14 +600,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - LOG.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error('Cannot line-segment region "%s": %s', element_id, err) # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) return - LOG.info('Found %d text lines for %s "%s"', + self.logger.info('Found %d text lines for %s "%s"', len(np.unique(line_labels)) - 1, element_name, element_id) # post-process line labels @@ -631,11 +630,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - LOG.info('Found %d text regions for %s "%s"', + self.logger.info('Found %d text regions for %s "%s"', len(np.unique(region_labels)) - 1, element_name, element_id) except Exception as err: - LOG.error('Cannot region-segment %s "%s": %s', + self.logger.error('Cannot region-segment %s "%s": %s', element_name, element_id, err) region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), 
line_labels) @@ -669,7 +668,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - LOG.debug('Region label %d is for ignored region "%s"', + self.logger.debug('Region label %d is for ignored region "%s"', region_label, region.id) continue # normal case: new lines inside new regions @@ -685,11 +684,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, regions, _ = masks2polygons(region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + simplify=ignore_labels * ~(sep_bin), + logger=self.logger) # find contours for lines (can be non-contiguous) lines, _ = masks2polygons(region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, + logger=self.logger) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -698,12 +699,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning('Ignoring extant region contour for region label %d', region_label) continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - LOG.debug('Region label %d becomes ID "%s"', region_label, region_id) + self.logger.debug('Region label %d becomes ID "%s"', region_label, region_id) region = TextRegionType( id=region_id, Coords=CoordsType( 
points=points_from_polygon(region_polygon))) @@ -717,13 +718,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - LOG.warning('Ignoring extant line contour for region label %d line label %d', + self.logger.warning('Ignoring extant line contour for region label %d line label %d', region_label, line_label) continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) + self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -733,22 +734,22 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - LOG.info('Added region "%s" with %d lines for %s "%s"', + self.logger.info('Added region "%s" with %d lines for %s "%s"', region_id, line_no, element_name, element_id) if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... 
- LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + '%s "%s"' % (element_name, element_id), self.logger) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning('Ignoring extant region contour for image label %d', image_label) continue region_no += 1 # annotate result: @@ -757,17 +758,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + open_holes=True, reorder=False, logger=self.logger) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning('Ignoring extant region contour for separator %d', sep_label) continue # annotate result: region_no += 1 
@@ -795,14 +796,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # find contours around labels (can be non-contiguous): line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - LOG.warning('Ignoring extant line contour for line label %d', + self.logger.warning('Ignoring extant line contour for line label %d', line_label) continue # annotate result: @@ -937,8 +938,9 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc=''): - LOG = getLogger('processor.OcropyResegment') +def join_baselines(baselines, loc='', logger = None): + if not logger: + raise ValueError(f"Logger has not been passed by the caller") lines = [] for baseline in baselines: if (baseline.is_empty or @@ -955,9 +957,9 @@ def join_baselines(baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) else: - LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) nlines = len(lines) if nlines == 0: return None @@ -1019,7 +1021,7 @@ def join_baselines(baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning("baseline merge impossible (no spanning tree) in %s", loc) return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -1031,7 
+1033,7 @@ def join_baselines(baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - LOG.warning("baseline merge is empty in %s", loc) + logger.warning("baseline merge is empty in %s", loc) return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) @@ -1080,7 +1082,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem): +def page_subgroup_in_reading_order(roelem, logger = None): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1094,12 +1096,14 @@ def page_subgroup_in_reading_order(roelem): Return the new group object. """ - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") + if not roelem: - LOG.error('Cannot subgroup from empty ReadingOrder element') + logger.error('Cannot subgroup from empty ReadingOrder element') return roelem if not roelem.parent_object_: - LOG.error('Cannot subgroup from orphan ReadingOrder element') + logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or From dbccae58d9213d5df4e072502a7eae8484902ef6 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 14:57:16 +0200 Subject: [PATCH 080/194] require core >= 3.0.0a1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6df9445c..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.47', + 'ocrd>=3.0.0a1', 'click', 'scipy', 'numpy>=1.17.0', From 8557a26dc75cf858f9e6819296389f71ab972cf3 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 15:26:32 +0200 Subject: [PATCH 081/194] port part of 
binarize to core v3 --- ocrd_cis/ocropy/binarize.py | 157 ++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 87 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index f42ff2bd..c3b4cded 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,9 +1,13 @@ from __future__ import absolute_import +import logging import os.path +import PIL import cv2 import numpy as np from PIL import Image +from os.path import join +from ocrd_models import OcrdExif #import kraken.binarization @@ -15,11 +19,10 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + OcrdPage, to_xml, AlternativeImageType ) from ocrd import Processor -from .. import get_ocrd_tool from . import common from .common import ( pil2array, array2pil, @@ -64,18 +67,20 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom class OcropyBinarize(Processor): - - def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyBinarize') - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyBinarize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + logger : logging.Logger @property def executable(self): @@ -84,16 +89,16 @@ def executable(self): def setup(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) + self.logger = 
getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') - def process(self): + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + THEN Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout @@ -109,80 +114,61 @@ def process(self): Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. 
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_page(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, feature_filter='binarized') - if level == 'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - 
line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + + ret = [pcgts] + if level == 'page': + try: + ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + except ValueError as e: + self.logger.exception(e) + else: + # TODO + raise NotImplementedError + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + if level == 'region': + self.process_region(region, region_image, region_xywh, zoom, + input_file.pageId, file_id + '_' + region.id) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', + page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_filter='binarized') + self.process_line(line, line_image, line_xywh, zoom, + input_file.pageId, region.id, + file_id + '_' + region.id + '_' + line.id) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, 
file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - self.logger.warning("Skipping page '%s' with zero size", page_id) - return + raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) + assert self.output_file_grp + features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: # orientation has already been annotated (by previous deskewing), @@ -216,13 +202,10 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): else: file_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{file_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return (bin_image, file_id, bin_image_path) def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): if not region_image.width or not region_image.height: From 278b706246e24ec0fc0b5030aff6d16673bad817 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:08:10 +0200 Subject: [PATCH 082/194] move: determine_zoom to common.py --- ocrd_cis/ocropy/binarize.py | 18 ++---------------- ocrd_cis/ocropy/clip.py | 15 +++------------ ocrd_cis/ocropy/common.py | 14 +++++++++++++- ocrd_cis/ocropy/denoise.py | 15 ++++----------- ocrd_cis/ocropy/deskew.py | 6 +----- ocrd_cis/ocropy/dewarp.py | 18 ++++-------------- ocrd_cis/ocropy/resegment.py | 14 ++++---------- ocrd_cis/ocropy/segment.py | 
13 +++---------- 8 files changed, 34 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index c3b4cded..b5e2bc7e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -7,7 +7,6 @@ import numpy as np from PIL import Image from os.path import join -from ocrd_models import OcrdExif #import kraken.binarization @@ -25,9 +24,8 @@ from . import common from .common import ( - pil2array, array2pil, # binarize, - remove_noise) + array2pil, determine_zoom, pil2array, remove_noise) #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -67,18 +65,6 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: - if dpi > 0: - zoom = 300.0/dpi - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - zoom = 300.0/dpi - else: - zoom = 1 - return zoom - class OcropyBinarize(Processor): logger : logging.Logger @@ -126,7 +112,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 4c0eebea..3b854897 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -28,8 +28,7 @@ from .ocrolib import midrange, morph from .common import ( # binarize, - pil2array, array2pil -) + array2pil, determine_zoom, pil2array) class OcropyClip(Processor): @@ -98,16 +97,8 @@ def process(self): page_image, 
page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..1804c29d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -10,7 +10,7 @@ from skimage.morphology import medial_axis import networkx as nx from PIL import Image - +from ocrd_models import OcrdExif from . import ocrolib from .ocrolib import morph, psegutils, sl # for decorators (type-checks etc): @@ -2102,3 +2102,15 @@ def find_topological(): # rlabels[region_hull] = region # DSAVE('rlabels_closed', rlabels) return rlabels + +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d6a4f7ff..d8554a3e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -17,7 +17,7 @@ from .. 
import get_ocrd_tool from .common import ( # binarize, - remove_noise) + determine_zoom, remove_noise) class OcropyDenoise(Processor): @@ -73,16 +73,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized' if level == 'page' else '') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 63bb6b97..055ab27d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -17,14 +17,10 @@ from .. import get_ocrd_tool from . import common -from .common import ( - pil2array -) +from .common import pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-deskew' - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89a62e11..4c9a1bdb 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -17,10 +17,7 @@ from .. 
import get_ocrd_tool from .ocrolib import lineest -from .common import ( - pil2array, array2pil, - check_line, -) +from .common import array2pil, check_line, determine_zoom, pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -128,16 +125,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2261cf3e..e4681b23 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -30,6 +30,7 @@ pil2array, odd, DSAVE, + determine_zoom, # binarize, check_page, check_region, @@ -129,16 +130,9 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 35f309b6..e13c3d71 100644 --- a/ocrd_cis/ocropy/segment.py +++ 
b/ocrd_cis/ocropy/segment.py @@ -53,6 +53,7 @@ pil2array, array2pil, check_page, check_region, + determine_zoom, hmerge_line_seeds, compute_segmentation, lines2regions @@ -350,16 +351,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 6beec175ed89e321cae93917dbe02bd2809cd83b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:14:31 +0200 Subject: [PATCH 083/194] move: logger init to setup() --- ocrd_cis/ocropy/binarize.py | 6 +++--- ocrd_cis/ocropy/clip.py | 4 +++- ocrd_cis/ocropy/denoise.py | 5 +++-- ocrd_cis/ocropy/deskew.py | 5 +++-- ocrd_cis/ocropy/dewarp.py | 5 +++-- ocrd_cis/ocropy/recognize.py | 5 +++-- ocrd_cis/ocropy/resegment.py | 5 +++-- ocrd_cis/ocropy/segment.py | 6 ++++-- ocrd_cis/ocropy/train.py | 5 +++-- 9 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index b5e2bc7e..cc34690e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -import logging +from logging import Logger import os.path import PIL @@ -66,16 +66,16 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger : logging.Logger + logger: Logger 
@property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): + self.logger = getLogger('processor.OcropyBinarize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3b854897..1b7fb28b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from logging import Logger import os.path import numpy as np @@ -31,9 +32,9 @@ array2pil, determine_zoom, pil2array) class OcropyClip(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -44,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-clip' def setup(self): + self.logger = getLogger('processor.OcropyClip') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d8554a3e..34750a53 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils import ( @@ -20,9 +20,9 @@ determine_zoom, remove_noise) class OcropyDenoise(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -33,6 +33,7 @@ def executable(self): return 
'ocrd-cis-ocropy-denoise' def setup(self): + self.logger = getLogger('processor.OcropyDenoise') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 055ab27d..2eb898ca 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils import ( @@ -27,9 +27,9 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -40,6 +40,7 @@ def executable(self): return 'ocrd-cis-ocropy-deskew' def setup(self): + self.logger = getLogger('processor.OcropyDeskew') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 4c9a1bdb..cad280c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np @@ -64,9 +64,9 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -80,6 +80,7 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + self.logger = getLogger('processor.OcropyDewarp') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: diff --git 
a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index fdeaed27..8e147fea 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os.path import numpy as np @@ -78,9 +78,9 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -96,6 +96,7 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): + self.logger = getLogger('processor.OcropyRecognize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e4681b23..1e920b0f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np from skimage import draw, segmentation @@ -48,9 +48,9 @@ ) class OcropyResegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -61,6 +61,7 @@ def executable(self): return 'ocrd-cis-ocropy-resegment' def setup(self): + self.logger = getLogger('processor.OcropyResegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e13c3d71..3b89bda6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - 
+from logging import Logger import os.path import itertools import numpy as np @@ -245,9 +245,10 @@ def getx(xy): class OcropySegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropySegment') + self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -258,6 +259,7 @@ def executable(self): return 'ocrd-cis-ocropy-segment' def setup(self): + self.logger = getLogger('processor.OcropySegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 25317c4d..61a918c7 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os import tempfile @@ -28,9 +28,9 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): + log: Logger def __init__(self, *args, **kwargs): - self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -45,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): + self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From 1b2fea3ed5b7c9d1a02f2dcabe0770aa3eb87da6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:16:55 +0200 Subject: [PATCH 084/194] refactor: log -> logger --- ocrd_cis/ocropy/train.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 61a918c7..9278da92 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -28,7 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - log: 
Logger + logger: Logger def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() @@ -45,7 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') + self.logger = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -54,9 +54,9 @@ def setup(self): except SystemExit: ocropydir = os.path.dirname(os.path.abspath(__file__)) modelpath = os.path.join(ocropydir, 'models', model) - self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) if not os.path.isfile(modelpath): - self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", + self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) sys.exit(1) outputpath = os.path.join(self.oldcwd, 'output', model) @@ -78,18 +78,18 @@ def process(self): """ filelist = [] filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') - #self.log.info("Using model %s in %s for recognition", model) + #self.logger.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): - #self.log.info("INPUT FILE %i / %s", n, input_file) + #self.logger.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.log.info("Extracting from page '%s'", page_id) + self.logger.info("Extracting from page '%s'", page_id) for region in page.get_AllRegions(classes=['Text']): textlines = region.get_TextLine() - self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) + self.logger.info("Extracting %i lines 
from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': path = os.path.join(filepath, page_id + region.id + line.id) @@ -110,7 +110,7 @@ def process(self): if imgpath: filelist.append(imgpath) - self.log.info("Training %s from %s on %i file pairs", + self.logger.info("Training %s from %s on %i file pairs", self.outputpath, self.modelpath or 'scratch', len(filelist)) @@ -130,7 +130,7 @@ def extract_segment(self, path, segment, page_image, page_coords): with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): From fe33494814e845cfd969a5f1a51234ceadb865a3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:32:17 +0200 Subject: [PATCH 085/194] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 19 +++++++----------- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/deskew.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/recognize.py | 20 +++++++++--------- ocrd_cis/ocropy/resegment.py | 9 +++------ ocrd_cis/ocropy/segment.py | 4 ++-- ocrd_cis/ocropy/train.py | 39 ++++++++++++++++++------------------ 9 files changed, 49 insertions(+), 58 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index cc34690e..5d3fc7c3 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,12 +1,12 @@ from __future__ import absolute_import from logging import Logger -import os.path -import PIL import cv2 import numpy as np from PIL import Image -from os.path import join +from os.path import abspath, dirname, join + +from typing import Tuple #import kraken.binarization @@ -16,18 +16,13 @@ 
assert_file_grp_cardinality, MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - OcrdPage, to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from . import common -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array, remove_noise) +from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) +#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') @@ -149,7 +144,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 1b7fb28b..b70d1fb0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -202,7 +202,7 @@ def process(self): input_file.pageId, file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 
34750a53..7cf74727 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 2eb898ca..bcd3be01 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index cad280c6..6c27c5c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from ocrd_utils import ( @@ -172,7 +172,7 @@ def process(self): comments=line_xywh['features'] + ',dewarped')) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 8e147fea..f3ecf199 100644 --- a/ocrd_cis/ocropy/recognize.py +++ 
b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import sys -import os.path +from sys import exit +from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -24,11 +24,9 @@ from ocrd import Processor from .. import get_ocrd_tool +from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange -from .common import ( - pil2array, - check_line -) + def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height @@ -112,20 +110,20 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and os.access(p, os.R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): return model except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', self.parameter['model']) + ocropydir = dirname(abspath(__file__)) + path = join(ocropydir, 'models', self.parameter['model']) self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) if canread(path): return path self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", self.parameter['model'], self.parameter['model']) - sys.exit(1) + exit(1) def process(self): """Recognize lines / words / glyphs of the workspace. 
@@ -176,7 +174,7 @@ def process(self): # update METS (add the PAGE file): file_id = make_file_id(input_file, self.output_file_grp) - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e920b0f..329694d0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,16 +1,13 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, PageType, BaselineType -) +from ocrd_models.ocrd_page import BaselineType, PageType, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -169,7 +166,7 @@ def process(self): self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 3b89bda6..446fc628 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -505,7 +505,7 @@ def process(self): input_file.pageId, zoom) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, 
file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 9278da92..ff460523 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,7 +1,8 @@ from __future__ import absolute_import from logging import Logger -import sys -import os +from sys import exit +from os import getcwd, makedirs, remove +from os.path import abspath, dirname, exists, join, isfile import tempfile from ocrd_modelfactory import page_from_file @@ -15,10 +16,10 @@ def deletefiles(filelist): for file in filelist: - if os.path.exists(file): - os.remove(file) - if os.path.exists(file[:-3]+'gt.txt'): - os.remove(file[:-3]+'gt.txt') + if exists(file): + remove(file) + if exists(file[:-3]+'gt.txt'): + remove(file[:-3]+'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) @@ -31,7 +32,7 @@ class OcropyTrain(Processor): logger: Logger def __init__(self, *args, **kwargs): - self.oldcwd = os.getcwd() + self.oldcwd = getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -52,22 +53,22 @@ def setup(self): try: modelpath = self.resolve_resource(model) except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - modelpath = os.path.join(ocropydir, 'models', model) + ocropydir = dirname(abspath(__file__)) + modelpath = join(ocropydir, 'models', model) self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) - if not os.path.isfile(modelpath): + if not isfile(modelpath): self.logger.error("Could not find model '%s'. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) - sys.exit(1) - outputpath = os.path.join(self.oldcwd, 'output', model) + exit(1) + outputpath = join(self.oldcwd, 'output', model) if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, model) + outputpath = join(self.parameter, model) else: modelpath = None - outputpath = os.path.join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.oldcwd, 'output', 'lstm') if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, 'lstm') - os.makedirs(os.path.dirname(outputpath)) + outputpath = join(self.parameter, 'lstm') + makedirs(dirname(outputpath)) self.modelpath = modelpath self.outputpath = outputpath @@ -92,20 +93,20 @@ def process(self): self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': - path = os.path.join(filepath, page_id + region.id + line.id) + path = join(filepath, page_id + region.id + line.id) imgpath = self.extract_segment(path, line, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for word in line.get_Word(): if self.parameter['textequiv_level'] == 'word': - path = os.path.join(filepath, page_id + region.id + line.id + word.id) + path = join(filepath, page_id + region.id + line.id + word.id) imgpath = self.extract_segment(path, word, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for glyph in word.get_Glyph(): - path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) + path = join(filepath, page_id + region.id + line.id + glyph.id) imgpath = self.extract_segment(path, glyph, page_image, page_coords) if imgpath: filelist.append(imgpath) From 3368a53e8341ab265ac5fa115a740cfc02bcc5ef Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:34:21 +0200 Subject: [PATCH 086/194] remove: file grp cardinality checks inside process() --- ocrd_cis/ocropy/clip.py | 2 
-- ocrd_cis/ocropy/denoise.py | 2 -- ocrd_cis/ocropy/deskew.py | 2 -- ocrd_cis/ocropy/dewarp.py | 2 -- ocrd_cis/ocropy/recognize.py | 2 -- ocrd_cis/ocropy/resegment.py | 2 -- ocrd_cis/ocropy/segment.py | 3 --- 7 files changed, 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b70d1fb0..777b3d3d 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -85,8 +85,6 @@ def process(self): # deskewing, because that would make segments incomensurable with their # neighbours. level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 7cf74727..5d3b9d44 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -59,8 +59,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index bcd3be01..16b4bc81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -63,8 +63,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 6c27c5c6..dbe512f2 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f3ecf199..4b5da4b1 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -150,8 +150,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) maxlevel = self.parameter['textequiv_level'] # self.logger.info("Using model %s in %s for recognition", model) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 329694d0..378c2fd3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -114,8 +114,6 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 446fc628..6feb6e29 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -338,9 +338,6 @@ def process(self): overwrite_order = self.parameter['overwrite_order'] oplevel = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) From ae97768ea73a900092f656c6ad42a64670525a11 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:41:13 +0200 Subject: [PATCH 087/194] remove: constructors, adapt setup() --- ocrd_cis/ocropy/clip.py | 7 ------- ocrd_cis/ocropy/denoise.py | 7 ------- ocrd_cis/ocropy/deskew.py | 7 ------- ocrd_cis/ocropy/dewarp.py | 10 ---------- ocrd_cis/ocropy/recognize.py | 19 ++++++------------- ocrd_cis/ocropy/resegment.py | 7 ------- ocrd_cis/ocropy/segment.py | 8 -------- ocrd_cis/ocropy/train.py | 17 ++++------------- 8 files changed, 10 insertions(+), 72 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 777b3d3d..62f68fcf 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -25,7 +25,6 @@ MIMETYPE_PAGE ) -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( # binarize, @@ -34,12 +33,6 @@ class OcropyClip(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyClip, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-clip' diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 5d3b9d44..a68e2e3c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -14,7 +14,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from .common import ( # binarize, determine_zoom, remove_noise) @@ -22,12 +21,6 @@ class OcropyDenoise(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDenoise, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-denoise' diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 16b4bc81..e41a557d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -15,7 +15,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from . 
import common from .common import pil2array @@ -29,12 +28,6 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): logger: Logger - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyDeskew, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-deskew' diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index dbe512f2..bb9e4098 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -15,7 +15,6 @@ from ocrd import Processor from ocrd_utils import MIMETYPE_PAGE -from .. import get_ocrd_tool from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array @@ -66,15 +65,6 @@ def padvert(image, range_): class OcropyDewarp(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDewarp, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 4b5da4b1..5880675c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,6 +1,8 @@ from __future__ import absolute_import from logging import Logger from sys import exit +from typing import Any +from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -23,7 +25,6 @@ ) from ocrd import Processor -from .. 
import get_ocrd_tool from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -77,17 +78,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - self.pad = 16 # ocropus-rpred default - self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyRecognize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + network: Any + pad: int @property def executable(self): @@ -95,6 +87,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') + self.pad = 16 assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: @@ -110,7 +103,7 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and access(p, R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 378c2fd3..17b90f65 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -21,7 +21,6 @@ MIMETYPE_PAGE ) -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( pil2array, @@ -47,12 +46,6 @@ class OcropyResegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super().__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-resegment' diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6feb6e29..f886e1d1 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -46,7 +46,6 @@ MIMETYPE_PAGE ) -from .. import get_ocrd_tool from .ocrolib import midrange from .ocrolib import morph from .common import ( @@ -247,13 +246,6 @@ def getx(xy): class OcropySegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropySegment, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-segment' diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index ff460523..08b68693 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -8,7 +8,6 @@ from ocrd_modelfactory import page_from_file from ocrd import Processor from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool from .ocropus_rtrain import * from .binarize import binarize @@ -30,16 +29,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.oldcwd = getcwd() - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyTrain, self).__init__(*args, **kwargs) - if hasattr(self, 'input_file_grp'): - # processing context - self.setup() + old_cwd: str @property def 
executable(self): @@ -47,6 +37,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') + self.old_cwd = getcwd() #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -60,12 +51,12 @@ def setup(self): self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) exit(1) - outputpath = join(self.oldcwd, 'output', model) + outputpath = join(self.old_cwd, 'output', model) if 'outputpath' in self.parameter: outputpath = join(self.parameter, model) else: modelpath = None - outputpath = join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.old_cwd, 'output', 'lstm') if 'outputpath' in self.parameter: outputpath = join(self.parameter, 'lstm') makedirs(dirname(outputpath)) From 60d02d28040f5b1bc2b4f5497f5353d4f53d5c45 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:39:18 +0200 Subject: [PATCH 088/194] completed: OcropyBinarize --- ocrd_cis/ocropy/binarize.py | 138 +++++++++++++++++------------------- 1 file changed, 65 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 5d3fc7c3..0728f852 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -116,38 +116,36 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: except ValueError as e: self.logger.exception(e) else: - # TODO - raise NotImplementedError if level == 'table': regions = page.get_TableRegion() else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f"Page '{page_id}' contains no text regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') if level == 'region': - self.process_region(region, region_image, region_xywh, 
zoom, - input_file.pageId, file_id + '_' + region.id) - continue + try: + ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + except ValueError as e: + self.logger.exception(e) lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) + self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) - + try: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) + except ValueError as e: + self.logger.exception(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - raise ValueError("Skipping page '%s' with zero size", page_id) - self.logger.info("About to binarize page '%s'", page_id) + raise ValueError(f"Skipping page '{page_id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}'") assert self.output_file_grp features = page_xywh['features'] @@ -157,18 +155,18 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T maxskew = 0 else: maxskew = self.parameter['maxskew'] - bin_image, angle = binarize(page_image, - method=self.parameter['method'], - maxskew=maxskew, - threshold=self.parameter['threshold'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + page_image, + method=self.parameter['method'], + maxskew=maxskew, + threshold=self.parameter['threshold'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' page_xywh['angle'] = angle if self.parameter['noise_maxsize']: - bin_image = 
remove_noise( - bin_image, maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage # to do consistent coordinate transforms, and non-consumers @@ -176,43 +174,43 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = -page_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) - # update METS (add the image file): if self.parameter['grayscale']: file_id += '.IMG-NRM' features += ',grayscale_normalized' else: file_id += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{file_id}.png') + bin_image_id = f'{file_id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return (bin_image, file_id, bin_image_path) + return bin_image, bin_image_id, bin_image_path - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): + def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not region_image.width or not region_image.height: - self.logger.warning("Skipping region '%s' with zero size", region.id) - return - self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id) + raise ValueError(f"Skipping region '{region.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") features = region_xywh['features'] if 'angle' in region_xywh and region_xywh['angle']: # orientation has already been annotated (by previous deskewing), # so skip deskewing here: - bin_image, _ = binarize(region_image, - method=self.parameter['method'], - maxskew=0, - nrm=self.parameter['grayscale'], - 
zoom=zoom) + bin_image, _ = binarize( + region_image, + method=self.parameter['method'], + maxskew=0, + nrm=self.parameter['grayscale'], + zoom=zoom) else: - bin_image, angle = binarize(region_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + region_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' region_xywh['angle'] = angle - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -221,33 +219,31 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - # update METS (add the image file): + bin_image_id = f'{file_id}_{region.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path - def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id): + def process_line( + self, line, line_image, line_xywh, zoom, page_id, 
region_id, file_id + ) -> Tuple[Image.Image, str, str]: if not line_image.width or not line_image.height: - self.logger.warning("Skipping line '%s' with zero size", line.id) - return - self.logger.info("About to binarize page '%s' region '%s' line '%s'", - page_id, region_id, line.id) + raise ValueError(f"Skipping line '{line.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] - bin_image, angle = binarize(line_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + line_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -256,23 +252,19 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! 
- self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'", - -angle, page_id, region_id, line.id) - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", + -angle) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - # update METS (add the image file): + bin_image_id = f'{file_id}_{region_id}_{line.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path From dcaccd4b5bb357c4f73356aaed04fd8a4483caa8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:46:34 +0200 Subject: [PATCH 089/194] remove file grp cardinality asserts --- ocrd_cis/ocropy/binarize.py | 3 --- ocrd_cis/ocropy/clip.py | 3 --- ocrd_cis/ocropy/denoise.py | 3 --- ocrd_cis/ocropy/deskew.py | 3 --- ocrd_cis/ocropy/dewarp.py | 3 --- ocrd_cis/ocropy/recognize.py | 3 --- ocrd_cis/ocropy/resegment.py | 3 --- ocrd_cis/ocropy/segment.py | 3 --- 8 files changed, 24 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0728f852..746aba5e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -13,7 +13,6 @@ from ocrd_utils import ( getLogger, make_file_id, - 
assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml @@ -69,8 +68,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 62f68fcf..3e76157b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -15,7 +15,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, bbox_from_polygon, @@ -39,8 +38,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index a68e2e3c..24852f24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -27,8 +26,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Despeckle the pages / regions / lines of the workspace. 
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index e41a557d..616864e1 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -34,8 +33,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index bb9e4098..17b69bc5 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -6,7 +6,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -71,8 +70,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDewarp') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5880675c..40de2817 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_for_segment, polygon_from_bbox, points_from_polygon, @@ -88,8 +87,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py 
b/ocrd_cis/ocropy/resegment.py index 17b90f65..2483411d 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -52,8 +51,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index f886e1d1..9a1b8e11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -38,7 +38,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -252,8 +251,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
From b178227763b834802b1e775623402b7bb5cdf84c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:51:52 +0200 Subject: [PATCH 090/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 746aba5e..27a3667c 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -118,7 +118,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") + self.logger.warning(f"Page '{page_id}' contains no regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') From 67b6107e19c604063e9dae37473fcc48e04b4558 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:52:25 +0200 Subject: [PATCH 091/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 27a3667c..fea064af 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -76,7 +76,7 @@ def setup(self): def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - THEN Iterate over the PAGE-XML element hierarchy down to the requested + Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. 
Next, for each file, crop each segment image according to the layout From 06a98b1f601d80511e73b0c366a60f574e2a8e27 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:29 +0200 Subject: [PATCH 092/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index fea064af..7e355d73 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -71,7 +71,7 @@ def setup(self): method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') - raise Exception('only method=ocropy allows grayscale=true') + raise ValueError('only method=ocropy allows grayscale=true') def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. From 1e6cd7bd53547de5c41f2100cdad8adc1a2091ca Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:45 +0200 Subject: [PATCH 093/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7e355d73..af60e613 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -21,7 +21,6 @@ from . 
import common from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') From 71bb26d9c4f0b45498625b90c9e4cd136d8e667e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:04:12 +0200 Subject: [PATCH 094/194] fix: potentially wrong dpi in logs --- ocrd_cis/ocropy/binarize.py | 4 ++-- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/resegment.py | 4 ++-- ocrd_cis/ocropy/segment.py | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index af60e613..61e959ca 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,8 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3e76157b..3607399b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 1804c29d..49e8f248 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: @@ -2113,4 +2113,4 @@ def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: zoom = 300.0/dpi else: zoom = 1 - return zoom + return zoom, dpi diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 24852f24..713af889 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,8 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17b69bc5..412724db 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,8 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2483411d..5bc9d008 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9a1b8e11..d171b6ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 64f02a32f938a00e01d6d390993246a617cbab5e Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 11:14:31 +0200 Subject: [PATCH 095/194] binarize: don't conflate region/lines seg, pass output_file_id --- ocrd_cis/ocropy/binarize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 61e959ca..817d4a8a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -123,7 +123,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + continue except ValueError as e: self.logger.exception(e) lines = region.get_TextLine() @@ -133,8 +134,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) - except ValueError as e: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + except alueError as e: self.logger.exception(e) return ret From 
d7c15c7738cdad474eb1999718c41371192e0e14 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:29:21 +0200 Subject: [PATCH 096/194] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 817d4a8a..064a733e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line, region_image, region_xywh, feature_filter='binarized') try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) - except alueError as e: + except ValueError as e: self.logger.exception(e) return ret From 19566c0567b5b23bdc4596384d3867601045ca57 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:53:35 +0200 Subject: [PATCH 097/194] try to migrate recognize --- ocrd_cis/ocropy/recognize.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 40de2817..140a3c83 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,6 +115,30 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + ret = [pcgts] + + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, maxlevel, 
page_image, page_coords) + + file_path = join(self.output_file_grp, output_file_id + '.xml') + ret.append((output_file_id, file_path)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Recognize lines / words / glyphs of the workspace. From 5f60976452011656fd05c1375055dd5ebd5f89d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:59:33 +0200 Subject: [PATCH 098/194] fix: migrate recognize --- ocrd_cis/ocropy/recognize.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 140a3c83..9729b480 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -125,18 +125,13 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - ret = [pcgts] - self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, maxlevel, page_image, page_coords) - - file_path = join(self.output_file_grp, output_file_id + '.xml') - ret.append((output_file_id, file_path)) - return ret + return [pcgts] # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): From e8b26035f0d4bd84e689ce92f8da805cb0adaf13 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:35:53 +0200 Subject: [PATCH 099/194] fix: detect_zoom logging --- ocrd_cis/ocropy/binarize.py | 5 ++--- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 5 +++-- ocrd_cis/ocropy/denoise.py | 3 +-- ocrd_cis/ocropy/dewarp.py | 3 +-- ocrd_cis/ocropy/resegment.py | 3 +-- ocrd_cis/ocropy/segment.py | 3 +-- 7 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py 
b/ocrd_cis/ocropy/binarize.py index 064a733e..387c51dc 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,9 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") - + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] if level == 'page': try: diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3607399b..dd0de012 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 49e8f248..095de5eb 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,14 +2103,15 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): +def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 + logger.info(f"Page '{page_id}' uses {dpi} DPI.") zoom = 300.0/dpi else: zoom = 1 - return zoom, dpi + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 713af889..78d11c28 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 412724db..9dddae44 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5bc9d008..e8c52a69 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index d171b6ed..c092718f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 7dfd4964be3f4e4db9bfe6ff548eda477ed36ae6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:38:05 +0200 Subject: [PATCH 100/194] update: test_lib base url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..c018d253 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 033c38ac3e3a6fdd9e74ab502d792878aad77439 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:07:22 +0200 Subject: [PATCH 101/194] logging exception -> error --- ocrd_cis/ocropy/binarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 387c51dc..0ea170e4 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -109,7 +109,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) else: if level == 'table': regions = page.get_TableRegion() @@ -125,7 +125,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) continue except ValueError as e: - 
self.logger.exception(e) + self.logger.error(e) lines = region.get_TextLine() if not lines: self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: From 46d84d58b7474adc3cb9f9b756b215efebd495e3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:50:10 +0200 Subject: [PATCH 102/194] refactor: logger as a first positional argument --- ocrd_cis/ocropy/binarize.py | 9 +++++--- ocrd_cis/ocropy/resegment.py | 18 +++++++-------- ocrd_cis/ocropy/segment.py | 43 +++++++++++++++--------------------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0ea170e4..8f7d8d3a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -22,9 +22,8 @@ from .common import array2pil, determine_zoom, pil2array, remove_noise -def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - LOG = getLogger('processor.OcropyBinarize') - LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) +def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): + logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -152,6 +151,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T else: maxskew = self.parameter['maxskew'] bin_image, angle = binarize( + self.logger, page_image, 
method=self.parameter['method'], maxskew=maxskew, @@ -191,6 +191,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ # orientation has already been annotated (by previous deskewing), # so skip deskewing here: bin_image, _ = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=0, @@ -198,6 +199,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ zoom=zoom) else: bin_image, angle = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], @@ -235,6 +237,7 @@ def process_line( self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] bin_image, angle = binarize( + self.logger, line_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e8c52a69..b18c0b5e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -265,8 +265,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 - spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) + spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) @@ -280,9 +280,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons( + new_line_polygons, 
new_line_labels = masks2polygons(self.logger, new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -392,8 +392,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id, logger=self.logger) + new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) + for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) @@ -427,11 +427,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9, logger = None): +def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, + maxdist=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 
c092718f..782425cc 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -57,7 +57,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -230,9 +230,9 @@ def getx(xy): # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines([baseline.intersection(polygon) + base = join_baselines(logger, [baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name, logger) + if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: @@ -416,7 +416,7 @@ def process(self): roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue @@ -434,7 +434,7 @@ def process(self): elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", @@ -446,7 +446,7 @@ def process(self): else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as 
subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -661,16 +661,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, None, element_bin, + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin), - logger=self.logger) + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, baselines, element_bin, + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, - logger=self.logger) + min_area=640/zoom/zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -722,8 +720,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (e.g. drop-capitals or images) ... 
self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id), self.logger) + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + '%s "%s"' % (element_name, element_id)) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -740,9 +738,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # split detected separator labels into separator regions: self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(seplines, None, element_bin, + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False, logger=self.logger) + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -774,9 +772,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -918,9 +916,7 @@ def join_polygons(polygons, loc='', scale=20): jointp = 
make_valid(jointp) return jointp -def join_baselines(baselines, loc='', logger = None): - if not logger: - raise ValueError(f"Logger has not been passed by the caller") +def join_baselines(logger: Logger, baselines, loc=''): lines = [] for baseline in baselines: if (baseline.is_empty or @@ -1062,7 +1058,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem, logger = None): +def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1076,9 +1072,6 @@ def page_subgroup_in_reading_order(roelem, logger = None): Return the new group object. """ - if not logger: - raise ValueError(f"Logger has not been passed by the caller") - if not roelem: logger.error('Cannot subgroup from empty ReadingOrder element') return roelem From f6fe4cf4caaf056ded182b498b44a610349627fc Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:54:25 +0200 Subject: [PATCH 103/194] fix: test_lib.bash data url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index c018d253..801be01a 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From aed0f95ccdc0dfe4cc26982258ef1c8acd613e1e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 16:33:31 +0200 Subject: [PATCH 104/194] fix: recognize OcrdPage import --- ocrd_cis/ocropy/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 9729b480..ccb019eb 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -19,7 +19,7 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, + to_xml, TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor From 804f031221eb4e64649e167c2f554d26555d5637 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 18:10:00 +0200 Subject: [PATCH 105/194] try to migrate clip --- ocrd_cis/ocropy/clip.py | 178 +++++++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index dd0de012..0675257b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,9 +8,7 @@ from shapely.prepared import prep from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -39,6 +37,113 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + level = self.parameter['level-of-operation'] + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] + + # FIXME: what about text regions inside table regions? 
+ regions = list(page.get_TextRegion()) + num_texts = len(regions) + regions += ( + page.get_AdvertRegion() + + page.get_ChartRegion() + + page.get_ChemRegion() + + page.get_GraphicRegion() + + page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_MathsRegion() + + page.get_MusicRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_TableRegion() + + page.get_UnknownRegion()) + if not num_texts: + self.logger.warning('Page "%s" contains no text regions', page_id) + background = ImageStat.Stat(page_image) + # workaround for Pillow#4925 + if len(background.bands) > 1: + background = tuple(background.median) + else: + background = background.median[0] + if level == 'region': + background_image = Image.new(page_image.mode, page_image.size, background) + page_array = pil2array(page_image) + page_bin = np.array(page_array <= midrange(page_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + for i, polygon in enumerate(polygons[num_texts:], num_texts): + # for non-text regions, extend mask by 3 pixels in each direction + # to ensure they do not leak components accidentally + # (accounts for bad cropping of such regions in GT): + polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open + polygons[i] = polygon + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] + for i, region in enumerate(regions): + if i >= num_texts: + break # keep non-text regions unchanged + if level == 'region': + if region.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). 
+ self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + continue + shape = prep(shapes[i]) + neighbours = [(regionj, maskj) for shapej, regionj, maskj + in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_region_file_id = f"{output_file_id}_{region.id}" + ret.append(self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, page_id, segment_region_file_id)) + continue + # level == 'line': + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + continue + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + background_image = Image.new(region_image.mode, region_image.size, background) + region_array = pil2array(region_image) + region_bin = np.array(region_array <= midrange(region_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] + for j, line in enumerate(lines): + if line.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). 
+ self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') + continue + shape = prep(shapes[j]) + neighbours = [(linej, maskj) for shapej, linej, maskj + in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" + ret.append(self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, page_id, segment_line_file_id)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. @@ -119,27 +224,24 @@ def process(self): page_array = pil2array(page_image) page_bin = np.array(page_array <= midrange(page_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) - for region in regions] + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) - for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally # (accounts for bad cropping of such regions in GT): polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) - for polygon in polygons] + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: break # keep non-text 
regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning('Page "%s" region "%s" already contains image data: skipping', - page_id, region.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj @@ -148,15 +250,15 @@ def process(self): masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(region, masks[i], polygons[i], - neighbours, background_image, - page_image, page_coords, page_bin, - input_file.pageId, file_id + '_' + region.id) + segment_region_file_id = f"{file_id}_{region.id}" + self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) continue # level == 'line': lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -164,18 +266,16 @@ def process(self): region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) - for line in lines] + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) - for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) - for polygon in polygons] + polygons = 
[coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', - page_id, region.id, line.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj @@ -184,10 +284,10 @@ def process(self): masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(line, masks[j], polygons[j], - neighbours, background_image, - region_image, region_coords, region_bin, - input_file.pageId, file_id + '_' + region.id + '_' + line.id) + segment_line_file_id = f"{file_id}_{region.id}_{line.id}" + self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) # update METS (add the PAGE file): file_path = join(self.output_file_grp, file_id + '.xml') @@ -204,7 +304,7 @@ def process(self): def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id): + page_id, file_id) -> Tuple[Image.Image, str, str]: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -216,8 +316,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on 
page "%s"', - neighbour.id, segment.id, page_id) + self.logger.info( + f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour @@ -226,8 +326,9 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', - segment.id, neighbour.id, num_intruders, num_foreground, page_id) + self.logger.debug( + f'segment "{segment.id}" vs neighbour "{neighbour.id}": suppressing {num_intruders} of ' + f'{num_foreground} pixels on page "{page_id}"') # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) segment_mask -= intruders @@ -241,11 +342,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + segment_image_id = file_id + '.IMG-CLIP' + segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) + return segment_image, segment_image_id, segment_image_path From 7bdff31747ad2c9cdb834569b8b1adf8b90303d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:51:42 +0200 Subject: [PATCH 
106/194] remove: process() methods --- ocrd_cis/ocropy/clip.py | 194 +++++++---------------------------- ocrd_cis/ocropy/recognize.py | 65 +++--------- 2 files changed, 50 insertions(+), 209 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 0675257b..9e6d8d19 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,42 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + """Clip text regions / lines of the workspace at intersections with neighbours. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested + ``level-of-operation``. + + Next, get each segment image according to the layout annotation (by cropping + via coordinates into the higher-level image), as well as all its neighbours', + binarize them (without deskewing), and make a connected component analysis. + (Segments must not already have AlternativeImage annotated, otherwise they + will be skipped.) + + Then, for each section of overlap with a neighbour, re-assign components + which are only contained in the neighbour by clipping them to white (background), + and export the (final) result as image file. + + Add the new image file to the workspace along with the output fileGrp, + and using a file ID with suffix ``.IMG-CLIP`` along with further + identification of the input element. + + Reference each new image in the AlternativeImage of the element. + + Produce a new output file by serialising the resulting hierarchy. + """ + # This makes best sense for overlapping segmentation, like current GT + # or Tesseract layout analysis. Most notably, it can suppress graphics + # and separators within or across a region or line. 
It _should_ ideally + # be run after binarization (on page level for region-level clipping, + # and on the region level for line-level clipping), because the + # connected component analysis after implicit binarization could be + # suboptimal, and the explicit binarization after clipping could be, + # too. However, region-level clipping _must_ be run before region-level + # deskewing, because that would make segments incomensurable with their + # neighbours. level = self.parameter['level-of-operation'] assert self.workspace self.logger.debug(f'Level of operation: "{level}"') @@ -143,165 +178,6 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region_image, region_coords, region_bin, page_id, segment_line_file_id)) return ret - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): - """Clip text regions / lines of the workspace at intersections with neighbours. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested - ``level-of-operation``. - - Next, get each segment image according to the layout annotation (by cropping - via coordinates into the higher-level image), as well as all its neighbours', - binarize them (without deskewing), and make a connected component analysis. - (Segments must not already have AlternativeImage annotated, otherwise they - will be skipped.) - - Then, for each section of overlap with a neighbour, re-assign components - which are only contained in the neighbour by clipping them to white (background), - and export the (final) result as image file. - - Add the new image file to the workspace along with the output fileGrp, - and using a file ID with suffix ``.IMG-CLIP`` along with further - identification of the input element. - - Reference each new image in the AlternativeImage of the element. - - Produce a new output file by serialising the resulting hierarchy. 
- """ - # This makes best sense for overlapping segmentation, like current GT - # or Tesseract layout analysis. Most notably, it can suppress graphics - # and separators within or across a region or line. It _should_ ideally - # be run after binarization (on page level for region-level clipping, - # and on the region level for line-level clipping), because the - # connected component analysis after implicit binarization could be - # suboptimal, and the explicit binarization after clipping could be, - # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their - # neighbours. - level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) - - # FIXME: what about text regions inside table regions? 
- regions = list(page.get_TextRegion()) - num_texts = len(regions) - regions += ( - page.get_AdvertRegion() + - page.get_ChartRegion() + - page.get_ChemRegion() + - page.get_GraphicRegion() + - page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_MathsRegion() + - page.get_MusicRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_TableRegion() + - page.get_UnknownRegion()) - if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) - background = ImageStat.Stat(page_image) - # workaround for Pillow#4925 - if len(background.bands) > 1: - background = tuple(background.median) - else: - background = background.median[0] - if level == 'region': - background_image = Image.new(page_image.mode, page_image.size, background) - page_array = pil2array(page_image) - page_bin = np.array(page_array <= midrange(page_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] - for i, polygon in enumerate(polygons[num_texts:], num_texts): - # for non-text regions, extend mask by 3 pixels in each direction - # to ensure they do not leak components accidentally - # (accounts for bad cropping of such regions in GT): - polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open - polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] - for i, region in enumerate(regions): - if i >= num_texts: - break # keep non-text regions unchanged - if level == 'region': - if region.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') - continue - shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], - regions[:i] + regions[i+1:], - masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_region_file_id = f"{file_id}_{region.id}" - self.process_segment( - region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) - continue - # level == 'line': - lines = region.get_TextLine() - if not lines: - self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') - continue - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - background_image = Image.new(region_image.mode, region_image.size, background) - region_array = pil2array(region_image) - region_bin = np.array(region_array <= midrange(region_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] - for j, line in enumerate(lines): - if line.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') - continue - shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], - lines[:j] + lines[j+1:], - masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_line_file_id = f"{file_id}_{region.id}_{line.id}" - self.process_segment( - line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id) -> Tuple[Image.Image, str, str]: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index ccb019eb..389cf8db 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,26 +115,8 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - maxlevel = self.parameter['textequiv_level'] - assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') - - pcgts = input_pcgts[0] - page = pcgts.get_Page() - assert page - - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.logger.info(f"Recognizing text in page '{page_id}'") - # region, line, word, 
or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) - return [pcgts] - - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): """Recognize lines / words / glyphs of the workspace. Open and deserialise each PAGE input file and its respective image, @@ -160,38 +142,21 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - # self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id) - - self.logger.info("Recognizing text in page '%s'", page_id) - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - self.process_regions(regions, maxlevel, page_image, page_coords) - - # update METS (add the PAGE file): - file_id = make_file_id(input_file, self.output_file_grp) - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, 
self.output_file_grp, out.local_filename) + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, maxlevel, page_image, page_coords) + return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): edits = 0 From 03c2f158fa02ddeae40baa93cee686be1fd0ca09 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:57:36 +0200 Subject: [PATCH 107/194] adapt: docstring of process_page_pcgts --- ocrd_cis/ocropy/clip.py | 8 ++++---- ocrd_cis/ocropy/recognize.py | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 9e6d8d19..a5f4f705 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -39,9 +39,9 @@ def setup(self): # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Clip text regions / lines of the workspace at intersections with neighbours. + """Clip text regions / lines of a page at intersections with neighbours. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ # This makes best sense for overlapping segmentation, like current GT # or Tesseract layout analysis. 
Most notably, it can suppress graphics @@ -71,7 +71,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # connected component analysis after implicit binarization could be # suboptimal, and the explicit binarization after clipping could be, # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their + # deskewing, because that would make segments incommensurable with their # neighbours. level = self.parameter['level-of-operation'] assert self.workspace diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 389cf8db..69b374ec 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,18 +115,17 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Recognize lines / words / glyphs of the workspace. + """Recognize lines / words / glyphs of a page. - Open and deserialise each PAGE input file and its respective image, + Open and deserialize the PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``textequiv_level``. If any layout annotation below the line level already exists, then remove it (regardless of ``textequiv_level``). - Set up Ocropy to recognise each text line (via coordinates into + Set up Ocropy to recognize each text line (via coordinates into the higher-level image, or from the alternative image; the image - must have been binarised/grayscale-normalised, deskewed and dewarped + must have been binarized/grayscale-normalised, deskewed and dewarped already). Rescale and pad the image, then recognize. Create new elements below the line level, if necessary. 
@@ -139,11 +138,11 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Levenshtein distance. Aggregate these scores for each file and print the line-wise and the total character error rates (CER). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ - maxlevel = self.parameter['textequiv_level'] + max_level = self.parameter['textequiv_level'] assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') + self.logger.debug(f'Max level: "{max_level}"') pcgts = input_pcgts[0] page = pcgts.get_Page() @@ -155,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): From 90ac28e1f9c9b6c95492aac765aaf5183a045be2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:11:30 +0200 Subject: [PATCH 108/194] refactor: other small things --- ocrd_cis/ocropy/clip.py | 16 +++++------ ocrd_cis/ocropy/recognize.py | 52 +++++++++++++++--------------------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a5f4f705..75b4123f 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Clip text regions / lines of a page at intersections with neighbours. 
@@ -81,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, page_image_info = self.workspace.image_from_page( + page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? + # The zoom is not used anywhere zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ret = [pcgts] @@ -104,7 +103,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -118,7 +117,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # in absolute coordinates merely for comparison/intersection shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_xywh) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally @@ -143,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: segment_region_file_id = f"{output_file_id}_{region.id}" ret.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id, segment_region_file_id)) continue 
# level == 'line': lines = region.get_TextLine() @@ -151,7 +150,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') + region, page_image, page_xywh, feature_selector='binarized') background_image = Image.new(region_image.mode, region_image.size, background) region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) @@ -164,8 +163,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 69b374ec..b9fc453f 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -101,18 +101,19 @@ def get_model(self): returns it. 
If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) + p_model = self.parameter['model'] try: - model = self.resolve_resource(self.parameter['model']) + model = self.resolve_resource(p_model) if canread(model): return model except SystemExit: ocropydir = dirname(abspath(__file__)) - path = join(ocropydir, 'models', self.parameter['model']) - self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + path = join(ocropydir, 'models', p_model) + self.logger.info(f"Failed to resolve model with OCR-D/core mechanism, trying {path}") if canread(path): return path - self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", - self.parameter['model'], self.parameter['model']) + self.logger.error( + f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: @@ -148,7 +149,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) @@ -157,37 +158,32 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] - def process_regions(self, regions, maxlevel, page_image, page_coords): + def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 lengs = 0 for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) - - 
self.logger.info("Recognizing text in region '%s'", region.id) + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + self.logger.info(f"Recognizing text in region '{region.id}'") textlines = region.get_TextLine() if not textlines: - self.logger.warning("Region '%s' contains no text lines", region.id) + self.logger.warning(f"Region '{region.id}' contains no text lines") else: - edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_coords) + edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_xywh) edits += edits_ lengs += lengs_ # update region text by concatenation for consistency - region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode - if line.get_TextEquiv() - else u'' for line in textlines) + region_unicode = u'\n'.join( + line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else u'' for line in textlines) region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) if lengs > 0: self.logger.info('CER: %.1f%%', 100.0 * edits / lengs) - def process_lines(self, textlines, maxlevel, region_image, region_coords): + def process_lines(self, textlines, maxlevel, region_image, region_xywh): edits = 0 lengs = 0 for line in textlines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords) - - self.logger.info("Recognizing text in line '%s'", line.id) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"Recognizing text in line '{line.id}'") if line.get_TextEquiv(): linegt = line.TextEquiv[0].Unicode else: @@ -198,19 +194,18 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id) + self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = 
resize_keep_ratio(line_image) # process ocropy: try: - linepred, clist, rlist, confidlist = recognize( - final_img, self.pad, self.network, check=True) + linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug('error processing line "%s": %s', line.id, err) + self.logger.debug(f'error processing line "{line.id}": {err}') continue - self.logger.debug("OCR '%s': '%s'", line.id, linepred) + self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) lengs += len(linegt) @@ -226,11 +221,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): found_char = True word_conf_list[w_no].append(confidlist[i]) word_r_list[w_no].append(rlist[i]) - if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) @@ -244,8 +237,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): # conf for the line line_conf = (min(wordsconf) + max(wordsconf))/2 # line text - line.add_TextEquiv(TextEquivType( - Unicode=linepred, conf=line_conf)) + line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) if maxlevel in ['word', 'glyph']: for word_no, word_str in enumerate(words): From f24f86b9e963e28f206662e464f8843c99deddf0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:33:04 +0200 Subject: [PATCH 109/194] fix: determine_zoom --- ocrd_cis/ocropy/binarize.py | 2 +- ocrd_cis/ocropy/clip.py | 3 ++- ocrd_cis/ocropy/common.py | 2 +- ocrd_cis/ocropy/denoise.py | 2 +- ocrd_cis/ocropy/dewarp.py | 2 +- ocrd_cis/ocropy/recognize.py | 2 +- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 8f7d8d3a..7478edb5 100644 --- a/ocrd_cis/ocropy/binarize.py 
+++ b/ocrd_cis/ocropy/binarize.py @@ -101,7 +101,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 75b4123f..400e9b54 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from logging import Logger +from typing import Tuple from os.path import join import numpy as np @@ -83,7 +84,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') # The zoom is not used anywhere - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ret = [pcgts] # FIXME: what about text regions inside table regions? 
diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 095de5eb..c6b7c49d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 78d11c28..cc622c24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9dddae44..72efca45 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,7 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index b9fc453f..bbb8e415 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -155,7 +155,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - 
self.process_regions(regions, max_level, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_xywh) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_xywh): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index b18c0b5e..1e9f8c7f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,7 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 782425cc..57368fe8 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,7 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 5f8e1dfb337d78cd757f4a6b5aff968829c2d4a1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:19:08 +0200 Subject: [PATCH 110/194] add missing Levenshtein req in setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 38f09abd..e3ee8213 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', + 'python-Levenshtein>=0.25.1', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 9a14e1dddf44515630dadbcc23b62e6951eccc5d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:53:33 +0200 Subject: [PATCH 111/194] fix: remove version req for Levenshtein --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e3ee8213..6b75d3a3 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein>=0.25.1', + 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 4ca4d1417030e40818327a7cc3571b22ad4ccda9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:59:33 +0200 Subject: [PATCH 112/194] fix: Levenshtein import --- ocrd_cis/align/cli.py | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..7747622e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -2,7 +2,7 @@ import click import json import os -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd import Processor from ocrd.decorators import ocrd_cli_options from 
ocrd.decorators import ocrd_cli_wrap_processor diff --git a/setup.py b/setup.py index 6b75d3a3..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,6 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From fbaafcb4e3f982496aafdf561a4cd4713d859f5c Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 16:23:00 +0200 Subject: [PATCH 113/194] update ocrd-cis-binarize to be compatible with bertsky/core#8 --- ocrd_cis/ocropy/binarize.py | 70 ++++++++++++++++--------------------- ocrd_cis/ocropy/common.py | 3 +- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..3c9583f9 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,21 +1,15 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join -from typing import Tuple +from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage -#import kraken.binarization - -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from . import common @@ -71,7 +65,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. 
Iterate over the PAGE-XML element hierarchy down to the requested @@ -97,16 +91,17 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.debug(f'Level of operation: "{level}"') pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + result = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +116,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + result.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,12 +127,12 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + result.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) - return ret + return result - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise 
ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") @@ -171,18 +166,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + id_suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + id_suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +211,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + id_suffix = f'{region.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # 
update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +248,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + id_suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c6b7c49d..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from typing import Optional import warnings import logging @@ 
-2103,7 +2104,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: Optional[str], dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: From 516ce4ba4bd4f65dae975472b5632d8d3b6027c2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:58:16 +0200 Subject: [PATCH 114/194] binarize: use final v3 API --- ocrd_cis/ocropy/binarize.py | 69 +++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..fa47e139 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -6,17 +6,15 @@ from PIL import Image from os.path import abspath, dirname, join -from typing import Tuple +from typing import Union, Optional #import kraken.binarization -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType +from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . 
import common from .common import array2pil, determine_zoom, pil2array, remove_noise @@ -71,7 +69,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. Iterate over the PAGE-XML element hierarchy down to the requested @@ -90,7 +88,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. + Return a PAGE-XML with new AlternativeImage(s) and the arguments + for ``workspace.save_image_file``. 
""" level = self.parameter['level-of-operation'] assert self.workspace @@ -103,10 +102,10 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + ret.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +120,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + ret.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,16 +131,15 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + ret.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") - assert self.output_file_grp features = 
page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: @@ -171,18 +169,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +214,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + suffix = region.id if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - 
region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +251,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 2e4f26f04ec5b2070a0396015d4339493e365fa1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:05:17 +0200 Subject: [PATCH 115/194] binarize: use correct types --- ocrd_cis/ocropy/binarize.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py 
b/ocrd_cis/ocropy/binarize.py index fa47e139..ac499336 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -11,8 +11,7 @@ #import kraken.binarization from ocrd_utils import getLogger -from ocrd_models.ocrd_page import AlternativeImageType -from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage @@ -69,7 +68,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. 
Iterate over the PAGE-XML element hierarchy down to the requested From 21be94106ac55d001cb5729f21138fb9c7715bcb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:04 +0200 Subject: [PATCH 116/194] clip: use final v3 API --- ocrd_cis/ocropy/clip.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 400e9b54..d0119544 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,19 +8,17 @@ from shapely.geometry import Polygon from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, polygon_from_points, bbox_from_polygon, image_from_polygon, polygon_mask, crop_image, - MIMETYPE_PAGE ) from .ocrolib import midrange, morph @@ -38,7 +36,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. Open and deserialize PAGE input file and its respective image, @@ -85,7 +83,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page, page_id, feature_selector='binarized') # The zoom is not used anywhere zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) @@ -141,9 +139,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_region_file_id = f"{output_file_id}_{region.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_xywh, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id)) continue # level == 'line': lines = region.get_TextLine() @@ -172,14 +170,14 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, page_id, segment_line_file_id)) + region_image, region_coords, region_bin, page_id)) return ret def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id) -> Tuple[Image.Image, str, str]: + page_id) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -217,8 +215,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - segment_image_id = file_id + '.IMG-CLIP' - segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) - return segment_image, segment_image_id, 
segment_image_path + alternative_image = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) From 9539ac9620776e335bbe107e57e92742027f02b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:51 +0200 Subject: [PATCH 117/194] clip: use correct types --- ocrd_cis/ocropy/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d0119544..3ddd6a70 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -from typing import Tuple +from typing import Optional from os.path import join import numpy as np From 734b5eb4ef9bfee2e24d8053966b17eaf6e9e1f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:14:56 +0200 Subject: [PATCH 118/194] recognize: use final v3 API --- ocrd_cis/ocropy/recognize.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index bbb8e415..7e4f2957 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -11,18 +11,16 @@ from ocrd_utils import ( getLogger, - make_file_id, coordinates_for_segment, polygon_from_bbox, points_from_polygon, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, OcrdPage, + TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -116,7 +114,7 @@ def get_model(self): f"Could not find model {p_model}. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Recognize lines / words / glyphs of a page. Open and deserialize the PAGE input file and its respective image, @@ -156,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, max_level, page_image, page_xywh) - return [pcgts] + return OcrdPageResult(pcgts) def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 From 039e052f0a4226341ce1bf3070de53495b2a550f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:18:53 +0200 Subject: [PATCH 119/194] test_lib.bash: update GT Github URL --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..801be01a 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 28ad585c94f9895b3f5011a72aabf36b73d71a8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:20:58 +0200 Subject: [PATCH 120/194] recognize: fix typing import --- ocrd_cis/ocropy/recognize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/recognize.py 
b/ocrd_cis/ocropy/recognize.py index 7e4f2957..97fcc64d 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,8 @@ from __future__ import absolute_import + from logging import Logger from sys import exit -from typing import Any +from typing import Any, Optional from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np From 9a7c10ab71f7df3783f44848536aa99dd9c8e483 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:31:27 +0200 Subject: [PATCH 121/194] denoise: adapt to final v3 API --- ocrd_cis/ocropy/denoise.py | 122 +++++++++++++++---------------------- 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cc622c24..0f368fd5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,17 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .common import ( # binarize, @@ -27,10 +25,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``level-of-operation``. 
@@ -49,73 +47,51 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized' if level == 'page' else '') - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - if level == 'page': - self.process_segment(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - feature_selector='binarized' if level == 'region' else '') - if level == 'region': - self.process_segment(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') - self.process_segment(line, line_image, line_xywh, zoom, - input_file.pageId, - file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - 
file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized' if level == 'page' else '') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + if level == 'page': + image = self.process_segment(page, page_image, page_xywh, zoom) + if image: + result.images.append(image) + else: + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + feature_selector='binarized' if level == 'region' else '') + if level == 'region': + image = self.process_segment(region, region_image, region_xywh, zoom) + if image: + result.images.append(image) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, + feature_selector='binarized') + image = self.process_segment(line, line_image, line_xywh, zoom) + if image: + result.images.append(image) + + def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping '%s' with zero size", file_id) - return + return None self.logger.info("About to despeckle '%s'", 
file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt - # update METS (add the image file): - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_xywh['features'] + ',despeckled')) + alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + segment.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) From 7c9f39fa4516401fe17e24d3ca67799c5b85d308 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:40:41 +0200 Subject: [PATCH 122/194] deskew: adapt to final v3 API --- ocrd_cis/ocropy/deskew.py | 116 +++++++++++++++----------------------- 1 file changed, 47 insertions(+), 69 deletions(-) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 616864e1..fae0c90c 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,24 +1,21 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( PageType, - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . 
import common from .common import pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) @@ -34,10 +31,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextRegion level. Next, for each file, crop each region image according to the layout @@ -53,62 +50,45 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if level == 'page' else '') + if level == 'page': + image = self._process_segment(page, page_image, page_coords, "page '%s'" % page_id, page_id) + if image: + result.images.append(image) + return result + if level == 'table': + regions = 
page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + # process region: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, # image must not have been rotated already, # (we will overwrite @orientation anyway,) # abort if no such image can be produced: - feature_filter='deskewed' if level == 'page' else '') - if level == 'page': - self._process_segment(page, page_image, page_coords, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - # process region: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): + feature_filter='deskewed') + image = self._process_segment(region, region_image, region_coords, + "region 
'%s'" % region.id, page_id) + if image: + result.images.append(image) + return result + + def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) - return + return None angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied @@ -123,20 +103,18 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p segment_image, segment_coords, _ = self.workspace.image_from_page( segment, page_id, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( segment, segment_image, segment_coords, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the # workflow had deskewing segment_coords['features'] += ',deskewed' - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_coords['features'])) + alternative = AlternativeImageType(comments=segment_coords['features']) + segment.add_AlternativeImage(alternative) + return OcrdPageResultImage(segment_image, suffix, alternative) From 669866857395544ed10c0fbda5ea03abd1b31f14 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:52:55 +0200 Subject: [PATCH 123/194] dewarp: adapt to final v3 API --- ocrd_cis/ocropy/dewarp.py | 129 
+++++++++++++++----------------------- 1 file changed, 50 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 72efca45..a063a05e 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,24 +1,22 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np -from ocrd_utils import ( - getLogger, - make_file_id, -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -80,10 +78,10 @@ def setup(self): # and extra params) 0.3)) - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextLine level. Next, get each line image according to the layout annotation (from @@ -99,71 +97,44 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - - lines = region.get_TextLine() - if not lines: - self.logger.warning('Region %s contains no text lines', region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) - try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) - except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) - continue - except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) - # as a fallback, simply pad the image vertically - # (just as dewarping would do on average, so at least - # this line has similar margins as the others): - dew_image = padvert(line_image, self.parameter['range']) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - dew_image, - file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - self.output_file_grp, - page_id=input_file.pageId) - # update PAGE (reference 
the image file): - alternative_image = line.get_AlternativeImage() - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_xywh['features'] + ',dewarped')) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh) + + lines = region.get_TextLine() + if not lines: + self.logger.warning('Region %s contains no text lines', region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh) + + self.logger.info("About to dewarp page '%s' region '%s' line '%s'", + page_id, region.id, line.id) + try: + dew_image = dewarp(line_image, self.lnorm, check=True, + max_neighbour=self.parameter['max_neighbour'], + zoom=zoom) + except InvalidLine as err: + self.logger.error('cannot dewarp line "%s": %s', line.id, err) + continue + except InadequateLine as err: + self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + # as a fallback, simply pad the image vertically + # (just as dewarping would do on average, so at least + # this line has similar margins as the others): + 
dew_image = padvert(line_image, self.parameter['range']) + # update PAGE (reference the image file): + alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) From 48a3146a4e510b14899aafc80c7f9f05da05fc48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 03:07:40 +0200 Subject: [PATCH 124/194] resegment: adapt to final v3 API --- ocrd_cis/ocropy/resegment.py | 109 +++++++++++++++-------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e9f8c7f..05f17d4f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,24 +1,25 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import BaselineType, PageType, to_xml -from ocrd import Processor from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, coordinates_for_segment, points_from_polygon, polygon_from_points, transform_coordinates, - MIMETYPE_PAGE ) +from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage +from ocrd import Processor +from ocrd.processor import OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -52,10 +53,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
- Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Next, get the page image according to the layout annotation (from @@ -104,67 +105,47 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). level = self.parameter['level-of-operation'] + pcgts = input_pcgts[0] + page = pcgts.get_Page() - for n, input_file in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - elif level == 'page': - lines = [line for region in regions - for line in region.get_TextLine()] + ignore = (page.get_ImageRegion() + + 
page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + elif level == 'page': + lines = [line for region in regions + for line in region.get_TextLine()] + if lines: + self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + else: + self.logger.warning('Page "%s" contains no text regions with lines', page_id) + else: + for region in regions: + lines = region.get_TextLine() if lines: - self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) - else: - for region in regions: - lines = region.get_TextLine() - if lines: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) - else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: 
%s', - file_id, self.output_file_grp, out.local_filename) - + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + return OcrdPageResult(pcgts) + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] From 0dd6fbac1a63965d241203cdc1dda85ca1fa4728 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:04:23 +0200 Subject: [PATCH 125/194] ocropy_segment: implement process_page_pcgts --- ocrd_cis/ocropy/segment.py | 314 +++++++++++++++++++++++++++---------- 1 file changed, 229 insertions(+), 85 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 57368fe8..d2a7a727 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from logging import Logger from os.path import join +from typing import Optional import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -16,6 +17,7 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( to_xml, CoordsType, + OcrdPage, TextLineType, TextRegionType, SeparatorRegionType, @@ -35,6 +37,7 @@ ReadingOrderType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from ocrd_utils import ( getLogger, make_file_id, @@ -252,6 +255,168 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + overwrite_lines = self.parameter['overwrite_lines'] + overwrite_regions = self.parameter['overwrite_regions'] + overwrite_separators = self.parameter['overwrite_separators'] + overwrite_order = self.parameter['overwrite_order'] + oplevel = self.parameter['level-of-operation'] + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + + # TODO: also allow 
grayscale_normalized (try/except?) + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # aggregate existing regions so their foreground can be ignored + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if oplevel == 'page' and overwrite_separators: + page.set_SeparatorRegion([]) + else: + ignore.extend(page.get_SeparatorRegion()) + # prepare reading order + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if rogroup: + page_get_reading_order(reading_order, rogroup) + # get segments to process / overwrite + if oplevel == 'page': + ignore.extend(page.get_TableRegion()) + regions = list(page.get_TextRegion()) + if regions: + # page is already region-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + # we could remove all other region types as well, + # but this is more flexible (for workflows with + # specialized separator/image/table detectors): + page.set_TextRegion([]) + page.set_ReadingOrder(None) + ro = None + else: + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + ignore.extend(regions) + # create reading order if necessary + if not ro or overwrite_order: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if not rogroup: + # new top-level group + rogroup = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(rogroup) + # go get TextRegions with TextLines (and SeparatorRegions): + 
self._process_element( + page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) + if (not rogroup.get_RegionRefIndexed() and + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup + ro.set_OrderedGroup(None) + elif oplevel == 'table': + ignore.extend(page.get_TextRegion()) + regions = list(page.get_TableRegion()) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no table regions') + for region in regions: + subregions = region.get_TextRegion() + if subregions: + # table is already cell-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + region.set_TextRegion([]) + roelem = reading_order.get(region.id) + # replace by empty group with same index and ref + # (which can then take the cells as subregions) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) + else: + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + continue + # TODO: also allow grayscale_normalized (try/except?) 
+ region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # ignore everything but the current table region + subignore = regions + ignore + subignore.remove(region) + # create reading order group if necessary + roelem = reading_order.get(region.id) + if not roelem: + self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " + f"order (no target to add cells to)") + elif overwrite_order: + # replace by empty ordered group with same (index and) ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " + f"group (cells will be appended)") + elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " + f"group (cells will not be appended)") + roelem = None + else: + # replace regionRef(Indexed) by group with same index and ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + # go get TextRegions with TextLines (and SeparatorRegions) + self._process_element( + region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, + page_id, zoom, rogroup=roelem) + else: # 'region' + regions = list(page.get_TextRegion()) + # besides top-level text regions, line-segment any table cells, + # and for tables without any cells, add a pseudo-cell + for region in page.get_TableRegion(): + subregions = region.get_TextRegion() + if subregions: + regions.extend(subregions) + else: + subregion = TextRegionType( + id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from 
parser: + parent_object_=region) + region.add_TextRegion(subregion) + regions.append(subregion) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + if region.get_TextLine(): + if overwrite_lines: + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + region.set_TextLine([]) + else: + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + ignore.extend(region.get_TextLine()) + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # if the region images have already been clipped against their neighbours specifically, + # then we don't need to suppress all neighbours' foreground generally here + if 'clipped' in region_coords['features'].split(','): + ignore = [] + # go get TextLines + self._process_element( + region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) + return OcrdPageResult(pcgts) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. @@ -335,7 +500,7 @@ def process(self): self.add_metadata(pcgts) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() - + # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') @@ -521,15 +686,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. 
""" if not image.width or not image.height: - self.logger.warning("Skipping '%s' with zero size", element_id) + self.logger.warning(f"Skipping '{element_id}' with zero size") return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug('masking foreground of %s "%s" for "%s"', - type(segment).__name__[:-4], segment.id, element_id) + self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -540,13 +704,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does # not need to concern herself with this. 
+ sp_row = segment_polygon[:, 1] + sp_column = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - sep_bin.shape)] = True - ignore_labels[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - ignore_labels.shape)] = i+1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -562,7 +724,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -570,9 +732,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): - element_bin, seps=(sep_bin+ignore_labels)>0, + element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread']/zoom*300/72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -580,16 +742,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error('Cannot line-segment region "%s": %s', element_id, err) + 
self.logger.error(f'Cannot line-segment region "{element_id}": {err}') # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') return - - self.logger.info('Found %d text lines for %s "%s"', - len(np.unique(line_labels)) - 1, - element_name, element_id) + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -598,31 +757,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # i.e. identical line and region labels # to detect their reading order among the others # (these cannot be split or grouped together with other regions) - line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) + line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info('Found %d text regions for %s "%s"', - len(np.unique(region_labels)) - 1, - element_name, element_id) + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') except Exception as err: - 
self.logger.error('Cannot region-segment %s "%s": %s', - element_name, element_id, err) + self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): index = 0 - # start counting from largest existing index + # start counting from the largest existing index for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): @@ -634,7 +790,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -643,13 +799,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - "region label %d has both existing regions and new lines (%s)" % ( - region_label, str(region_line_labels0)) + (f"Region label {region_label} has both existing regions and new lines " + f"({str(region_line_labels0)})") region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug('Region label %d is for ignored region "%s"', - region_label, region.id) + self.logger.debug(f'Region label {region_label} is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -657,18 +812,18 @@ def _process_element(self, element, ignore, 
image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - '%s "%s"' % (element_name, element_id), - min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -677,34 +832,31 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning(f'Ignoring extant region contour for region label {region_label}') continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - self.logger.debug('Region label %d becomes ID 
"%s"', region_label, region_id) - region = TextRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon))) + self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') + region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for region label %d line label %d', - region_label, line_label) + self.logger.warning( + f'Ignoring extant line contour for region label {region_label} line label {line_label}') continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: line_baseline = coordinates_for_segment(line_baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) @@ -712,95 +864,87 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info('Added region "%s" with %d lines for %s "%s"', - region_id, line_no, element_name, element_id) + self.logger.info( + f'Added region "{region_id}" 
with {line_no} lines for {element_name} "{element_id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + image_polygons, _ = masks2polygons( + self.logger, images, None, element_bin, f'{element_name} "{element_id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning(f'Ignoring extant region contour for image label {image_label}') continue region_no += 1 # annotate result: region_id = element_id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, + 
name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning(f'Ignoring extant region contour for separator {sep_label}') continue # annotate result: region_no += 1 region_id = element_id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, 
baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for line label %d', - line_label) + self.logger.warning(f'Ignoring extant line contour for line label {line_label}') continue # annotate result: line_no += 1 line_id = element_id + "_line%04d" % line_no - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) # update PAGE (reference the image file): element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) From ad5ac7c4ab7f2b52bf313563456feca0094761ce Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:06:01 +0200 Subject: [PATCH 126/194] ocropy_segment: remove process --- ocrd_cis/ocropy/segment.py | 317 ++++++++----------------------------- 1 file changed, 67 insertions(+), 250 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py 
b/ocrd_cis/ocropy/segment.py index d2a7a727..94b6ab1f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -256,6 +256,73 @@ def setup(self): self.logger = getLogger('processor.OcropySegment') def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """Segment pages into regions+lines, tables into cells+lines, or regions into lines. + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested level. + + Depending on ``level-of-operation``, consider existing segments: + - If ``overwrite_separators=True`` on ``page`` level, then + delete any SeparatorRegions. + - If ``overwrite_regions=True`` on ``page`` level, then + delete any top-level TextRegions (along with ReadingOrder). + - If ``overwrite_regions=True`` on ``table`` level, then + delete any TextRegions in TableRegions (along with their OrderGroup). + - If ``overwrite_lines=True`` on ``region`` level, then + delete any TextLines in TextRegions. + - If ``overwrite_order=True`` on ``page`` or ``table`` level, then + delete the reading order OrderedGroup entry corresponding + to the (page/table) segment. + + Next, get each element image according to the layout annotation (from + the alternative image of the page/region, or by cropping via coordinates + into the higher-level image) in binarized form, and represent it as an array + with non-text regions and (remaining) text neighbours suppressed. + + Then compute a text line segmentation for that array (as a label mask). + When ``level-of-operation`` is ``page`` or ``table``, this also entails + detecting + - up to ``maximages`` large foreground images, + - up to ``maxseps`` foreground line separators and + - up to ``maxcolseps`` background column separators + before text line segmentation itself, as well as aggregating text lines + to text regions afterwards. 
+ + Text regions are detected via a hybrid variant recursive X-Y cut algorithm + (RXYC): RXYC partitions the binarized image in top-down manner by detecting + horizontal or vertical gaps. This implementation uses the bottom-up text line + segmentation to guide the search, and also uses both pre-existing and newly + detected separators to alternatively partition the respective boxes into + non-rectangular parts. + + During line segmentation, suppress the foreground of all previously annotated + regions (of any kind) and lines, except if just removed due to ``overwrite``. + During region aggregation however, combine the existing separators with the + new-found separators to guide the column search. + + All detected segments (both text line and text region) are sorted according + to their reading order (assuming a top-to-bottom, left-to-right ordering). + When ``level-of-operation`` is ``page``, prefer vertical (column-first) + succession of regions. When it is ``table``, prefer horizontal (row-first) + succession of cells. + + Then for each resulting segment label, convert its background mask into + polygon outlines by finding the outer contours consistent with the element's + polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: + - If ``level-of-operation`` is ``region``, then append the new lines to the + parent region. + - If it is ``table``, then append the new lines to their respective regions, + and append the new regions to the parent table. + (Also, create an OrderedGroup for it as the parent's RegionRef.) + - If it is ``page``, then append the new lines to their respective regions, + and append the new regions to the page. + (Also, create an OrderedGroup for it in the ReadingOrder.) + + Produce a new output file by serialising the resulting hierarchy. 
+ """ + # FIXME: allow passing a-priori info on reading order / textline order + # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture + # of different scripts; also, vertical writing needs internal rotation + # because our line segmentation only works for horizontal writing) overwrite_lines = self.parameter['overwrite_lines'] overwrite_regions = self.parameter['overwrite_regions'] overwrite_separators = self.parameter['overwrite_separators'] @@ -417,256 +484,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) return OcrdPageResult(pcgts) - def process(self): - """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested level. - - Depending on ``level-of-operation``, consider existing segments: - - If ``overwrite_separators=True`` on ``page`` level, then - delete any SeparatorRegions. - - If ``overwrite_regions=True`` on ``page`` level, then - delete any top-level TextRegions (along with ReadingOrder). - - If ``overwrite_regions=True`` on ``table`` level, then - delete any TextRegions in TableRegions (along with their OrderGroup). - - If ``overwrite_lines=True`` on ``region`` level, then - delete any TextLines in TextRegions. - - If ``overwrite_order=True`` on ``page`` or ``table`` level, then - delete the reading order OrderedGroup entry corresponding - to the (page/table) segment. - - Next, get each element image according to the layout annotation (from - the alternative image of the page/region, or by cropping via coordinates - into the higher-level image) in binarized form, and represent it as an array - with non-text regions and (remaining) text neighbours suppressed. - - Then compute a text line segmentation for that array (as a label mask). 
- When ``level-of-operation`` is ``page`` or ``table``, this also entails - detecting - - up to ``maximages`` large foreground images, - - up to ``maxseps`` foreground line separators and - - up to ``maxcolseps`` background column separators - before text line segmentation itself, as well as aggregating text lines - to text regions afterwards. - - Text regions are detected via a hybrid variant recursive X-Y cut algorithm - (RXYC): RXYC partitions the binarized image in top-down manner by detecting - horizontal or vertical gaps. This implementation uses the bottom-up text line - segmentation to guide the search, and also uses both pre-existing and newly - detected separators to alternatively partition the respective boxes into - non-rectangular parts. - - During line segmentation, suppress the foreground of all previously annotated - regions (of any kind) and lines, except if just removed due to ``overwrite``. - During region aggregation however, combine the existing separators with the - new-found separators to guide the column search. - - All detected segments (both text line and text region) are sorted according - to their reading order (assuming a top-to-bottom, left-to-right ordering). - When ``level-of-operation`` is ``page``, prefer vertical (column-first) - succession of regions. When it is ``table``, prefer horizontal (row-first) - succession of cells. - - Then for each resulting segment label, convert its background mask into - polygon outlines by finding the outer contours consistent with the element's - polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: - - If ``level-of-operation`` is ``region``, then append the new lines to the - parent region. - - If it is ``table``, then append the new lines to their respective regions, - and append the new regions to the parent table. - (Also, create an OrderedGroup for it as the parent's RegionRef.) 
- - If it is ``page``, then append the new lines to their respective regions, - and append the new regions to the page. - (Also, create an OrderedGroup for it in the ReadingOrder.) - - Produce a new output file by serialising the resulting hierarchy. - """ - # FIXME: allow passing a-priori info on reading order / textline order - # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture - # of different scripts; also, vertical writing needs internal rotation - # because our line segmentation only works for horizontal writing) - overwrite_lines = self.parameter['overwrite_lines'] - overwrite_regions = self.parameter['overwrite_regions'] - overwrite_separators = self.parameter['overwrite_separators'] - overwrite_order = self.parameter['overwrite_order'] - oplevel = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - # TODO: also allow grayscale_normalized (try/except?) 
- page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - # aggregate existing regions so their foreground can be ignored - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if oplevel == 'page' and overwrite_separators: - page.set_SeparatorRegion([]) - else: - ignore.extend(page.get_SeparatorRegion()) - # prepare reading order - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if rogroup: - page_get_reading_order(reading_order, rogroup) - - # get segments to process / overwrite - if oplevel == 'page': - ignore.extend(page.get_TableRegion()) - regions = list(page.get_TextRegion()) - if regions: - # page is already region-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) - # we could remove all other region types as well, - # but this is more flexible (for workflows with - # specialized separator/image/table detectors): - page.set_TextRegion([]) - page.set_ReadingOrder(None) - ro = None - else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) - ignore.extend(regions) - # create reading order if necessary - if not ro or overwrite_order: - ro = ReadingOrderType() - page.set_ReadingOrder(ro) - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if not rogroup: - # new top-level group - rogroup = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element(page, ignore, 
page_image, page_coords, - page_id, file_id, - input_file.pageId, zoom, rogroup=rogroup) - if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup - ro.set_OrderedGroup(None) - elif oplevel == 'table': - ignore.extend(page.get_TextRegion()) - regions = list(page.get_TableRegion()) - if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) - for region in regions: - subregions = region.get_TextRegion() - if subregions: - # table is already cell-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) - region.set_TextRegion([]) - roelem = reading_order.get(region.id) - # replace by empty group with same index and ref - # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) - else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) - continue - # TODO: also allow grayscale_normalized (try/except?) 
- region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # ignore everything but the current table region - subignore = regions + ignore - subignore.remove(region) - # create reading order group if necessary - roelem = reading_order.get(region.id) - if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") - elif overwrite_order: - # replace by empty ordered group with same (index and) ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") - elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") - roelem = None - else: - # replace regionRef(Indexed) by group with same index and ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element(region, subignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom, rogroup=roelem) - else: # 'region' - regions = list(page.get_TextRegion()) - # besides top-level text regions, line-segment any table cells, - # and for tables without any cells, add a pseudo-cell - for region in page.get_TableRegion(): - subregions = region.get_TextRegion() - if subregions: - regions.extend(subregions) - else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # 
as if generated from parser: - parent_object_=region) - region.add_TextRegion(subregion) - regions.append(subregion) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - if region.get_TextLine(): - if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) - region.set_TextLine([]) - else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) - ignore.extend(region.get_TextLine()) - # TODO: also allow grayscale_normalized (try/except?) - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # if the region images have already been clipped against their neighbours specifically, - # then we don't need to suppress all neighbours' foreground generally here - if 'clipped' in region_coords['features'].split(','): - ignore = [] - # go get TextLines - self._process_element(region, ignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): """Add PAGE layout elements by segmenting an image. 
From 5d4007be9ec0e352520995302bd8b11e92e51aae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:41:01 +0200 Subject: [PATCH 127/194] segment: adapt to final v3 API --- ocrd_cis/ocropy/segment.py | 252 +++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 119 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 94b6ab1f..bdeb40dd 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,8 +1,10 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from typing import Optional import itertools + import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -14,15 +16,21 @@ from shapely.validation import explain_validity from shapely import set_precision -from ocrd_modelfactory import page_from_file +from ocrd_utils import ( + getLogger, + coordinates_of_segment, + coordinates_for_segment, + points_from_polygon, + polygon_from_points, +) from ocrd_models.ocrd_page import ( - to_xml, CoordsType, - OcrdPage, + CoordsType, TextLineType, TextRegionType, SeparatorRegionType, PageType, - AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import ( BaselineType, @@ -37,16 +45,7 @@ ReadingOrderType ) from ocrd import Processor -from ocrd.processor import OcrdPageResult -from ocrd_utils import ( - getLogger, - make_file_id, - coordinates_of_segment, - coordinates_for_segment, - points_from_polygon, - polygon_from_points, - MIMETYPE_PAGE -) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import midrange from .ocrolib import morph @@ -255,11 +254,12 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + 
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - Open and deserialise PAGE input files and their respective images, + + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -272,12 +272,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -286,25 +286,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. 
- + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -316,7 +316,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -330,6 +330,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional oplevel = self.parameter['level-of-operation'] pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) page = pcgts.get_Page() # TODO: also allow grayscale_normalized (try/except?) 
@@ -361,14 +362,15 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() if rogroup: page_get_reading_order(reading_order, rogroup) - # get segments to process / overwrite + + # get segments to process / overwrite if oplevel == 'page': ignore.extend(page.get_TableRegion()) regions = list(page.get_TextRegion()) if regions: # page is already region-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -376,7 +378,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page.set_ReadingOrder(None) ro = None else: - self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -387,32 +389,36 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # new top-level group rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element( - page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) - elif oplevel == 'table': + # go get TextRegions with TextLines (and SeparatorRegions): + image = self._process_element(page, ignore, page_image, 
page_coords, + zoom=zoom, rogroup=rogroup) + if image: + result.images.append(image) + return result + + if oplevel == 'table': ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning(f'Page "{page_id}" contains no table regions') + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -423,19 +429,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " - f"order (no target to add cells to)") + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " - f"group (cells will be appended)") + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", + page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " - f"group (cells will not be appended)") + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", + page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -443,10 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element( - region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, - page_id, zoom, rogroup=roelem) - else: # 'region' + image = self._process_element(region, 
subignore, region_image, region_coords, + zoom=zoom, rogroup=roelem) + if image: + result.images.append(image) + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -455,11 +462,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType( - id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType(id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from parser: + parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -467,10 +473,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -480,11 +486,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if 'clipped' in region_coords['features'].split(','): ignore = [] # go get TextLines - self._process_element( - region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) - return OcrdPageResult(pcgts) + image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) + if image: + result.images.append(image) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): + return result + + def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: """Add PAGE layout elements by segmenting an image. Given a PageType, TableRegionType or TextRegionType ``element``, and @@ -503,14 +511,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. """ if not image.width or not image.height: - self.logger.warning(f"Skipping '{element_id}' with zero size") - return + self.logger.warning(f"Skipping '{element.id}' with zero size") + return None element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') + self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' + f'"{segment.id}" for "{element.id}"') # mark these segments (e.g. 
separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -522,14 +531,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # then this will silently ignore them. The caller does # not need to concern herself with this. sp_row = segment_polygon[:, 1] - sp_column = segment_polygon[:, 0] + sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True report = check_page(element_bin, zoom) + suffix = '.IMG-CLIP' elif isinstance(element, TableRegionType) or ( # sole/congruent text region of a table region? element.id.endswith('_text') and @@ -537,11 +547,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'table' fullpage = True report = check_region(element_bin, zoom) + suffix = element.id + '.IMG-CLIP' else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') + suffix = element.id + '.IMG-CLIP' + self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -551,7 +563,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), 
# in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -559,13 +571,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error(f'Cannot line-segment region "{element_id}": {err}') + self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') - return - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') + self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') + return None + + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' + f'for {element_name} "{element.id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -580,17 +594,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') + self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' + f'for {element_name} 
"{element.id}"') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') + self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -607,7 +622,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -616,12 +631,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f"Region label {region_label} has both existing regions and new lines " - f"({str(region_line_labels0)})") + (f'region label "{region_label}" has both existing regions and new lines ' + f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug(f'Region label {region_label} is for ignored region "{region.id}"') + self.logger.debug(f'Region label "{region_label}" is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -629,18 +644,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - 
region_line_labels = hmerge_line_seeds( - element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, + seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons( - self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element.id}"', + min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons( - self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element_id}"', min_area=640 / zoom / zoom) + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -653,13 +668,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_region%04d" % region_no + region_id = element.id + "_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) 
coordinates: @@ -681,16 +696,16 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info( - f'Added region "{region_id}" with {line_no} lines for {element_name} "{element_id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines ' + f'for {element_name} "{element.id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons( - self.logger, images, None, element_bin, f'{element_name} "{element_id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + name=f'{element_name} "{element.id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -700,15 +715,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue region_no += 1 # annotate result: - region_id = element_id + "_image%04d" % region_no + region_id = element.id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons( - self.logger, seplines, 
None, element_bin, - name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, + name=f'{element_name} "{element.id}"', + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -718,27 +733,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_sep%04d" % region_no + region_id = element.id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], + region_polygon[:, 0], + region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons( - self.logger, line_labels, baselines, element_bin, - name=f'region 
"{element_id}"', min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -749,22 +765,20 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: line_no += 1 - line_id = element_id + "_line%04d" % line_no + line_id = element.id + "_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - # update PAGE (reference the image file): - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. 
From 83ba2f01e0cb210fa7777c7fc4f9ddc3233be633 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:54:14 +0200 Subject: [PATCH 128/194] CI: try testing in parallel --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 35f0a966..d5e18b9f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,7 @@ jobs: - checkout - run: apt-get update && apt-get -y install default-jre-headless - run: make install - - run: make test V="" + - run: make -j test V="" deploy-docker: docker: From a2100c29c0f4f85803e0faa2dde9bdf84299c589 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 12:55:16 +0200 Subject: [PATCH 129/194] Updated config.yml --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index d5e18b9f..5825a4e0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,6 +3,7 @@ jobs: test-python3: docker: - image: ocrd/core + resource_class: large environment: PIP: pip3 PYTHON: python3 From df1c35cbe1325a8da5dabd2c9227a7246439fd15 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:42:57 +0200 Subject: [PATCH 130/194] train: adapt to final v3 API --- ocrd_cis/ocropy/train.py | 129 +++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 08b68693..5c57b2cf 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,12 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from sys import exit from os import getcwd, makedirs, remove from os.path import abspath, dirname, exists, join, isfile import tempfile -from ocrd_modelfactory import 
page_from_file -from ocrd import Processor +from ocrd_models import OcrdPage +from ocrd import Processor, Workspace +from ocrd.processor import OcrdPageResult from ocrd_utils import getLogger from .ocropus_rtrain import * @@ -37,80 +40,79 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') - self.old_cwd = getcwd() - #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] try: - modelpath = self.resolve_resource(model) + self.modelpath = self.resolve_resource(model) except SystemExit: ocropydir = dirname(abspath(__file__)) - modelpath = join(ocropydir, 'models', model) - self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.modelpath = join(ocropydir, 'models', model) + self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") if not isfile(modelpath): - self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", - model, model) + self.logger.critical(f"Could not find model '{model}'.\n" + f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) - outputpath = join(self.old_cwd, 'output', model) - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, model) + self.outputpath = join(self.parameter.get('outputpath', 'output'), model) else: - modelpath = None - outputpath = join(self.old_cwd, 'output', 'lstm') - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, 'lstm') - makedirs(dirname(outputpath)) - self.modelpath = modelpath - self.outputpath = outputpath - - def process(self): + self.modelpath = None + self.outputpath = join(self.parameter.get('outputpath', 'output'), 'lstm') + makedirs(dirname(self.outputpath)) + self.filelist = None + + def process_workspace(self, workspace: Workspace) -> None: """ Trains a new model on the text lines from the input fileGrp, - extracted as temporary image-text file pairs. 
+ extracted as image-text file pairs into the output fileGrp. + (If the output fileGrp already exists and these files should + be re-used, pass the `--overwrite` option when processing.) + + The model is written into `outputpath` (or just `output`) under + the same name as `model` (i.e. the start model, or just `lstm`). + """ + self.filelist = [] + super().process_workspace(workspace) + self.logger.info(f"Training {self.outputpath} from {self.modelpath or 'scratch'} " + f"on {len(self.filelist)} file pairs") + rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) + # deletefiles(self.filelist) + + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + """ + Extracts pairs of plaintext and cropped image files for each text line + in the PAGE file (to be used during training). """ - filelist = [] - filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') + pcgts = input_pcgts[0] #self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - #self.logger.info("INPUT FILE %i / %s", n, input_file) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - - self.logger.info("Extracting from page '%s'", page_id) - for region in page.get_AllRegions(classes=['Text']): - textlines = region.get_TextLine() - self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) - for line in textlines: - if self.parameter['textequiv_level'] == 'line': - path = join(filepath, page_id + region.id + line.id) - imgpath = self.extract_segment(path, line, page_image, page_coords) - if imgpath: - filelist.append(imgpath) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + + 
self.logger.debug(f"Extracting from page '{page_id}'") + for region in page.get_AllRegions(classes=['Text']): + textlines = region.get_TextLine() + self.logger.debug(f"Extracting {len(textlines)} lines from region '{region.id}'") + for line in textlines: + if self.parameter['textequiv_level'] == 'line': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}") + self.filelist.append(self.extract_segment(path, line, page_image, page_coords)) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}") + self.filelist.append(self.extract_segment(path, word, page_image, page_coords)) continue - for word in line.get_Word(): - if self.parameter['textequiv_level'] == 'word': - path = join(filepath, page_id + region.id + line.id + word.id) - imgpath = self.extract_segment(path, word, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - continue - for glyph in word.get_Glyph(): - path = join(filepath, page_id + region.id + line.id + glyph.id) - imgpath = self.extract_segment(path, glyph, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - - self.logger.info("Training %s from %s on %i file pairs", - self.outputpath, - self.modelpath or 'scratch', - len(filelist)) - rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) - deletefiles(filelist) + for glyph in word.get_Glyph(): + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}_{glyph.id}") + self.filelist.append(self.extract_segment(path, glyph, page_image, page_coords)) + # FIXME: PAGE-XML not really needed, find a way around this (raising special exception?) 
+ return OcrdPageResult(pcgts) def extract_segment(self, path, segment, page_image, page_coords): - #ground truth + gtpath = path + '.gt.txt' + imgpath = path + '.png' + if exists(gtpath) and exists(imgpath): + self.logger.debug(f"Reusing {segment.__class__.__name__} '{segment.id}' file pair") + return imgpath + gt = segment.TextEquiv if not gt: return None @@ -118,11 +120,10 @@ def extract_segment(self, path, segment, page_image, page_coords): if not gt or not gt.strip(): return None gt = gt.strip() - gtpath = path + '.gt.txt' with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug(f"Extracting {segment.__class__.__name__} '{segment.id}' file pair") image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): @@ -132,8 +133,6 @@ def extract_segment(self, path, segment, page_image, page_coords): # resize image to 48 pixel height image = resize_keep_ratio(image) - #save temp image - imgpath = path + '.png' image.save(imgpath) return imgpath From c08b623f9b0ad9daf4f8dc858b5b416b1212e018 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:51:54 +0200 Subject: [PATCH 131/194] ocrd-tool.json: add v3 cardinalities --- ocrd_cis/ocrd-tool.json | 120 +++++++++++----------------------------- 1 file changed, 31 insertions(+), 89 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index a93917da..c2e20268 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -12,17 +12,9 @@ "preprocessing/optimization/grayscale_normalization", "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-BIN", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Binarize (and optionally deskew/despeckle) 
pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with Ocropy v1", "parameters": { "method": { "type": "string", @@ -75,15 +67,9 @@ "steps": [ "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Deskew regions with ocropy (by annotating orientation angle and adding AlternativeImage)", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Deskew regions with Ocropy v1 (by annotating orientation angle and adding AlternativeImage)", "parameters": { "maxskew": { "type": "number", @@ -106,17 +92,9 @@ "steps": [ "preprocessing/optimization/despeckling" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-DESPECK", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Despeckle pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Despeckle pages / regions / lines with Ocropy v1", "parameters": { "noise_maxsize": { "type": "number", @@ -147,14 +125,8 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Clip text regions / lines at intersections with neighbours", "parameters": { "level-of-operation": { @@ -185,12 +157,8 @@ "steps": [ "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Improve coordinates of text lines", 
"parameters": { "level-of-operation": { @@ -245,12 +213,8 @@ "preprocessing/optimization/dewarping" ], "description": "Dewarp line images with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "dpi": { "type": "number", @@ -286,15 +250,9 @@ "steps": [ "recognition/text-recognition" ], - "description": "Recognize text in (binarized+deskewed+dewarped) lines with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD", - "OCR-D-SEG-GLYPH" - ], - "output_file_grp": [ - "OCR-D-OCR-OCRO" - ], + "description": "Recognize text in (binarized+deskewed+dewarped) lines with Ocropy v1", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "textequiv_level": { "type": "string", @@ -345,14 +303,9 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], - "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with Ocropy v1", "parameters": { "dpi": { "type": "number", @@ -444,11 +397,9 @@ "steps": [ "recognition/text-recognition" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "train model with ground truth from mets data", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "train Ocropy v1 text recognition model with PAGE ground truth from the input fileGrp extracted as file pairs into the output fileGrp", "parameters": { "textequiv_level": { "type": "string", @@ -470,7 +421,8 @@ }, "outputpath": { "type": "string", - "description": "(existing) path for the 
trained model" + "default": "output", + "description": "directory path for the trained model" } } }, @@ -482,15 +434,9 @@ "steps": [ "recognition/post-correction" ], - "input_file_grp": [ - "OCR-D-OCR-1", - "OCR-D-OCR-2", - "OCR-D-OCR-N" - ], - "output_file_grp": [ - "OCR-D-ALIGNED" - ], - "description": "Align multiple OCRs and/or GTs" + "input_file_grp_cardinality": [2, -1], + "output_file_grp_cardinality": 1, + "description": "Align multiple OCRs and/or GTs textually on line/word level" }, "ocrd-cis-postcorrect": { "executable": "ocrd-cis-postcorrect", @@ -501,12 +447,8 @@ "recognition/post-correction" ], "description": "Post correct OCR results", - "input_file_grp": [ - "OCR-D-LINE-ALIGNED" - ], - "output_file_grp": [ - "OCR-D-POST-CORRECTED" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "maxCandidates": { "description": "Maximum number of considered correction candidates per suspicious token", From a18307d4a8f50b0a4b081016c9d9db55cca63023 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:27:09 +0200 Subject: [PATCH 132/194] fix: ocropy train errors --- ocrd_cis/ocropy/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 5c57b2cf..f5d70d6a 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -47,8 +47,8 @@ def setup(self): except SystemExit: ocropydir = dirname(abspath(__file__)) self.modelpath = join(ocropydir, 'models', model) - self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") - if not isfile(modelpath): + self.logger.error(f"Failed to resolve model '{model}' path, trying '{self.modelpath}'") + if not isfile(self.modelpath): self.logger.critical(f"Could not find model '{model}'.\n" f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) @@ -128,7 +128,7 @@ def extract_segment(self, path, segment, page_image, page_coords): if 'binarized' 
not in coords['features'].split(','): # binarize with nlbin - image, _ = binarize(image, maxskew=0) + image, _ = binarize(self.logger, image, maxskew=0) # resize image to 48 pixel height image = resize_keep_ratio(image) From 0ba6839c849688431fa2259da4cd934963724cfb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:39:09 +0200 Subject: [PATCH 133/194] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 6 +----- ocrd_cis/ocropy/clip.py | 14 ++++++-------- ocrd_cis/ocropy/denoise.py | 10 ++-------- ocrd_cis/ocropy/deskew.py | 8 +------- ocrd_cis/ocropy/dewarp.py | 12 +++--------- ocrd_cis/ocropy/recognize.py | 12 ++---------- ocrd_cis/ocropy/resegment.py | 1 - ocrd_cis/ocropy/segment.py | 1 - ocrd_cis/ocropy/train.py | 9 +++++---- 9 files changed, 20 insertions(+), 53 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index ac499336..271f01fa 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,14 +1,10 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join - -from typing import Union, Optional - -#import kraken.binarization from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3ddd6a70..36ee4eb3 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -2,7 +2,6 @@ from logging import Logger from typing import Optional -from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -12,19 +11,18 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( - getLogger, - coordinates_of_segment, - polygon_from_points, bbox_from_polygon, + coordinates_of_segment, + crop_image, + getLogger, 
image_from_polygon, + polygon_from_points, polygon_mask, - crop_image, ) +from .common import array2pil, determine_zoom, pil2array from .ocrolib import midrange, morph -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array) + class OcropyClip(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0f368fd5..72757e0c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,19 +1,13 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, OcrdPage -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from .common import ( - # binarize, - determine_zoom, remove_noise) +from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fae0c90c..9f9f8b0a 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,15 +1,9 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - PageType, - AlternativeImageType, - OcrdPage -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index a063a05e..9902af95 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,18 +1,12 @@ from __future__ import absolute_import - -from typing import Optional from logging import Logger -from os.path import join - +from typing import Optional import numpy as np -from ocrd_utils import 
getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, - OcrdPage -) from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 97fcc64d..41576e43 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,16 +10,8 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import ( - getLogger, - coordinates_for_segment, - polygon_from_bbox, - points_from_polygon, -) -from ocrd_models.ocrd_page import ( - TextEquivType, OcrdPage, - CoordsType, GlyphType, WordType -) +from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType from ocrd import Processor from ocrd.processor import OcrdPageResult diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 05f17d4f..0ef64687 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import numpy as np from skimage import draw, segmentation diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index bdeb40dd..edb5751a 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import itertools import numpy as np diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index f5d70d6a..8f224b86 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -3,9 +3,8 @@ from typing import Optional from logging import Logger from sys import exit -from os import getcwd, makedirs, 
remove +from os import makedirs, remove from os.path import abspath, dirname, exists, join, isfile -import tempfile from ocrd_models import OcrdPage from ocrd import Processor, Workspace @@ -32,7 +31,9 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): logger: Logger + modelpath: str old_cwd: str + outputpath: str @property def executable(self): @@ -75,8 +76,8 @@ def process_workspace(self, workspace: Workspace) -> None: f"on {len(self.filelist)} file pairs") rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) # deletefiles(self.filelist) - - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Extracts pairs of plaintext and cropped image files for each text line in the PAGE file (to be used during training). From 6b06e8856addd3b4963961df6d6cb1fb29e126cf Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:48:42 +0200 Subject: [PATCH 134/194] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 3e87cf8a..e82dbc16 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -98,6 +98,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + result = OcrdPageResult(pcgts) if level == 'page': try: result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) @@ -256,4 +257,4 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> # update PAGE (reference the image file): alt_image = 
AlternativeImageType(comments=features) line.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, suffix, alt_image) \ No newline at end of file + return OcrdPageResultImage(bin_image, suffix, alt_image) From d1a14b704c0d2559685b8f33ddd23d60c65563a7 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:22:42 +0200 Subject: [PATCH 135/194] refactor: python strings v3 --- ocrd_cis/ocropy/binarize.py | 6 +-- ocrd_cis/ocropy/clip.py | 5 +-- ocrd_cis/ocropy/denoise.py | 8 ++-- ocrd_cis/ocropy/deskew.py | 7 ++-- ocrd_cis/ocropy/dewarp.py | 11 +++--- ocrd_cis/ocropy/recognize.py | 6 +-- ocrd_cis/ocropy/resegment.py | 72 +++++++++++++++------------------- ocrd_cis/ocropy/segment.py | 76 ++++++++++++++++++------------------ 8 files changed, 88 insertions(+), 103 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index e82dbc16..782dd578 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -16,7 +16,7 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) + logger.debug(f'Binarizing {pil_image.width}x{pil_image.height} image with method={method}') if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -242,8 +242,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! 
- self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", - -angle) + self.logger.warning( + f"Cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", -angle) bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 36ee4eb3..7f40a214 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -128,15 +128,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - segment_region_file_id = f"{output_file_id}_{region.id}" ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, page_image, page_xywh, page_bin, page_id)) @@ -167,7 +165,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 72757e0c..b3c219fb 100644 --- a/ocrd_cis/ocropy/denoise.py +++ 
b/ocrd_cis/ocropy/denoise.py @@ -57,7 +57,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -69,7 +69,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option continue lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -80,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - self.logger.warning("Skipping '%s' with zero size", file_id) + self.logger.warning(f"Skipping '{segment.id}' with zero size") return None - self.logger.info("About to despeckle '%s'", file_id) + self.logger.info(f"About to despeckle '{segment.id}'") bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update PAGE (reference the image file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 9f9f8b0a..84475d81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -73,8 +73,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # (we will overwrite @orientation anyway,) # abort if no such image can be produced: feature_filter='deskewed') - image = self._process_segment(region, region_image, 
region_coords, - "region '%s'" % region.id, page_id) + image = self._process_segment(region, region_image, region_coords, f"region '{region.id}'", page_id) if image: result.images.append(image) return result @@ -84,14 +83,14 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p self.logger.warning("Skipping %s with zero size", segment_id) return None angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - self.logger.info("About to deskew %s", segment_id) + self.logger.info(f"About to deskew {segment_id}") angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - self.logger.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9902af95..302cf2e0 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -101,29 +101,28 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) lines = region.get_TextLine() if not lines: - self.logger.warning('Region %s contains no text lines', region.id) + 
self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh) - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) + self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp(line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) + self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + self.logger.warning(f'cannot dewarp line "{line.id}": {err}') # as a fallback, simply pad the image vertically # (just as dewarping would do on average, so at least # this line has similar margins as the others): diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 41576e43..f0c4b520 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -179,13 +179,13 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): linegt = line.TextEquiv[0].Unicode else: linegt = '' - self.logger.debug("GT '%s': '%s'", line.id, linegt) + self.logger.debug(f"GT '{line.id}': '{linegt}'") # remove existing annotation below line level: line.set_TextEquiv([]) line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") + self.logger.debug(f"Error: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) @@ -194,7 +194,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'error 
processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {err}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 0ef64687..d429c1de 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -126,14 +126,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions with lines', ) else: for region in regions: lines = region.get_TextLine() @@ -142,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): @@ -163,8 +163,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - self.logger.warning('Invalid %s "%s": %s', tag, - page_id if fullpage else parent.id, report) + self.logger.warning(f'Invalid {tag} "{page_id 
if fullpage else parent.id}": {report}') return # get existing line labels: line_labels = np.zeros_like(parent_bin, bool) @@ -191,8 +190,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - self.logger.debug('unmasking area of text region "%s" for "%s"', - region.id, page_id if fullpage else parent.id) + self.logger.debug(f'Unmasking area of text region "{region.id}" for "{page_id if fullpage else parent.id}"') region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] @@ -201,14 +199,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], - segment.id, page_id if fullpage else parent.id) + self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' + f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -217,7 +215,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 
15/zoom)[components] - self.logger.debug("estimated scale: %d", scale) + self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 if method == 'ccomps': @@ -235,7 +233,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - self.logger.warning("Skipping '%s' without baseline", line.id) + self.logger.warning(f"Skipping '{line.id}' without baseline") new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -254,14 +252,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - self.logger.error('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') return - self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", - new_line_labels.max(), len(lines), tag, parent.id) + self.logger.info( + f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), + new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) @@ -345,31 +342,29 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - self.logger.debug("no 
lines for '%s' match or fit", line.id) + self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - self.logger.debug("new lines for '%s' only cover %.1f%% bg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - self.logger.debug("new lines for '%s' only cover %.1f%% fg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", - line.id, np.count_nonzero(looses), covers * 100) + self.logger.debug( + f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " + f"totalling %.1f%% bg", covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - self.logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, line_count, new_count) + self.logger.debug(f'Black pixels before/after resegment of line "{line.id}": {line_count}/{new_count}') # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) @@ -379,7 +374,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, 
line.parent_object_) if line_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -394,7 +389,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug(f"subtracting new '{line.id}' from overlapping '{otherline.id}'") other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -403,7 +398,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) @@ -434,29 +429,26 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue count = np.count_nonzero(old_label) if not count: - logger.warning("skipping zero-area line '%s'", line.id) + logger.warning(f"skipping zero-area line '{line.id}'") continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug("new line for '%s' only covers %.1f%% bg", - line.id, covers * 100) + logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - logger.warning("skipping binary-empty line '%s'", line.id) + logger.warning(f"skipping binary-empty line '{line.id}'") continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - 
logger.debug("new line for '%s' only covers %.1f%% fg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue - logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, count, covers * count) + logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - logger.warning("no contours for %s - keeping", line.id) + logger.warning(f"no contours for {line.id} - keeping") continue else: # get alpha shape @@ -468,7 +460,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - logger.warning("Ignoring extant line for %s", line.id) + logger.warning(f"Ignoring extant line for {line.id}") continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index edb5751a..e8c4a1ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -75,8 +75,6 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. 
""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -93,8 +91,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - logger.debug('skipping label %d in %s due to empty fg', - label, name) + logger.debug(f'Skipping label {label} in {name} due to empty fg') continue # simplify to convex hull if simplify is not None: @@ -102,8 +99,8 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', - label, str(conflicts)) + logger.debug( + f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') else: bg_mask = hull if open_holes: @@ -131,8 +128,8 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", - label, idx, len(contour), idx_hole, len(hole)) + logger.debug( + f"Label {label} contour {idx} [{len(contour)} pts] has hole {idx_hole} [{len(hole)} pts]") #plot_poly(hole, 'blue') # cut child from outside... 
# first get nearest point on child @@ -173,7 +170,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -182,7 +179,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug(f"Adding label {label} contour {idx} [{len(contour)} pts]") contours.append(contour) idx = hier[0, idx, 0] else: @@ -208,8 +205,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - logger.warning('Label %d contour %d is too small (%d/%d) in %s', - label, i, area, total_area, name) + logger.warning(f'Label {label} contour {i} is too small ({area}/{total_area}) in {name}') continue # simplify shape: # can produce invalid (self-intersecting) polygons: @@ -226,7 +222,7 @@ def getx(xy): logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right @@ -369,7 +365,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option if regions: # page is already region-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) + 
self.logger.info(f'Removing existing TextRegions in page "{page_id}"', ) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -377,7 +373,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.set_ReadingOrder(None) ro = None else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"', ) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -404,20 +400,20 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no table regions') for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -428,19 +424,22 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' is not referenced in reading order " + f"(no target to add cells to)") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an ordered group " + f"(cells will be appended)") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an unordered group " + f"(cells will not be appended)") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -468,14 +467,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region.add_TextRegion(subregion) regions.append(subregion) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region 
"%s"', page_id, region.id) + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') region.set_TextLine([]) else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -517,8 +516,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' - f'"{segment.id}" for "{element.id}"') + self.logger.debug( + f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element.id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -552,7 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non fullpage = False report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' - self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') + self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -577,8 +576,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') # post-process line labels if isinstance(element, 
(PageType, TableRegionType)): # aggregate text lines to text regions @@ -599,8 +597,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' - f'for {element_name} "{element.id}"') + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') except Exception as err: self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) @@ -630,7 +628,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f'region label "{region_label}" has both existing regions and new lines ' + (f'Region label "{region_label}" has both existing regions and new lines ' f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): @@ -907,9 +905,9 @@ def join_baselines(logger: Logger, baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning(f"Ignoring baseline subtype {geom.geom_type} in {loc}") else: - logger.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning(f"Ignoring baseline type {baseline.geom_type} in {loc}") nlines = len(lines) if nlines == 0: return None @@ -971,7 +969,7 @@ def join_baselines(logger: Logger, baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - logger.warning("baseline merge impossible (no spanning tree) in %s", loc) + 
logger.warning(f"Baseline merge impossible (no spanning tree) in {loc}") return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -983,7 +981,7 @@ def join_baselines(logger: Logger, baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - logger.warning("baseline merge is empty in %s", loc) + logger.warning(f"Baseline merge is empty in {loc}") return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) From d8542c20d5e39c1bf8670205a75c039f25198bf8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:43 +0200 Subject: [PATCH 136/194] spacing: train --- ocrd_cis/ocropy/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 8f224b86..6c627231 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -19,8 +19,8 @@ def deletefiles(filelist): for file in filelist: if exists(file): remove(file) - if exists(file[:-3]+'gt.txt'): - remove(file[:-3]+'gt.txt') + if exists(file[:-3] + 'gt.txt'): + remove(file[:-3] + 'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) From d7859714ec6622a0b9294d9dc54d9f3e35f4606c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:54 +0200 Subject: [PATCH 137/194] spacing: segment --- ocrd_cis/ocropy/segment.py | 41 ++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e8c4a1ed..75be2a11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,9 @@ lines2regions ) -def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): + +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, 
open_holes=False, + reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -79,6 +81,7 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= if baselines is not None: def getx(xy): return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) for line in baselines if len(line) >= 2] @@ -96,8 +99,7 @@ def getx(xy): # simplify to convex hull if simplify is not None: hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) - conflicts = np.setdiff1d(hull * simplify, - bg_mask * simplify) + conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): logger.debug( f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') @@ -143,10 +145,10 @@ def getx(xy): contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): - interpol.extend(np.array(contour[i:i+1] + - contour2[i:i+1] * - np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - int)) + interpol.extend(np.array( + contour[i:i + 1] + + contour2[i:i + 1] * + np.linspace(0, 1, ntics)[:, np.newaxis, np.newaxis], int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -159,23 +161,24 @@ def getx(xy): contour_idx2 = contour_idx if contour_idx2 >= len(contour): contour_idx2 = 0 - cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx + 1] if interpol_idx == 0: diff1 = (interpol[-1:] - cispoint1) // 5 else: - diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + diff1 = (interpol[interpol_idx - 1: interpol_idx] - cispoint1) // 5 if interpol_idx + 1 >= len(interpol): diff2 = (interpol[0:1] - cispoint2) // 5 else: - diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + diff2 = 
(interpol[interpol_idx + 1: interpol_idx + 2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) - contour = np.concatenate([contour[:contour_idx], cispoint1, - hole[hole_idx:], hole[:hole_idx], - cispoint2, contour[contour_idx:]]) + contour = np.concatenate( + [contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') @@ -210,7 +213,7 @@ def getx(xy): # simplify shape: # can produce invalid (self-intersecting) polygons: #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - polygon = contour[:, 0, ::] # already ordered x,y + polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: @@ -220,22 +223,22 @@ def getx(xy): if not polygon.is_valid: #LOG.debug(polygon.wkt) logger.warning(explain_validity(polygon)) - poly = polygon.exterior.coords[:-1] # keep open + poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines(logger, [baseline.intersection(polygon) - for baseline in baselines - if baseline.intersects(polygon)], name) + base = join_baselines( + logger, + [baseline.intersection(polygon) for baseline in baselines if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: base = None results.append((label, poly, base)) - result_labels[contour_labels == i+1] = len(results) + result_labels[contour_labels == i + 
1] = len(results) return results, result_labels From 7ca78a97db34559ebf1a8dd819ea08e5415ec8d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:40:08 +0200 Subject: [PATCH 138/194] spacing: resegment --- ocrd_cis/ocropy/resegment.py | 94 +++++++++++++++++------------------- 1 file changed, 43 insertions(+), 51 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d429c1de..48bb0d40 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -144,11 +144,11 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) - + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] - maxdist = self.parameter['spread']/zoom*300/72 # in pt + maxdist = self.parameter['spread'] / zoom * 300 / 72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -172,7 +172,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, line in enumerate(lines): if self.parameter['baseline_only'] and line.Baseline: line_base = baseline_of_segment(line, parent_coords) - line_poly = polygon_from_baseline(line_base, 30/zoom) + line_poly = polygon_from_baseline(line_base, 30 / zoom) else: line_poly = coordinates_of_segment(line, parent_image, parent_coords) line_poly = make_valid(Polygon(line_poly)) @@ -184,9 +184,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does not need # to concern herself with this. 
- line_y, line_x = draw.polygon(polygon[:, 1], - polygon[:, 0], - parent_bin.shape) + line_y, line_x = draw.polygon(polygon[:, 1], polygon[:, 0], parent_bin.shape) line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): @@ -194,17 +192,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - parent_bin.shape)] = False + ignore_bin[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = True + ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin @@ -213,8 +207,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l _, counts = np.unique(components, return_counts=True) if counts.shape[0] > 1: counts = np.sqrt(3 * counts) - scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) - components *= (counts > 15/zoom)[components] + scale = int(np.median(counts[(5 / zoom < counts) & (counts < 100 / zoom)])) + components *= (counts > 15 / zoom)[components] self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 @@ -244,12 +238,12 @@ 
def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale / 2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale / 2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') @@ -257,13 +251,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info( f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', - min_area=640/zoom/zoom) + new_line_polygons, new_line_labels = masks2polygons( + self.logger, new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', + min_area=640 / zoom / zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) - for _, poly, base in new_line_polygons])) or ([], []) + new_line_polygons, new_baselines = list(zip( + *[(Polygon(poly), LineString(base)) for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ 
-281,12 +275,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l inter = make_intersection(line_poly.context, new_line_poly) if not inter: continue - new_line_mask = (new_line_labels == i+1) & parent_bin + new_line_mask = (new_line_labels == i + 1) & parent_bin line_mask = line_labels[j] & parent_bin inter_mask = new_line_mask & line_mask if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): continue intersections[(i, j)] = inter fits_bg[i, j] = inter.area / new_line_poly.area @@ -344,17 +338,17 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if not np.prod(new_lines.shape): self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue - covers = np.sum(covers_bg[new_lines,j]) + covers = np.sum(covers_bg[new_lines, j]) if covers < threshold / 3: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue - covers = np.sum(covers_fg[new_lines,j]) + covers = np.sum(covers_fg[new_lines, j]) if covers < threshold: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue - looses = (assignments < 0) & (covers_bg[:,j] > 0.1) + looses = (assignments < 0) & (covers_bg[:, j] > 0.1) if looses.any(): - covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) + covers = np.sum(covers_bg[np.nonzero(looses)[0], j]) self.logger.debug( f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " f"totalling %.1f%% bg", covers * 100) @@ -365,13 +359,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") - new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] - for i 
in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + # intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] for i in new_lines], loc=line.id, scale=scale) + new_baseline = join_baselines( + self.logger, [new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: - line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], - parent_image, parent_coords) + line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") @@ -379,8 +372,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) if new_baseline is not None: - new_baseline = coordinates_for_segment(new_baseline.coords, - parent_image, parent_coords) + new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines @@ -394,20 +386,22 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: - other_polygon = coordinates_for_segment(other_polygon.exterior.coords[:-1], - parent_image, parent_coords) + other_polygon = coordinates_for_segment( + other_polygon.exterior.coords[:-1], parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line 
'{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + +def spread_dist( + logger: Logger, lines, old_labels, new_labels, binarized, components, coords, maxdist=43, loc='', + threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently - # (ignoring smallest components like punctuation) + # (ignoring the smallest components like punctuation) # but when there are conflicts, meet in the middle via watershed new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) @@ -415,7 +409,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon # dilate/grow labels from connected components against each other and bg new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) - # now propagate again to catch smallest components like punctuation + # now propagate again to catch the smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) @@ -444,7 +438,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') - contours = [contour[:,::-1] # get x,y order again + contours = [contour[:, :: -1] # get x,y order again for contour, area in morph.find_contours(new_label)] 
#LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: @@ -452,10 +446,9 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue else: # get alpha shape - poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours - if len(contour) >= 4], - loc=line.id, scale=maxdist) + poly = join_polygons( + [make_valid(Polygon(contour)) for contour in contours if len(contour) >= 4], + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) @@ -472,9 +465,8 @@ def baseline_of_segment(segment, coords): # zzz should go into core ocrd_utils def polygon_from_baseline(baseline, scale): - ltr = baseline[0,0] < baseline[-1,0] + ltr = baseline[0, 0] < baseline[-1, 0] # left-hand side if left-to-right, and vice versa - polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, - single_sided=True)], - scale=scale)) + polygon = make_valid(join_polygons( + [LineString(baseline).buffer(scale * (-1) ** ltr, single_sided=True)], scale=scale)) return polygon From 1004b431e451be4288aa98054dff843bce3e306b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:52:51 +0200 Subject: [PATCH 139/194] spacing: rest --- ocrd_cis/ocropy/binarize.py | 11 ++++++----- ocrd_cis/ocropy/clip.py | 34 ++++++++++++++++++---------------- ocrd_cis/ocropy/denoise.py | 9 ++++----- ocrd_cis/ocropy/deskew.py | 22 +++++++++++----------- ocrd_cis/ocropy/dewarp.py | 21 ++++++++++----------- ocrd_cis/ocropy/recognize.py | 35 +++++++++++++++-------------------- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 66 insertions(+), 70 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 782dd578..35b28c5a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -38,14 +38,14 @@ def 
binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0. if method == 'global': # global thresholding - _, th = cv2.threshold(img,threshold*255,255,cv2.THRESH_BINARY) + _, th = cv2.threshold(img, threshold * 255, 255, cv2.THRESH_BINARY) elif method == 'otsu': # Otsu's thresholding - _, th = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) elif method == 'gauss-otsu': # Otsu's thresholding after Gaussian filtering blur = cv2.GaussianBlur(img, (5, 5), 0) - _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) else: raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 @@ -95,7 +95,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page = pcgts.get_Page() assert page - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) result = OcrdPageResult(pcgts) @@ -162,7 +163,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageRe # to do consistent coordinate transforms, and non-consumers # to redo the rotation themselves): orientation = -page_xywh['angle'] - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: suffix = '.IMG-NRM' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 7f40a214..f5390dde 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -124,16 +124,17 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], 
page_id: str = No masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: - break # keep non-text regions unchanged + break # keep non-text regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] + neighbours = [ + (regionj, maskj) for shapej, regionj, maskj in + zip(shapes[:i] + shapes[i + 1:], regions[:i] + regions[i + 1:], masks[:i] + masks[i + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, @@ -161,24 +162,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] + neighbours = [ + (linej, maskj) for shapej, linej, maskj in + zip(shapes[:j] + shapes[j + 1:], lines[:j] + lines[j + 1:], masks[:j] + masks[j + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) return ret - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, - background_image, parent_image, parent_coords, parent_bin, - page_id) -> OcrdPageResultImage: + def process_segment( + self, segment, segment_mask, 
segment_polygon, neighbours, background_image, parent_image, parent_coords, + parent_bin, page_id + ) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( [feature for feature in parent_coords['features'].split(',') - if feature in ['binarized', 'grayscale_normalized', - 'despeckled', 'dewarped']]) + ',clipped' + if feature in ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']]) + ',clipped' # mask segment within parent image: segment_image = image_from_polygon(parent_image, segment_polygon) segment_bbox = bbox_from_polygon(segment_polygon) @@ -188,8 +190,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: - intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour - intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively + intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour + intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively num_intruders = np.count_nonzero(intruders) num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: @@ -202,14 +204,14 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_mask -= intruders # suppress in derived image result to be annotated clip_mask = array2pil(intruders) - segment_image.paste(background_image, mask=clip_mask) # suppress in raw image + segment_image.paste(background_image, mask=clip_mask) # suppress in raw image if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']: # for consumers that do not have to rely on our # guessed background color, but can cope with 
transparency: segment_image.putalpha(ImageOps.invert(clip_mask)) # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): - segment_image = crop_image(segment_image,box=segment_bbox) + segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index b3c219fb..0dd14ef8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,7 +19,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. 
Open and deserialise PAGE input file and its respective images, @@ -72,8 +72,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') + line, region_image, region_xywh, feature_selector='binarized') image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) @@ -83,8 +82,8 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optiona self.logger.warning(f"Skipping '{segment.id}' with zero size") return None self.logger.info(f"About to despeckle '{segment.id}'") - bin_image = remove_noise(segment_image, - maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt + bin_image = remove_noise( + segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') segment.add_AlternativeImage(alt_image) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 84475d81..7bdbba2d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -25,7 +25,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
Open and deserialise PAGE input file and its respective images, @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option return result if level == 'table': regions = page.get_TableRegion() - else: # region + else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning('Page "%s" contains no text regions', page_id) @@ -78,29 +78,29 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option result.images.append(image) return result - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: + def _process_segment( + self, segment, segment_image, segment_coords, segment_id, page_id + ) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) return None - angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image + angle0 = segment_coords['angle'] # deskewing (w.r.t. 
top image) already applied to segment_image self.logger.info(f"About to deskew {segment_id}") - angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied + angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] - segment.set_orientation(orientation) # also removes all deskewed AlternativeImages + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + segment.set_orientation(orientation) # also removes all deskewed AlternativeImages self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( - segment, page_id, - fill='background', transparency=True) + segment, page_id, fill='background', transparency=True) suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( - segment, segment_image, segment_coords, - fill='background', transparency=True) + segment, segment_image, segment_coords, fill='background', transparency=True) suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 302cf2e0..e06718c8 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -22,27 +22,27 @@ def dewarp(image, lnorm, check=True, max_neighbour=0.02, zoom=1.0): if not image.width or not image.height: raise InvalidLine('image size is zero') line = pil2array(image) - + if np.prod(line.shape) == 0: raise InvalidLine('image dimensions are zero') if np.amax(line) == np.amin(line): raise InvalidLine('image is blank') - - temp = np.amax(line)-line # inverse, 
zero-closed + + temp = np.amax(line) - line # inverse, zero-closed if check: report = check_line(temp, zoom=zoom) if report: raise InadequateLine(report) - - temp = temp * 1.0 / np.amax(temp) # normalized + + temp = temp * 1.0 / np.amax(temp) # normalized if check: report = lnorm.check(temp, max_ignore=max_neighbour) if report: raise InvalidLine(report) - lnorm.measure(temp) # find centerline + lnorm.measure(temp) # find centerline line = lnorm.dewarp(line, cval=np.amax(line)) - + return array2pil(line) # pad with white above and below (as a fallback for dewarp) @@ -72,7 +72,7 @@ def setup(self): # and extra params) 0.3)) - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. Open and deserialise PAGE input file and its respective images, @@ -115,9 +115,8 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) + dew_image = dewarp( + line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f0c4b520..02d29e7c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -47,7 +47,7 @@ def recognize(image, pad, network, check=True): # getting confidence result = lstm.translate_back(network.outputs, pos=1) - scale = len(raw_line.T)*1.0/(len(network.outputs)-2*pad) + scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] rlist = [] @@ -57,7 +57,7 @@ def recognize(image, 
pad, network, check=True): if c != 0: confid = network.outputs[r, c] c = network.l2s([c]) - r = (r-pad)*scale + r = (r - pad) * scale confidlist.append(confid) clist.append(c) @@ -88,7 +88,7 @@ def setup(self): def get_model(self): """Search for the model file. First checks if parameter['model'] can - be resolved with OcrdResourceManager to a valid readeable file and + be resolved with OcrdResourceManager to a valid readable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) @@ -202,8 +202,8 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): words = [x.strip() for x in linepred.split(' ') if x.strip()] - word_r_list = [[0]] # r-positions of every glyph in every word - word_conf_list = [[]] # confidences of every glyph in every word + word_r_list = [[0]] # r-positions of every glyph in every word + word_conf_list = [[]] # confidences of every glyph in every word if words != []: w_no = 0 found_char = False @@ -215,7 +215,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': + elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) w_no += 1 @@ -224,9 +224,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_r_list = [[0, line_image.width]] # conf for each word - wordsconf = [(min(x)+max(x))/2 for x in word_conf_list] + wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list] # conf for the line - line_conf = (min(wordsconf) + max(wordsconf))/2 + line_conf = (min(wordsconf) + max(wordsconf)) / 2 # line text line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) @@ -235,32 +235,27 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_points = points_from_polygon( 
coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][0] / scale, - 0, - word_r_list[word_no][-1] / scale, - 0 + line_image.height)), + word_r_list[word_no][0] / scale,0, + word_r_list[word_no][-1] / scale, 0 + line_image.height)), line_image, line_coords)) word_id = '%s_word%04d' % (line.id, word_no) word = WordType(id=word_id, Coords=CoordsType(word_points)) line.add_Word(word) - word.add_TextEquiv(TextEquivType( - Unicode=word_str, conf=wordsconf[word_no])) + word.add_TextEquiv(TextEquivType(Unicode=word_str, conf=wordsconf[word_no])) if maxlevel == 'glyph': for glyph_no, glyph_str in enumerate(word_str): glyph_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][glyph_no] / scale, - 0, - word_r_list[word_no][glyph_no+1] / scale, - 0 + line_image.height)), + word_r_list[word_no][glyph_no] / scale, 0, + word_r_list[word_no][glyph_no + 1] / scale, 0 + line_image.height)), line_image, line_coords)) glyph_id = '%s_glyph%04d' % (word.id, glyph_no) glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points)) word.add_Glyph(glyph) - glyph.add_TextEquiv(TextEquivType( - Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) + glyph.add_TextEquiv( + TextEquivType(Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) return edits, lengs diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 48bb0d40..5a8c7e96 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -52,7 +52,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
Open and deserialise PAGE input file and its respective images, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 75be2a11..6dc75056 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -252,7 +252,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. Open and deserialise PAGE input file and its respective images, From c5498a0e8d8bc9a8e3fe3bf0848df9b135bae69c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:55:44 +0200 Subject: [PATCH 140/194] spacing: dewarp --- ocrd_cis/ocropy/dewarp.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e06718c8..89901efd 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -95,24 +95,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional result = OcrdPageResult(pcgts) page = pcgts.get_Page() - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) lines = region.get_TextLine() if not lines: 
self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - + line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( From 31e124577faad71f2bb039a6b094900b6cdf9df1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:58:52 +0200 Subject: [PATCH 141/194] fix: dewarp return --- ocrd_cis/ocropy/dewarp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89901efd..17d0b4ce 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -123,5 +123,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional dew_image = padvert(line_image, self.parameter['range']) # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') - line.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) + line.add_AlternativeImage(alt_image) + suffix = f"{region.id}_{line.id}.IMG-DEWARP" + result.images.append(OcrdPageResultImage(dew_image, suffix, alt_image)) + return result From f86c99391e987d4918b6d626dbf1b2f990d7712b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 17:21:14 +0200 Subject: [PATCH 142/194] improve str speed: precompute element_name_id --- ocrd_cis/ocropy/segment.py | 92 +++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6dc75056..9daf59de 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -388,13 +388,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], 
page_id: Optional rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) # go get TextRegions with TextLines (and SeparatorRegions): - image = self._process_element(page, ignore, page_image, page_coords, - zoom=zoom, rogroup=rogroup) + image = self._process_element(page, ignore, page_image, page_coords, zoom=zoom, rogroup=rogroup) if image: result.images.append(image) return result @@ -450,11 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - image = self._process_element(region, subignore, region_image, region_coords, - zoom=zoom, rogroup=roelem) + image = self._process_element( + region, subignore, region_image, region_coords, zoom=zoom, rogroup=roelem) if image: result.images.append(image) - else: # 'region' + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -463,10 +462,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType( + id=f'{region.id}_text', Coords=region.get_Coords(), parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -490,7 +487,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: 
Optional image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) if image: result.images.append(image) - return result def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: @@ -535,7 +531,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -555,6 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + element_name_id = f'{element_name} "{element.id}"' # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -564,7 +561,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -576,10 +573,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} 
"{element.id}": {err}') + self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name_id}') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -594,18 +591,18 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name_id}') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') + self.logger.error(f'Cannot region-segment {element_name_id}: {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -622,7 +619,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -644,18 +641,17 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non 
order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element.id}"', - min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, name=element_name_id, + min_area=6000 / zoom / zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -674,7 +670,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: @@ -696,16 +692,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # if the region has received text lines, keep it if region.get_TextLine(): 
element.add_TextRegion(region) - self.logger.info(f'Added region "{region_id}" with {line_no} lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines for {element_name_id}') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name_id}') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - name=f'{element_name} "{element.id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, name=element_name_id) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -719,11 +713,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name_id}') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - name=f'{element_name} "{element.id}"', - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, name=element_name_id, open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -737,7 
+730,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image - element_array[sepmask] = np.amax(element_array) # clip to white/bg + element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) @@ -746,15 +739,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon( + region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -772,9 +764,9 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return None # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = 
np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) From b8e3ad6207a832fad65bccf5ea4756c004bb1f96 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:26:33 +0200 Subject: [PATCH 143/194] fix: clip suffix --- ocrd_cis/ocropy/clip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index f5390dde..b81c731c 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -213,6 +213,7 @@ def process_segment( # (and also clipping with background colour): segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): + suffix = f'{segment.id}.IMG_CLIP' alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) + return OcrdPageResultImage(segment_image, suffix, alternative_image) From 02724f2db8c1d29f739282a42330c1a9b14e27d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:30:11 +0200 Subject: [PATCH 144/194] fix: denoise return --- ocrd_cis/ocropy/denoise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0dd14ef8..4ae883fd 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -76,6 +76,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) + return result def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: From aac6fe0989ccb483626af6b238e98162b780aac5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:50:08 
+0200 Subject: [PATCH 145/194] try to fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 4ae883fd..fd9812f8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -51,7 +51,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': - image = self.process_segment(page, page_image, page_xywh, zoom) + image = self.process_segment(page, page_image, page_xywh, zoom, page_id) if image: result.images.append(image) else: @@ -63,7 +63,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom) + image = self.process_segment(region, region_image, region_xywh, zoom, page_id) if image: result.images.append(image) continue @@ -73,12 +73,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom) + image = self.process_segment(line, line_image, line_xywh, zoom, page_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,5 +87,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> 
Optiona segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + suffix = f"{page_id}_{segment.id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 5548d0e6043e32d7409fef9817775670b2d1b96f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:58:12 +0200 Subject: [PATCH 146/194] fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fd9812f8..eb3e7d23 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}" + image = self.process_segment(region, region_image, region_xywh, zoom, file_id) if image: result.images.append(image) continue @@ -73,12 +74,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}_{line.id}" + image = self.process_segment(line, line_image, line_xywh, zoom, file_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, 
segment_xywh, zoom, file_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,6 +89,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) - segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') - suffix = f"{page_id}_{segment.id}.IMG-DESPECK" + suffix = f"{file_id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) return OcrdPageResultImage(bin_image, suffix, alt_image) From c9f0f56787f2d34d718bc504ee3d07f7501dff75 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:26:54 +0200 Subject: [PATCH 147/194] fix: resegment --- ocrd_cis/ocropy/resegment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5a8c7e96..c1809569 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -427,7 +427,7 @@ def spread_dist( continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: From fff909746f1347fc9336f8413fd311ac4e3ce206 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:27:05 +0200 Subject: [PATCH 148/194] optimize segment --- ocrd_cis/ocropy/segment.py | 48 ++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9daf59de..b363cbd2 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -544,14 +544,14 @@ def _process_element(self, element, ignore, image, 
coords, zoom=1.0, rogroup=Non element_name = 'table' fullpage = True report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' + suffix = f"{element.id}.IMG-CLIP" else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' - self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + suffix = f"{element.id}.IMG-CLIP" element_name_id = f'{element_name} "{element.id}"' + self.logger.info(f'Computing line segmentation for {element_name_id}') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -571,7 +571,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non if isinstance(element, TextRegionType): self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=f"{element.id}_line", Coords=element.get_Coords())) else: self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None @@ -664,7 +664,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_region%04d" % region_no + region_id = f"{element.id}_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) @@ -682,7 +682,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = region_id + "_line%04d" % line_no + line_id = f"{region_id}_line%04d" % line_no self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') line = 
TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -709,7 +709,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue region_no += 1 # annotate result: - region_id = element.id + "_image%04d" % region_no + region_id = f"{element.id}_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: @@ -726,7 +726,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_sep%04d" % region_no + region_id = f"{element.id}_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image @@ -739,8 +739,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon( - region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): @@ -757,7 +756,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = element.id + "_line%04d" % line_no + line_id = f"{element.id}_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) @@ -868,11 +867,12 @@ def join_polygons(polygons, 
loc='', scale=20): dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) + max_dist = max(1.0, scale / 5) for prevp, nextp in zip(*dists.nonzero()): prevp = polygons[prevp] nextp = polygons[nextp] nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + bridgep = LineString(nearest).buffer(max_dist, resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt @@ -1017,11 +1017,9 @@ def page_add_to_reading_order(rogroup, region_id, index=None): """ if rogroup: if index is None: - rogroup.add_RegionRef(RegionRefType( - regionRef=region_id)) + rogroup.add_RegionRef(RegionRefType(regionRef=region_id)) else: - rogroup.add_RegionRefIndexed(RegionRefIndexedType( - regionRef=region_id, index=index)) + rogroup.add_RegionRefIndexed(RegionRefIndexedType(regionRef=region_id, index=index)) index += 1 return index @@ -1045,36 +1043,30 @@ def page_subgroup_in_reading_order(logger: Logger, roelem): if not roelem.parent_object_: logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem - if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( + if isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or roelem.get_UnorderedGroupIndexed() or roelem.get_RegionRefIndexed()): # is already a group and still empty return roelem - if isinstance(roelem, (OrderedGroupType, - UnorderedGroupType, - RegionRefType)): + if isinstance(roelem, (OrderedGroupType, UnorderedGroupType, RegionRefType)): getattr(roelem.parent_object_, { OrderedGroupType: 'get_OrderedGroup', UnorderedGroupType: 'get_UnorderedGroup', RegionRefType: 'get_RegionRef', }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupType(id=roelem.regionRef + '_group', - regionRef=roelem.regionRef) + roelem2 = OrderedGroupType(id=f"{roelem.regionRef}_group", 
regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroup(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 - if isinstance(roelem, (OrderedGroupIndexedType, - UnorderedGroupIndexedType, - RegionRefIndexedType)): + if isinstance(roelem, (OrderedGroupIndexedType, UnorderedGroupIndexedType, RegionRefIndexedType)): getattr(roelem.parent_object_, { OrderedGroupIndexedType: 'get_OrderedGroupIndexed', UnorderedGroupIndexedType: 'get_UnorderedGroupIndexed', RegionRefIndexedType: 'get_RegionRefIndexed' }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupIndexedType(id=roelem.regionRef + '_group', - index=roelem.index, - regionRef=roelem.regionRef) + roelem2 = OrderedGroupIndexedType( + id=f"{roelem.regionRef}_group", index=roelem.index, regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroupIndexed(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 From 8b9283232a57b7c49a78420b32c915b32992ee9a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:02:28 +0200 Subject: [PATCH 149/194] optimize ocropy common --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..a5806517 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,16 +184,19 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] + if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e * 20.0) - v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 + e_20_0 = e * 20.0 + e_50 = int(e * 50) + v = est - filters.gaussian_filter(est, e_20_0) + v = filters.gaussian_filter(v ** 2, e_20_0) 
** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) - v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) + v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -310,24 +313,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)200/zoom: return "image too tall for a text line %s"%(binary.shape,) + if h<20/zoom: return f"image not tall enough for a text line {binary.shape}" + if h>200/zoom: return f"image too tall for a text line {binary.shape}" ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) - if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" + if w>4000/zoom: return f"image too long for a line image {binary.shape}" return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps=%d)"%(ncomps,lo) - ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) - if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps={lo})" + ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" + if ncomps>hi*ratio and ncomps>10: return 
f"too many connected components (got {ncomps}, wanted <={hi})" return None # inspired by ocropus-gpageseg check_page @@ -341,21 +344,21 @@ def check_region(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)5000/zoom: return "image too tall for a region image %s"%(binary.shape,) - if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) - if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + if h<45/zoom: return f"image not tall enough for a region image {binary.shape}" + if h>5000/zoom: return f"image too tall for a region image {binary.shape}" + if w<100/zoom: return f"image too narrow for a region image {binary.shape}" + if w>5000/zoom: return f"image too wide for a region image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) + if ncomps<5: return f"too few connected components for a region image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" return None # from ocropus-gpageseg, but with zoom parameter @@ -369,21 +372,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)20000/zoom: return "image too tall for a page image %s"%(binary.shape,) - if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if h<600/zoom: return f"image not tall enough for a page image {binary.shape}" + if h>20000/zoom: return f"image too tall for a page image {binary.shape}" + if w<600/zoom: return f"image too narrow for a page image {binary.shape}" + if w>20000/zoom: return f"image too wide for a page image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) + if ncomps<10: return f"too few connected components for a page image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" return None def odd(num): @@ -476,8 +479,13 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) - DSAVE('images1_large', images+0.6*binary) + binary_0_6 = 0.6 * binary + odd_scale = odd(scale) + odd_half_scale = odd(scale / 2) + odd_doubled_scale = odd(2 * scale) + region_min = (4 * scale) ** 2 + images = 
morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) + DSAVE('images1_large', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -486,31 +494,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd(scale/2))) - DSAVE('images2_h-opened', h_opened+0.6*binary) - v_opened = morph.rb_opening(images, (odd(scale/2), 1)) - DSAVE('images2_v-opened', v_opened+0.6*binary) + h_opened = morph.rb_opening(images, (1, odd_half_scale)) + DSAVE('images2_h-opened', h_opened + binary_0_6) + v_opened = morph.rb_opening(images, (odd_half_scale, 1)) + DSAVE('images2_v-opened', v_opened + binary_0_6) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) - DSAVE('images3_closed', closed+0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) + DSAVE('images3_closed', closed + binary_0_6) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images+0.6*binary) + DSAVE('images4_reconstructed', images + binary_0_6) # 5- select nbest - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) - DSAVE('images5_selected', images+0.6*binary) + images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) + DSAVE('images5_selected', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = 
morph.r_dilation(images, (odd(scale),odd(scale))) + dilated = morph.r_dilation(images, (odd_scale, odd_scale)) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images+0.6*binary) + DSAVE('images6_dilated', images + binary_0_6) # we could repeat reconstruct-dilate here... return images @@ -548,6 +556,7 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] + doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -599,8 +608,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > 2 * scale)[binlabels] - smallmask = (binlabelcounts <= 2 * scale)[binlabels] + largemask = (binlabelcounts > doubled_scale)[binlabels] + smallmask = (binlabelcounts <= doubled_scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1843,11 +1852,13 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - for i in range(int(scale/2)): - llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) - llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) - llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) - llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) + log_y = -10 * np.log(y + 1e-9) + log_x = -10 * np.log(x + 1e-9) + for i in range(int(scale / 2)): + llab[box[0], box[1].start + i] = log_y + llab[box[0], box[1].stop - 1 - i] = log_y + llab[box[0].start + i, box[1]] = log_x + llab[box[0].stop - 1 - i, box[1]] = log_x DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1877,19 +1888,19 @@ def find_topological(): weights = weights * (1 + 0.5 * 
props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - LOG.debug(' {} gaps {} {} weights {}'.format( - 'horizontal' if is_horizontal else 'vertical', - gaps, props, weights)) + orientation = 'horizontal' if is_horizontal else 'vertical' + LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') if not gaps.shape[0]: continue + half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1916,32 +1927,27 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') # suppress cuts that significantly split any line labels + min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), - minlength=len(objects))[1:] > min_line 
* scale)[0], - assume_unique=True))) - for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], - assume_unique=True))) - for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) + if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') # suppress less prominent peaks (another heuristic...) 
# they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1949,33 +1955,30 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? - y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:gap,:]>0)[0]) + - morph.find_objects(morph.label( - partitions[gap:,:]>0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:,:gap]>0)[0]) + - morph.find_objects(morph.label( - partitions[:,gap:]>0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + + morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + + morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = 
x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') else: y_partitionscores = None x_partitionscores = None @@ -1986,7 +1989,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2052,7 +2055,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug('next partition %d on %s', label, box) + LOG.debug(f'next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2060,10 +2063,9 @@ def find_topological(): # no gaps left finalize() return + orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', - box, gaps) + LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2078,9 +2080,7 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', - box, sub) + LOG.debug(f'next {orientation} block on {box} is {sub}') recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) 
if isinstance(mask, np.ndarray) else None) From fceaffe4e928bff7ea70aece7baa3d3717c03cff Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:47 +0200 Subject: [PATCH 150/194] optimize ocrolib --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++++-------- ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++------------------ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..b9619cca 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -292,8 +292,9 @@ def propagate_labels_majority(image,labels): with the largest overlap.""" rlabels,_ = label(image) cors = correspondences(rlabels,labels) - outputs = zeros(amax(rlabels)+1,'i') - counts = zeros(amax(rlabels)+1,'i') + amax_rlabels = amax(rlabels) + 1 + outputs = zeros(amax_rlabels,'i') + counts = zeros(amax_rlabels,'i') for rlabel, label_, count in cors.T: if not rlabel or not label_: # ignore background correspondences @@ -347,12 +348,13 @@ def all_neighbors(image, dist=1, bg=NaN): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 - assert amax(image)<q - assert amin(image)>=0 - u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) - d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) - l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) - r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) + assert amax(image) < q + assert amin(image) >= 0 + q_image = q * image + u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg)) + d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg)) + l = unique(q_image + shift(image, (0, dist), order=0, cval=bg)) + r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py
b/ocrd_cis/ocropy/ocrolib/toplevel.py index 87ed18c5..72e397af 100644 --- a/ocrd_cis/ocropy/ocrolib/toplevel.py +++ b/ocrd_cis/ocropy/ocrolib/toplevel.py @@ -125,14 +125,10 @@ def __init__(self,*args,**kw): self.fun = kw.get("fun","?") self.var = kw.get("var","?") self.description = " ".join([strc(x) for x in args]) + def __str__(self): - result = "\nCheckError for argument " - result += str(self.var) - result += " of function " - result += str(self.fun) - result += "\n" - result += self.description - return result + return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}" + class CheckWarning(CheckError): def __init__(self,*args,**kw): @@ -142,14 +138,8 @@ def __init__(self,*args,**kw): CheckError.__init__(self, *args, **kw) def __str__(self): - result = "\nCheckWarning for argument " - result += str(self.var) - result += " of function " - result += str(self.fun) - result += "\n" - result += self.description - result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n" - return result + return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} " + f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n") def checktype(value,type_): """Check value against the type spec. 
If everything @@ -211,7 +201,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical("unknown exception while checking function: '%s'", name) + LOG.critical(f"unknown exception while checking function: '{name}'") raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -225,9 +215,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") else: - raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") return wrapper return decorator From 3de2585787ea2b59126a4a1c39d9df3e42d18362 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:58 +0200 Subject: [PATCH 151/194] optimize align cli --- ocrd_cis/align/cli.py | 85 ++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7747622e..7d6599c2 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -57,16 +57,16 @@ def process(self): def align(self, alignments, ift): """align the alignment objects with the according input file tuples""" for t in ift: - self.log.debug("tuple %s", os.path.basename(t.input_file.url)) + self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") pcgtst = self.open_input_file_tuples(ift) i = 0 for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): for mj, _ in enumerate(mr.get_TextLine()): for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") for xx in mr.get_TextLine()[mj].get_Word(): for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") lines = [] for ii, t in enumerate(ift): @@ -88,23 
+88,21 @@ def align_lines(self, lines): for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug('line alignment: %s [%s - %s]', - get_textequiv_unicode(line.region), - line.region.get_id(), - line.input_file.input_file_group) - ddt = line.input_file.input_file_group + "/" + line.region.get_id() + self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' + f'[{line.region.get_id()} - {line.input_file.input_file_group}]') + ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" if i != 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), dataType="other", - dataTypeDetails="ocrd-cis-line-alignment:" + ddt) + dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) lines[0].region.get_TextEquiv()[i].set_dataType("other") lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-line-alignment-master-ocr:" + ddt) + f"ocrd-cis-line-alignment-master-ocr:{ddt}") lines[0].region.get_TextEquiv()[i].set_index(i+1) self.align_words(lines) @@ -113,18 +111,18 @@ def align_words(self, lines): mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug("aligning word %s", word['master']) + self.log.debug(f"aligning word {word['master']}", ) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn("cannot find {}; giving up".format(word['master'])) - # raise Exception("cannot find {}; giving up".format(word['master'])) + self.log.warn(f"cannot find {word['master']}; giving up") + # raise Exception(f"cannot find {word['master']}; giving up") return others = list() for i, other in enumerate(word['alignments']): match, rest = 
self.find_word(other, oregion[i]) if match is None: - self.log.warn("cannot find {}; giving up".format(other)) + self.log.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest @@ -132,10 +130,7 @@ def align_words(self, lines): words.append( Alignment(lines[0].input_file, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment( - lines[i+1].input_file, - other, - lines[i+1].alignment)) + words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) self.align_word_regions(words) def align_word_regions(self, words): @@ -144,10 +139,8 @@ def te0(x): for i, word in enumerate(words): if not word.region: ifg = word.input_file.input_file_group - self.log.debug("(empty) word alignment: [%s]", ifg) - te = TextEquivType( - dataType="other", - dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) + self.log.debug(f"(empty) word alignment: [{ifg}]") + te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue @@ -157,46 +150,38 @@ def te0(x): ddt = word.input_file.input_file_group + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug("word alignment: %s [%s - %s]", _str, _id, ifg) + self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( - Unicode=_str, - conf=conf, - dataType="other", - dataTypeDetails="ocrd-cis-word-alignment:" + ddt) + Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") words[0].region[0].add_TextEquiv(te) else: words[0].region[0].get_TextEquiv()[i].set_dataType("other") - words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-word-alignment-master-ocr:" + ddt) + 
words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(f"ocrd-cis-word-alignment-master-ocr:{ddt}") words[0].region[0].get_TextEquiv()[i].set_index(i+1) def find_word(self, tokens, regions, t="other"): - self.log.debug("tokens = %s [%s]", tokens, t) + tokens_str = f"tokens = {tokens} [{t}]" + self.log.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) - # nothing could be found return tuple([None, regions]) @@ -212,7 +197,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug("lev %s <=> %s: %d (%d)", a, b, d, d) + self.log.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -227,14 +212,15 @@ def match_tokens_lambda(self, tokens, regions, i, f): Returns 0 if nothing could be matched. 
""" for j, token in enumerate(tokens): - if j + i >= len(regions): + sum_i_j = j + i + if sum_i_j >= len(regions): return 0 - if not regions[i+j].get_TextEquiv()[0].Unicode: - self.log.warn("cannot find %s", token) + unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + if not unicode: + self.log.warn(f"cannot find {token}") return 0 - self.log.debug('checking %s with %s', token, - regions[i+j].get_TextEquiv()[0].Unicode) - if f(token, regions[i+j].get_TextEquiv()[0].Unicode): + self.log.debug(f'checking {token} with {unicode}') + if f(token, unicode): continue if j == 0: return 0 @@ -259,19 +245,18 @@ def zip_input_files(self, ifgs): """Zip files of the given input file groups""" files = list() for ifg in ifgs: - self.log.info("input file group: %s", ifg) + self.log.info(f"input file group: {ifg}") ifiles = sorted( self.workspace.mets.find_files(fileGrp=ifg), key=lambda ifile: ifile.url) for i in ifiles: - self.log.debug("sorted file: %s %s", - os.path.basename(i.url), i.ID) + self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] files.append(ifiles) return zip(*files) def read_lines_from_input_file(self, ifile): - self.log.info("reading input file: %s", ifile) + self.log.info(f"reading input file: {ifile}") lines = list() pcgts = ifile.open() for region in pcgts.get_Page().get_TextRegion(): @@ -286,7 +271,7 @@ def run_java_aligner(self, ifs): lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug("input line: %s", i) + self.log.debug(f"input line: {i}") n = len(ifs) self.log.debug("starting java client") p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) @@ -300,7 +285,7 @@ def __init__(self, workspace, ifile, ifg): self.log = getLogger('cis.FileAlignment') def open(self): - self.log.info("opening: %s", os.path.basename(self.input_file.url)) + self.log.info(f"opening: {os.path.basename(self.input_file.url)}") return 
page_from_file(self.workspace.download_file(self.input_file)) From 0949277dbe049c1cd6776b3c701980c48cf2ebc8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:34 +0200 Subject: [PATCH 152/194] align: use final v3 API --- ocrd_cis/align/cli.py | 229 ++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 144 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7d6599c2..f85b7348 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -1,97 +1,71 @@ from __future__ import absolute_import +from __future__ import annotations + import click import json import os +from typing import Optional, List, Dict, Type + from rapidfuzz.distance import Levenshtein -from ocrd import Processor + +from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import MIMETYPE_PAGE from ocrd_utils import getLogger from ocrd_utils import getLevelName -from ocrd_utils import make_file_id -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner -from ocrd_cis import get_ocrd_tool @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): - return ocrd_cli_wrap_processor(Aligner, *args, **kwargs) + return ocrd_cli_wrap_processor(CISAligner, *args, **kwargs) -class Aligner(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-align'] - kwargs['version'] = ocrd_tool['version'] - super(Aligner, self).__init__(*args, **kwargs) +class CISAligner(Processor): + @property + def executable(self): + return 'ocrd-cis-align' - if hasattr(self, 'workspace'): - self.log = getLogger('cis.Processor.Aligner') - - def process(self): - ifgs = 
self.input_file_grp.split(",") # input file groups - if len(ifgs) < 2: - raise Exception("need at least two input file groups to align") - ifts = self.zip_input_files(ifgs) # input file tuples - for _id, ift in enumerate(ifts): - alignments = json.loads(self.run_java_aligner(ift)) - pcgts = self.align(alignments, ift) - # keep the right part after OCR-D-...-filename - # and prepend output_file_grp - input_file = ift[0].input_file - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts), - ) - self.log.info('created file %s', out) + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + assert len(input_pcgts) >= 2 + alignments = json.loads(self.run_java_aligner(input_pcgts)) + pcgts = self.align(alignments, input_pcgts) + return OcrdPageResult(pcgts) - def align(self, alignments, ift): + def align(self, alignments: List[Dict], pcgts: List[OcrdPage]) -> OcrdPage: """align the alignment objects with the according input file tuples""" - for t in ift: - self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") - pcgtst = self.open_input_file_tuples(ift) i = 0 - for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): + file_groups = self.input_file_grp.split(',') + for mi, mr in enumerate(pcgts[0].get_Page().get_AllRegions(classes=['Text'])): for mj, _ in enumerate(mr.get_TextLine()): - for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - for xx in mr.get_TextLine()[mj].get_Word(): - for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - lines = [] - for ii, t in enumerate(ift): + for ii, page in enumerate(pcgts): if i >= len(alignments): break - tr = 
pcgtst[ii].get_Page().get_TextRegion() + tr = page.get_Page().get_AllRegions(classes=['Text']) region = tr[mi].get_TextLine()[mj] - lines.append(Alignment(t, region, alignments[i])) + lines.append(Alignment(file_groups[ii], page, region, alignments[i])) self.align_lines(lines) i += 1 - return pcgtst[0] + return pcgts[0] - def align_lines(self, lines): + def align_lines(self, lines: List[Alignment]) -> None: """align the given line alignment with the lines""" if not lines: return - if len(lines[0].region.get_TextEquiv()) > 1: - del lines[0].region.get_TextEquiv()[1:] + if len(lines[0].region.TextEquiv) > 1: + del lines[0].region.TextEquiv[1:] for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' - f'[{line.region.get_id()} - {line.input_file.input_file_group}]') - ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" - if i != 0: + self.logger.debug( + 'line alignment: %s [%s - %s]', + get_textequiv_unicode(line.region), + line.region.get_id(), + line.file_grp + ) + ddt = line.file_grp + "/" + line.region.get_id() + if i > 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), @@ -99,58 +73,64 @@ def align_lines(self, lines): dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: - self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) - lines[0].region.get_TextEquiv()[i].set_dataType("other") - lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - f"ocrd-cis-line-alignment-master-ocr:{ddt}") - lines[0].region.get_TextEquiv()[i].set_index(i+1) + self.logger.debug("len: %i, i: %i", len(lines[0].region.TextEquiv), i) + lines[0].region.TextEquiv[i].set_dataType("other") + lines[0].region.TextEquiv[i].set_dataTypeDetails( + "ocrd-cis-line-alignment-master-ocr:" + ddt) + lines[0].region.TextEquiv[i].set_index(i+1) 
self.align_words(lines) - def align_words(self, lines): - # self.log.info(json.dumps(lines[0].alignment)) + def align_words(self, lines: List[Alignment]) -> None: + # self.logger.info(json.dumps(lines[0].alignment)) mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug(f"aligning word {word['master']}", ) + self.logger.debug("aligning word %s", word['master']) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn(f"cannot find {word['master']}; giving up") - # raise Exception(f"cannot find {word['master']}; giving up") + self.logger.warn("cannot find {}; giving up".format(word['master'])) + # raise Exception("cannot find {}; giving up".format(word['master'])) return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn(f"cannot find {other}; giving up") + self.logger.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest words = list() words.append( - Alignment(lines[0].input_file, master, lines[0].alignment)) + Alignment(lines[0].file_grp, lines[0].pcgts, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) + words.append(Alignment( + lines[i+1].file_grp, + lines[i+1].pcgts, + other, + lines[i+1].alignment)) self.align_word_regions(words) - def align_word_regions(self, words): + def align_word_regions(self, words: List[Alignment]) -> None: def te0(x): - return x.get_TextEquiv()[0] + return x.TextEquiv[0] for i, word in enumerate(words): if not word.region: - ifg = word.input_file.input_file_group - self.log.debug(f"(empty) word alignment: [{ifg}]") - te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") + ifg = 
word.file_grp + self.logger.debug("(empty) word alignment: [%s]", ifg) + te = TextEquivType( + dataType="other", + dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue _str = " ".join([te0(x).Unicode for x in word.region]) _id = ",".join([x.get_id() for x in word.region]) - ifg = word.input_file.input_file_group - ddt = word.input_file.input_file_group + "/" + _id + ifg = word.file_grp + ddt = word.file_grp + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") + self.logger.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") @@ -162,21 +142,21 @@ def te0(x): def find_word(self, tokens, regions, t="other"): tokens_str = f"tokens = {tokens} [{t}]" - self.log.debug(tokens_str) + self.logger.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: @@ -197,7 +177,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug(f"lev {a} <=> {b}: {d} ({d})") + 
self.logger.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -215,11 +195,11 @@ def match_tokens_lambda(self, tokens, regions, i, f): sum_i_j = j + i if sum_i_j >= len(regions): return 0 - unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + unicode = regions[sum_i_j].TextEquiv[0].Unicode if not unicode: - self.log.warn(f"cannot find {token}") + self.logger.warn(f"cannot find {token}") return 0 - self.log.debug(f'checking {token} with {unicode}') + self.logger.debug(f'checking {token} with {unicode}') if f(token, unicode): continue if j == 0: @@ -230,68 +210,29 @@ def match_tokens_lambda(self, tokens, regions, i, f): i += 1 return i + len(tokens) - def open_input_file_tuples(self, ift): - """ - opens all xml files of the given input file tuple - and returns them as tuples - """ - res = list() - for ifile in ift: - pcgts = ifile.open() - res.append(pcgts) - return tuple(res) - - def zip_input_files(self, ifgs): - """Zip files of the given input file groups""" - files = list() - for ifg in ifgs: - self.log.info(f"input file group: {ifg}") - ifiles = sorted( - self.workspace.mets.find_files(fileGrp=ifg), - key=lambda ifile: ifile.url) - for i in ifiles: - self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") - ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] - files.append(ifiles) - return zip(*files) - - def read_lines_from_input_file(self, ifile): - self.log.info(f"reading input file: {ifile}") + def run_java_aligner(self, input_pcgts: List[OcrdPage]) -> str: lines = list() - pcgts = ifile.open() - for region in pcgts.get_Page().get_TextRegion(): - for line in region.get_TextLine(): - lines.append(get_textequiv_unicode(line)) - return lines - - def run_java_aligner(self, ifs): - lines = list() - for ifile in ifs: - lines.append(self.read_lines_from_input_file(ifile)) + for pcgts in input_pcgts: + lines.append([get_textequiv_unicode(line) + for line in 
pcgts.get_Page().get_AllTextLines()]) + # JavaAligner expects a strange input format lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug(f"input line: {i}") - n = len(ifs) - self.log.debug("starting java client") - p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) + self.logger.debug("input line: %s", i) + n = len(input_pcgts) + self.logger.debug("starting java client") + p = JavaAligner(n, getLevelName(self.logger.getEffectiveLevel())) return p.run("\n".join(_input)) -class FileAlignment: - def __init__(self, workspace, ifile, ifg): - self.workspace = workspace - self.input_file = ifile - self.input_file_group = ifg - self.log = getLogger('cis.FileAlignment') - - def open(self): - self.log.info(f"opening: {os.path.basename(self.input_file.url)}") - return page_from_file(self.workspace.download_file(self.input_file)) - - class Alignment: - def __init__(self, ifile, region, alignment): - self.input_file = ifile + file_grp: str + pcgts: OcrdPage + region: TextRegionType + alignment: Alignment + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + self.file_grp = file_grp + self.pcgts = pcgts self.region = region self.alignment = alignment From d4f8483ffdefac50161e4376637b9f8e813c384f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:58 +0200 Subject: [PATCH 153/194] use ocrd_utils instead of pkg_resources --- ocrd_cis/data/__main__.py | 10 +++++----- ocrd_cis/javaprocess.py | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py index 3d8ef735..8fdcddd6 100644 --- a/ocrd_cis/data/__main__.py +++ b/ocrd_cis/data/__main__.py @@ -1,18 +1,18 @@ -import pkg_resources import sys +from ocrd_utils import resource_filename def main(): usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config' if '-h' in sys.argv: print(usage) elif '-jar' in sys.argv: - 
print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) + print(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) elif '-3gs' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz')) + print(resource_filename('ocrd_cis', 'data/3gs.csv.gz')) elif '-model' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip')) + print(resource_filename('ocrd_cis', 'data/model.zip')) elif '-config' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json')) + print(resource_filename('ocrd_cis', 'data/config.json')) else: raise ValueError(usage) diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py index ce2f6bfd..72915d68 100644 --- a/ocrd_cis/javaprocess.py +++ b/ocrd_cis/javaprocess.py @@ -1,12 +1,11 @@ import subprocess import json -import pkg_resources -from ocrd_utils import getLogger +from ocrd_utils import getLogger, resource_filename from pathlib import Path MAIN = "de.lmu.cis.ocrd.cli.Main" -JAR = pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar') +JAR = str(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) def JavaAligner(n, loglvl): """Create a java process that calls -c align -D '{"n":n}'""" From ecc44c0358354c0c3c3ba6000e7de7413dc9cef1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:31:09 +0200 Subject: [PATCH 154/194] postcorrect: use final v3 API --- ocrd_cis/align/cli.py | 1 + ocrd_cis/postcorrect/cli.py | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index f85b7348..f5e47785 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -16,6 +16,7 @@ from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner + @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index dc3ee48e..71fbaad1 100644 --- 
a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,14 +1,15 @@ from __future__ import absolute_import +import os + import click import json -import os + from ocrd import Processor -from ocrd.decorators import ocrd_cli_options -from ocrd.decorators import ocrd_cli_wrap_processor +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLogger, getLevelName from ocrd_models.ocrd_mets import OcrdMets from ocrd_cis import JavaPostCorrector -from ocrd_cis import get_ocrd_tool + @click.command() @ocrd_cli_options @@ -16,26 +17,23 @@ def ocrd_cis_postcorrect(*args, **kwargs): return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] - kwargs['version'] = ocrd_tool['version'] - super(PostCorrector, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-postcorrect' def process(self): - self.log = getLogger('processor.CISPostCorrector') profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True self.parameter["profiler"] = profiler self.parameter["runDM"] = True - self.log.debug(json.dumps(self.parameter, indent=4)) + self.logger.debug(json.dumps(self.parameter, indent=4)) p = JavaPostCorrector(self.workspace.mets_target, self.input_file_grp, self.output_file_grp, self.parameter, - getLevelName(self.log.getEffectiveLevel())) + getLevelName(self.logger.getEffectiveLevel())) p.exe() # reload the mets file to prevent run_processor's save_mets # from overriding the results from the Java process From 2b310b4690b1a83be75cd93432ea38be7250ee35 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 13:51:07 +0200 Subject: [PATCH 155/194] revert: ocropy.ocrolib changes --- ocrd_cis/ocropy/ocrolib/morph.py | 18 
++++++++---------- ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index b9619cca..f7ccdc31 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -292,9 +292,8 @@ def propagate_labels_majority(image,labels): with the largest overlap.""" rlabels,_ = label(image) cors = correspondences(rlabels,labels) - amax_rlabels = amax(rlabels) + 1 - outputs = zeros(amax_rlabels,'i') - counts = zeros(amax_rlabels,'i') + outputs = zeros(amax(rlabels)+1,'i') + counts = zeros(amax(rlabels)+1,'i') for rlabel, label_, count in cors.T: if not rlabel or not label_: # ignore background correspondences @@ -348,13 +347,12 @@ def all_neighbors(image, dist=1, bg=NaN): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 - assert amax(image) < q - assert amin(image) >= 0 - q_image = q * image - u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg)) - d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg)) - l = unique(q_image + shift(image, (0, dist), order=0, cval=bg)) - r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg)) + assert amax(image)=0 + u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg)) + d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg)) + l = unique(q*image+shift(image, (0, dist), order=0, cval=bg)) + r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py b/ocrd_cis/ocropy/ocrolib/toplevel.py index 72e397af..87ed18c5 100644 --- a/ocrd_cis/ocropy/ocrolib/toplevel.py +++ b/ocrd_cis/ocropy/ocrolib/toplevel.py @@ -125,10 +125,14 @@ def __init__(self,*args,**kw): self.fun = kw.get("fun","?") self.var = kw.get("var","?") 
self.description = " ".join([strc(x) for x in args]) - def __str__(self): - return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}" - + result = "\nCheckError for argument " + result += str(self.var) + result += " of function " + result += str(self.fun) + result += "\n" + result += self.description + return result class CheckWarning(CheckError): def __init__(self,*args,**kw): @@ -138,8 +142,14 @@ def __init__(self,*args,**kw): CheckError.__init__(self, *args, **kw) def __str__(self): - return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} " - f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n") + result = "\nCheckWarning for argument " + result += str(self.var) + result += " of function " + result += str(self.fun) + result += "\n" + result += self.description + result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n" + return result def checktype(value,type_): """Check value against the type spec. 
If everything @@ -201,7 +211,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical(f"unknown exception while checking function: '{name}'") + LOG.critical("unknown exception while checking function: '%s'", name) raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -215,9 +225,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) else: - raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) return wrapper return decorator From 4420c6fa246c81f1fc7c14e7a1cb6dc1d2460e5f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:06:41 +0200 Subject: [PATCH 156/194] revert: ocropy.common changes --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index a5806517..c23e89b9 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,19 +184,16 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] - if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - e_20_0 = e * 20.0 - e_50 = int(e * 50) - v = est - filters.gaussian_filter(est, e_20_0) - v = filters.gaussian_filter(v ** 2, e_20_0) ** 0.5 + v = est - filters.gaussian_filter(est, e*20.0) + v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) - v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) + v = 
morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -313,24 +310,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)200/zoom: return f"image too tall for a text line {binary.shape}" + if h<20/zoom: return "image not tall enough for a text line %s"%(binary.shape,) + if h>200/zoom: return "image too tall for a text line %s"%(binary.shape,) ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" - if w>4000/zoom: return f"image too long for a line image {binary.shape}" + if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) + if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps={lo})" - ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" - if ncomps>hi*ratio and ncomps>10: return f"too many connected components (got {ncomps}, wanted <={hi})" + if ncomps=%d)"%(ncomps,lo) + ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) return None # inspired by ocropus-gpageseg check_page @@ -344,21 +341,21 @@ def check_region(binary, 
zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)5000/zoom: return f"image too tall for a region image {binary.shape}" - if w<100/zoom: return f"image too narrow for a region image {binary.shape}" - if w>5000/zoom: return f"image too wide for a region image {binary.shape}" + if h<45/zoom: return "image not tall enough for a region image %s"%(binary.shape,) + if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) + if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) + if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return f"too few connected components for a region image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" + if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) return None # from ocropus-gpageseg, but with zoom parameter @@ -372,21 +369,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)20000/zoom: return f"image too tall for a page image {binary.shape}" - if w<600/zoom: return f"image too narrow for a page image {binary.shape}" - if w>20000/zoom: return f"image too wide for a page image {binary.shape}" + if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return f"too few connected components for a page image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" + if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) return None def odd(num): @@ -479,13 +476,8 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - binary_0_6 = 0.6 * binary - odd_scale = odd(scale) - odd_half_scale = odd(scale / 2) - odd_doubled_scale = odd(2 * scale) - region_min = (4 * scale) ** 2 - images = morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) - DSAVE('images1_large', images + binary_0_6) + 
images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) + DSAVE('images1_large', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -494,31 +486,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd_half_scale)) - DSAVE('images2_h-opened', h_opened + binary_0_6) - v_opened = morph.rb_opening(images, (odd_half_scale, 1)) - DSAVE('images2_v-opened', v_opened + binary_0_6) + h_opened = morph.rb_opening(images, (1, odd(scale/2))) + DSAVE('images2_h-opened', h_opened+0.6*binary) + v_opened = morph.rb_opening(images, (odd(scale/2), 1)) + DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) - DSAVE('images3_closed', closed + binary_0_6) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) + DSAVE('images3_closed', closed + 0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images + binary_0_6) + DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) - DSAVE('images5_selected', images + binary_0_6) + images = morph.select_regions(images, sl.area, min=(4*scale)**2, nbest=maximages) + DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = 
morph.r_dilation(images, (odd_scale, odd_scale)) + dilated = morph.r_dilation(images, (odd(scale), odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images + binary_0_6) + DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... return images @@ -556,7 +548,6 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] - doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -608,8 +599,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > doubled_scale)[binlabels] - smallmask = (binlabelcounts <= doubled_scale)[binlabels] + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1852,13 +1843,11 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - log_y = -10 * np.log(y + 1e-9) - log_x = -10 * np.log(x + 1e-9) - for i in range(int(scale / 2)): - llab[box[0], box[1].start + i] = log_y - llab[box[0], box[1].stop - 1 - i] = log_y - llab[box[0].start + i, box[1]] = log_x - llab[box[0].stop - 1 - i, box[1]] = log_x + for i in range(int(scale/2)): + llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) + llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) + llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) + llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1888,19 +1877,19 @@ def find_topological(): weights = weights * (1 + 0.5 * 
props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - orientation = 'horizontal' if is_horizontal else 'vertical' - LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') + LOG.debug(' {} gaps {} {} weights {}'.format( + 'horizontal' if is_horizontal else 'vertical', + gaps, props, weights)) if not gaps.shape[0]: continue - half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1927,27 +1916,32 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # suppress cuts that significantly split any line labels - min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > 
min_line_scale)[0], - assume_unique=True))) for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], - assume_unique=True))) for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') + if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) # suppress less prominent peaks (another heuristic...) 
# they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1955,30 +1949,33 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? - y_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + - morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + - morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') + y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:gap,:]>0)[0]) + + morph.find_objects(morph.label( + partitions[gap:,:]>0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:,:gap]>0)[0]) + + morph.find_objects(morph.label( + partitions[:,gap:]>0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = 
x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) else: y_partitionscores = None x_partitionscores = None @@ -1989,7 +1986,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2055,7 +2052,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug(f'next partition %d on %s', label, box) + LOG.debug('next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2063,9 +2060,10 @@ def find_topological(): # no gaps left finalize() return - orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') + LOG.debug('cutting %s on %s into %s', 'vertically' + if choose_vertical else 'horizontally', + box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2080,7 +2078,9 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug(f'next {orientation} block on {box} is {sub}') + LOG.debug('next %s block on %s is %s', 'horizontal' + if choose_vertical else 'vertical', + box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) 
if isinstance(mask, np.ndarray) else None) From 2d8650ed51f5e9cc627d95ae5aea217b9f7bacb6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:15:50 +0200 Subject: [PATCH 157/194] remove whitespaces in ocropy.common and ocropy.ocrolib --- ocrd_cis/ocropy/common.py | 18 +++++++++--------- ocrd_cis/ocropy/ocrolib/morph.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c23e89b9..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -189,8 +189,8 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e*20.0) - v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 + v = est - filters.gaussian_filter(est, e * 20.0) + v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) @@ -491,8 +491,8 @@ def compute_images(binary, scale, maximages=5): v_opened = morph.rb_opening(images, (odd(scale/2), 1)) DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) - DSAVE('images3_closed', closed + 0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) + DSAVE('images3_closed', closed+0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object @@ -501,12 +501,12 @@ def compute_images(binary, scale, maximages=5): images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, 
min=(4*scale)**2, nbest=maximages) + images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd(scale), odd(scale))) + dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) @@ -1969,7 +1969,7 @@ def find_topological(): partitions[:,gap:]>0)[0]))) for gap in x_gaps] if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) @@ -2062,7 +2062,7 @@ def find_topological(): return # otherwise: cut on gaps LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', + if choose_vertical else 'horizontally', box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: @@ -2079,7 +2079,7 @@ def find_topological(): else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', + if choose_vertical else 'vertical', box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index f7ccdc31..7d6ffc85 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -349,10 +349,10 @@ def all_neighbors(image, dist=1, bg=NaN): q = 100000 assert amax(image)=0 - u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg)) 
- d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg)) - l = unique(q*image+shift(image, (0, dist), order=0, cval=bg)) - r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg)) + u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) + d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) + l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) + r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] From 9a153b079a3684bf875b306ba8eaad9e1637eeed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:01:36 +0200 Subject: [PATCH 158/194] postcorrect: adapt to frozendict Processor.parameter in v3 --- ocrd_cis/__init__.py | 1 - ocrd_cis/align/cli.py | 1 - ocrd_cis/ocrd_tool.py | 6 ---- ocrd_cis/ocropy/binarize.py | 6 +--- ocrd_cis/ocropy/clip.py | 9 +----- ocrd_cis/ocropy/denoise.py | 8 +----- ocrd_cis/ocropy/deskew.py | 8 +----- ocrd_cis/ocropy/dewarp.py | 4 --- ocrd_cis/ocropy/recognize.py | 7 ++--- ocrd_cis/ocropy/resegment.py | 9 +----- ocrd_cis/ocropy/segment.py | 47 +++++++++++++++--------------- ocrd_cis/ocropy/train.py | 7 +---- ocrd_cis/postcorrect/cli.py | 55 +++++++++++++++++++++--------------- 13 files changed, 63 insertions(+), 105 deletions(-) delete mode 100644 ocrd_cis/ocrd_tool.py diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py index 6f37f4f7..9d22fe3e 100644 --- a/ocrd_cis/__init__.py +++ b/ocrd_cis/__init__.py @@ -1,3 +1,2 @@ from .javaprocess import JavaAligner from .javaprocess import JavaPostCorrector -from .ocrd_tool import get_ocrd_tool diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index f5e47785..5706461e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -11,7 +11,6 @@ from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import getLogger from ocrd_utils import getLevelName from 
ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py deleted file mode 100644 index 36cb9d7e..00000000 --- a/ocrd_cis/ocrd_tool.py +++ /dev/null @@ -1,6 +0,0 @@ -import json -from ocrd_utils import resource_string - - -def get_ocrd_tool(): - return json.loads(resource_string(__name__, 'ocrd-tool.json')) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 35b28c5a..9a55301d 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -8,8 +8,7 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from . import common from .common import array2pil, determine_zoom, pil2array, remove_noise @@ -51,14 +50,11 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0. 
return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b81c731c..18a0c115 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,13 +8,11 @@ from shapely.prepared import prep from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( bbox_from_polygon, coordinates_of_segment, crop_image, - getLogger, image_from_polygon, polygon_from_points, polygon_mask, @@ -25,15 +23,10 @@ class OcropyClip(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-clip' - def setup(self): - self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. 
diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index eb3e7d23..eaed74df 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -4,21 +4,15 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-denoise' - def setup(self): - self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 7bdbba2d..b02c69d5 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -4,8 +4,7 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from . import common from .common import pil2array @@ -16,15 +15,10 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-deskew' - def setup(self): - self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17d0b4ce..e33ce024 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -5,7 +5,6 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest @@ -54,14 +53,11 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): - self.logger = getLogger('processor.OcropyDewarp') # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 02d29e7c..85a76585 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,10 +10,9 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_utils import coordinates_for_segment, points_from_polygon, polygon_from_bbox from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -67,7 +66,6 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): - logger: Logger network: Any pad: int @@ -76,7 +74,6 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index c1809569..0fb133c0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ 
b/ocrd_cis/ocropy/resegment.py @@ -9,7 +9,6 @@ from shapely.prepared import prep from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -17,8 +16,7 @@ transform_coordinates, ) from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -43,15 +41,10 @@ ) class OcropyResegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-resegment' - def setup(self): - self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b363cbd2..493deb30 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -16,7 +16,6 @@ from shapely import set_precision from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -243,21 +242,17 @@ def getx(xy): class OcropySegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-segment' - def setup(self): - self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + + \b Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. 
@@ -270,12 +265,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + + \b Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -284,25 +280,26 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. 
- + + \b Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -314,7 +311,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -495,13 +492,13 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non Given a PageType, TableRegionType or TextRegionType ``element``, and a corresponding binarized PIL.Image object ``image`` with coordinate metadata ``coords``, run line segmentation with Ocropy. - + If operating on the full page (or table), then also detect horizontal and vertical separators, and aggregate the lines into text regions afterwards. - + Add the resulting sub-segments to the parent ``element``. - + If ``ignore`` is not empty, then first suppress all foreground components in any of those segments' coordinates during segmentation, and if also in full page/table mode, then combine all separators among them with the @@ -773,7 +770,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) """ childp = Polygon(polygon) @@ -986,7 +983,7 @@ def join_baselines(logger: Logger, baselines, loc=''): def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. 
- + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. @@ -1006,10 +1003,10 @@ def page_get_reading_order(ro, rogroup): def page_add_to_reading_order(rogroup, region_id, index=None): """Add a region reference to an un/ordered RO group. - + Given a ReadingOrder group ``rogroup`` (of any type), append a reference to region ``region_id`` to it. - + If ``index`` is given, use that as position and return incremented by one. (This must be an integer if ``rogroup`` is an OrderedGroup(Indexed). @@ -1025,16 +1022,16 @@ def page_add_to_reading_order(rogroup, region_id, index=None): def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. - + Given a ReadingOrder element ``roelem`` (of any type), first look up its parent group. Remove it from the respective member list (of its region refs or un/ordered groups), even if it already was an OrderedGroup(Indexed). - + Then instantiate an empty OrderedGroup(Indexed), referencing the same region as ``roelem`` (and using the same index, if any). Add that group to the parent instead. - + Return the new group object. 
""" if not roelem: diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 6c627231..78302f12 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -7,9 +7,7 @@ from os.path import abspath, dirname, exists, join, isfile from ocrd_models import OcrdPage -from ocrd import Processor, Workspace -from ocrd.processor import OcrdPageResult -from ocrd_utils import getLogger +from ocrd import Processor, Workspace, OcrdPageResult from .ocropus_rtrain import * from .binarize import binarize @@ -30,9 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - logger: Logger modelpath: str - old_cwd: str outputpath: str @property @@ -40,7 +36,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.logger = getLogger('processor.OcropyTrain') if 'model' in self.parameter: model = self.parameter['model'] try: diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 71fbaad1..6759b96a 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -4,10 +4,9 @@ import click import json -from ocrd import Processor +from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import getLogger, getLevelName -from ocrd_models.ocrd_mets import OcrdMets +from ocrd_utils import getLevelName, pushd_popd from ocrd_cis import JavaPostCorrector @@ -21,26 +20,38 @@ class PostCorrector(Processor): def executable(self): return 'ocrd-cis-postcorrect' - def process(self): + def setup(self): + # since ocrd v3.0 we cannot overwrite self.parameter anymore + # because that gets validated against the schema + # (so these additions would fail) + self.params = dict(self.parameter) profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True - self.parameter["profiler"] = profiler - self.parameter["runDM"] = True - 
self.logger.debug(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(self.workspace.mets_target, - self.input_file_grp, - self.output_file_grp, - self.parameter, - getLevelName(self.logger.getEffectiveLevel())) - p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() - # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): - flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') - flocat.attrib['LOCTYPE'] = 'OTHER' - flocat.attrib['OTHERLOCTYPE'] = 'FILE' - output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + self.params["profiler"] = profiler + self.params["runDM"] = True + self.logger.debug(json.dumps(self.params, indent=4)) + + def process_workspace(self, workspace: Workspace): + with pushd_popd(workspace.directory): + self.workspace = workspace + self.verify() + # this CLI call mimics the OCR-D processor CLI itself + # we have no control over its interior + # (we get no page-wise error handling and input downloading) + p = JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.params, + getLevelName(self.logger.getEffectiveLevel())) + p.exe() + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() + # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): + for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) From bd0613a20fd4d7d88a466cc75f3e94be656f08bf Mon Sep 17 
00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Aug 2024 11:36:53 +0200 Subject: [PATCH 159/194] require ocrd>=3.0.0b1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38f09abd..83cf28bb 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0a1', + 'ocrd>=3.0.0b1', 'click', 'scipy', 'numpy>=1.17.0', From f6e437fc8d5ef7bbb51fa7b4f5d590a11c6fc627 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 14:46:41 +0200 Subject: [PATCH 160/194] add: simple github actions workflow --- .github/workflow/tests.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml new file mode 100644 index 00000000..424409df --- /dev/null +++ b/.github/workflow/tests.yml @@ -0,0 +1,27 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test From 403781a3c27e5fdb0cddcf311401dad1a24f83f8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:14 +0200 Subject: [PATCH 161/194] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 424409df..24fa0bc7 100644 --- 
a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -2,9 +2,8 @@ name: Test ocrd_cis installation and run tests on: push: - branches: [ "master" ] pull_request: - branches: [ "master" ] + workflow_dispatch: jobs: build: From 97083bb71e724276385058bde9244cbdd21dce64 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:25 +0200 Subject: [PATCH 162/194] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 24fa0bc7..559297dd 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -20,7 +20,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' - name: Install ocrd_cis run: make install - name: Test ocrd_cis - run: make test + run: make test V="" From 2b20e0c44da924a5b15379d86eb557acdf42b1f3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:49:11 +0200 Subject: [PATCH 163/194] fix: checkout ref --- .github/workflow/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 559297dd..f95a09a4 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -15,7 +15,10 @@ jobs: os: [ "ubuntu-22.04" ] steps: - - uses: actions/checkout@v4 + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 86a08eb5cc471eef536bc2d050e80f768a728e43 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:08:48 +0200 Subject: [PATCH 164/194] Create GH Actions workflow: test.yml --- .github/workflows/test.yml | 33 
+++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..f95a09a4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,33 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test V="" From 1d7e9a0d5f72e66c92c07e15508ba330e130f6bb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:18:40 +0200 Subject: [PATCH 165/194] delete: wrong path for workflows --- .github/workflow/tests.yml | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml deleted file mode 100644 index f95a09a4..00000000 --- a/.github/workflow/tests.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Test ocrd_cis installation and run tests - -on: - push: - pull_request: - workflow_dispatch: - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] - os: [ "ubuntu-22.04" ] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ 
matrix.python-version }} - - uses: actions/setup-java@v4 - with: - distribution: 'zulu' - java-version: '11' - - name: Install ocrd_cis - run: make install - - name: Test ocrd_cis - run: make test V="" From 224e86f5467c7506882792fa03397cbe032f69c9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:27:55 +0200 Subject: [PATCH 166/194] fix: NaN error for python3.9+ --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..1ebfb204 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -343,7 +343,7 @@ def select_regions(binary,f,min=0,nbest=100000): return keep[labels] @checks(SEGMENTATION) -def all_neighbors(image, dist=1, bg=NaN): +def all_neighbors(image, dist=1, bg=float('nan')): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 From a397531e549532675341c15b6c4a6fbef1f96818 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:29:37 +0200 Subject: [PATCH 167/194] fix: NaN in reading_order in morph.py --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 1ebfb204..4b626e83 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -429,7 +429,7 @@ def reading_order(seg,rl=False,bt=False): segmap[1:] = 1 return segmap def pos(f,l): - return array([f(x) if x else nan for x in l]) + return array([f(x) if x else float('nan') for x in l]) ys = pos(sl.ycenter,objects) yorder = argsort(ys)[::-1 if bt else 1] groups = [[yorder[0]]] From 9cf83051b2f1875b0757eb1d81ff0a29b7f63047 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:18:36 +0200 Subject: [PATCH 168/194] fix type hints --- 
ocrd_cis/align/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 5706461e..395f7b07 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -229,8 +229,8 @@ class Alignment: file_grp: str pcgts: OcrdPage region: TextRegionType - alignment: Alignment - def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + alignment: dict + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: dict): self.file_grp = file_grp self.pcgts = pcgts self.region = region From a0c734dd3e357606bde1c121cd4e25c972087df6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:19:29 +0200 Subject: [PATCH 169/194] dewarp: make thread-safe --- ocrd_cis/ocropy/dewarp.py | 25 ++++++++++++------------- ocrd_cis/ocropy/ocrolib/lineest.py | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e33ce024..a0d0ea5c 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -57,17 +57,6 @@ class OcropyDewarp(Processor): def executable(self): return 'ocrd-cis-ocropy-dewarp' - def setup(self): - # defaults from ocrolib.lineest: - self.lnorm = lineest.CenterNormalizer( - params=(self.parameter['range'], - self.parameter['smoothness'], - # let's not expose this for now - # (otherwise we must explain mutual - # dependency between smoothness - # and extra params) - 0.3)) - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. 
@@ -94,6 +83,16 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + # defaults from ocrolib.lineest: + lnorm = lineest.CenterNormalizer( + params=(self.parameter['range'], + self.parameter['smoothness'], + # let's not expose this for now + # (otherwise we must explain mutual + # dependency between smoothness + # and extra params) + 0.3)) + regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') @@ -107,8 +106,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( - line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) - except InvalidLine as err: + line_image, lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) + except (InvalidLine, AssertionError) as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 42ef2237..392c7e4a 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -75,7 +75,7 @@ def measure(self,line): plt.plot(self.center) plt.ginput(1,1000) def dewarp(self,img,cval=0,dtype=np.dtype('f')): - assert img.shape==self.shape + assert img.shape==self.shape, f"input shape {img.shape} deviates from measured shape {self.shape}" h,w = img.shape # The actual image img is embedded into a larger image by # adding vertical space on top and at the bottom (padding) From 66baaf07f60532185a41ea606c31964ee046c8ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 
11:21:19 +0200 Subject: [PATCH 170/194] recognize: disallow multithreading (impossible with current lstm implementation) --- ocrd_cis/ocropy/recognize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 85a76585..97bec8a7 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -45,7 +45,7 @@ def recognize(image, pad, network, check=True): pred = network.predictString(line) # getting confidence - result = lstm.translate_back(network.outputs, pos=1) + result = lstm.translate_back(network.outputs, pos=1) # raw positions scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] @@ -68,6 +68,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): network: Any pad: int + # lstm is not thread-safe (.outputs, .last_n as side effects etc) + max_workers = 1 @property def executable(self): @@ -191,7 +193,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'Error processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {str(err) or err.__class__.__name__}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) From 32ce6560d9c1e10fdfd00055e567b0fe13187404 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:22:14 +0200 Subject: [PATCH 171/194] postcorrect: make work under METS Server --- ocrd_cis/postcorrect/cli.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 6759b96a..70918de7 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,12 +1,14 @@ from __future__ import absolute_import import os +import json 
import click -import json from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLevelName, pushd_popd +from ocrd_models import OcrdMets + from ocrd_cis import JavaPostCorrector @@ -37,6 +39,8 @@ def process_workspace(self, workspace: Workspace): with pushd_popd(workspace.directory): self.workspace = workspace self.verify() + # ensure that input files are referenced in on-disk METS + self.workspace.save_mets() # this CLI call mimics the OCR-D processor CLI itself # we have no control over its interior # (we get no page-wise error handling and input downloading) @@ -46,12 +50,23 @@ def process_workspace(self, workspace: Workspace): self.params, getLevelName(self.logger.getEffectiveLevel())) p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # We cannot do that with this method, because our self.workspace.mets might be + # a ClientSideOcrdMets, which does not allow modifying or removing files: + # for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + # flocat.attrib['LOCTYPE'] = 'OTHER' + # flocat.attrib['OTHERLOCTYPE'] = 'FILE' + # output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + # So instead, let's post-process the local METS file result directly: + mets = OcrdMets(filename=self.workspace.mets_target) + for output_file in mets.find_files(fileGrp=self.output_file_grp): flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') flocat.attrib['LOCTYPE'] = 'OTHER' flocat.attrib['OTHERLOCTYPE'] = 'FILE' output_file.local_filename = os.path.relpath(output_file.local_filename, 
self.workspace.directory) + with open(self.workspace.mets_target, 'w') as f: + f.write(mets.to_xml(xmllint=True).decode('utf-8')) + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() From c4a5999d905d23a8e347eed2b257363c0c2545af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:24:41 +0200 Subject: [PATCH 172/194] tests: use METS Server if OCRD_MAX_PARALLEL_PAGES>1 --- tests/run_add_zip_test.bash | 5 +-- tests/run_alignment_test.bash | 5 +-- tests/run_image_preprocessing_test.bash | 15 +++++---- tests/run_ocr_test.bash | 7 ++-- tests/run_postcorrection_test.bash | 19 +++++------ tests/run_training_test.bash | 7 ++-- tests/test_lib.bash | 43 ++++++++++++++++++++----- 7 files changed, 68 insertions(+), 33 deletions(-) diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash index 02de2db2..e2d44983 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-GT-SEG-LINE); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,9 +16,10 @@ popd # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-IMG); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-IMG); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index e8a3c79a..7a82254b 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,7 
+6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -17,9 +17,10 @@ ocrd_cis_align pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-ALIGN); do [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash index f80fc636..7a66a57b 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,16 +7,17 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP -ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN -ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES -ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW -ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize 
${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise ${ARGS[*]} -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew ${ARGS[*]} -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp ${ARGS[*]} -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment ${ARGS[*]} -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index b10f6f6d..f737ae43 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,8 +16,9 @@ done ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-recognize -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ -P textequiv_level word -P model fraktur.pyrnn.gz popd diff --git a/tests/run_postcorrection_test.bash b/tests/run_postcorrection_test.bash index d7f34ace..859c8407 100644 --- a/tests/run_postcorrection_test.bash +++ b/tests/run_postcorrection_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" 
]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,25 +15,26 @@ popd ocrd_cis_align -mkdir "$tmpdir/bin" -cat > "$tmpdir/bin/profiler.bash" < "bin/profiler.bash" < /dev/null echo '{}' EOF -chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect -l DEBUG \ +chmod a+x "bin/profiler.bash" + +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-postcorrect ${ARGS[*]} \ -I OCR-D-CIS-ALIGN \ -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - -P profilerPath $tmpdir/bin/profiler.bash \ + -P profilerPath bin/profiler.bash \ -P profilerConfig ignored \ -P model "$(ocrd-cis-data -model)" \ -P nOCR 2 -pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-POSTCORRECT); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-POSTCORRECT); do [[ -f "$file" ]] || fail "$file: not a file" found_files=$((found_files + 1)) done diff --git a/tests/run_training_test.bash b/tests/run_training_test.bash index ade1b68e..5b96dc3e 100644 --- a/tests/run_training_test.bash +++ b/tests/run_training_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,9 +15,12 @@ popd ocrd_cis_align +stopserver +OCRD_MAX_PARALLEL_PAGES=1 + # fix ocr for some entries (otherwise the training will fail) pushd $tmpws -for f in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for f in $(ocrd ${OCRD_LOG_ARGS[*]} workspace find -G OCR-D-CIS-ALIGN); do sed -i -e 's#e.#Säugethiere.#' $f sed -i -e 's#E#Säugethieren#' $f done diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 801be01a..76111d25 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ 
-1,10 +1,27 @@ #/bin/bash tmpdir=$(mktemp -d) -trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR -trap "rm -rf $tmpdir" EXIT +function stopserver() { + : +} +function failexit() { + stopserver +} +function cleanexit() { + stopserver + rm -rf $tmpdir +} +trap "trap failexit EXIT" ERR +trap cleanexit EXIT + +OCRD_LOG_ARGS=() +if test -v OCRD_OVERRIDE_LOGLEVEL; then + OCRD_LOG_ARGS+=(-l $OCRD_OVERRIDE_LOGLEVEL) +fi +OCRD_WS_ARGS=() # -m mets.xml OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" + data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" @@ -16,22 +33,32 @@ function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" + if ((${OCRD_MAX_PARALLEL_PAGES:-0} > 1)); then + echo starting METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server start & + OCRD_WS_ARGS+=(-U "$tmpws/mets.sock") + sleep 1 + function stopserver() { + echo stopping METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server stop || true + } + fi } + function ocrd_cis_align() { # download ocr models ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr pushd $tmpws - ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ + ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) + ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ -P textequiv_level word -P model fraktur.pyrnn.gz - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ -P textequiv_level word -P model fraktur-jze.pyrnn.gz - 
ocrd-cis-align -l DEBUG -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ + ocrd-cis-align ${ARGS[*]} -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ -O OCR-D-CIS-ALIGN popd } From ae7dc671ab50104c0cf3f4dec6bf28fc3c1990ed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:25:35 +0200 Subject: [PATCH 173/194] make test: run serially and parallel, show times --- Makefile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a040cf9d..d1991df0 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,17 @@ docker-push: docker-build TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + +test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG +test: export OCRD_MISSING_OUTPUT=ABORT +test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo $^ + @echo =====single-threaded test results===== + @cat test_serially.log + @echo =====4-page-parallel test results===== + @cat test_parallel.log + @$(RM) test_serially.log test_parallel.log + .PHONY: install install-devel uninstall test docker-build docker-push From e540b108e0c7f14c1cfcf8579dd0722a41069ead Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 11:48:43 +0200 Subject: [PATCH 174/194] require ocrd>=3.0.0b4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 83cf28bb..e8ea1cf3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0b1', + 'ocrd>=3.0.0b4', 'click', 'scipy', 'numpy>=1.17.0', From fe122ae4ac21e87e684af8c6b9aa02026bf0748c Mon Sep 17 
00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Sep 2024 01:28:50 +0000 Subject: [PATCH 175/194] segment: adapt to numpy deprecation --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..d78198e5 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -643,7 +643,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, From 99b348915bcf0c1d3ea0028ca43ac2448a0ee922 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Sep 2024 01:28:50 +0000 Subject: [PATCH 176/194] segment: adapt to numpy deprecation --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..bae4dac0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -644,7 +644,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, From 56eaca7116dd5b21a2ebd456cd1b0237b8c09dc3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 10 Oct 2024 19:09:41 +0200 Subject: [PATCH 177/194] fix: levenshtein import --- ocrd_cis/align/cli.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..7747622e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -2,7 +2,7 @@ import click import json import os -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd import Processor from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor From ca08c1af462769df84cf5e83aadf118da0d96865 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Oct 2024 11:12:20 +0200 Subject: [PATCH 178/194] eval/stats: Levenshtein -> rapidfuzz.distance.Levenshtein --- ocrd_cis/div/eval.py | 2 +- ocrd_cis/div/stats.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words: From dee1abf5c1cfcf3b8e111f4b3f8614e0f6fea214 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Oct 2024 11:12:20 +0200 Subject: [PATCH 179/194] eval/stats: Levenshtein -> rapidfuzz.distance.Levenshtein --- ocrd_cis/div/eval.py | 2 +- ocrd_cis/div/stats.py | 4 ++-- 2 files changed, 
3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words: From 817230b626a9d1c6d84fd868a05e77b4fa487005 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Feb 2025 13:36:52 +0100 Subject: [PATCH 180/194] require ocrd v3 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e8ea1cf3..25dce03e 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0b4', + 'ocrd>=3.0.2', 'click', 'scipy', 'numpy>=1.17.0', From ec348fcf78a81506064e9d3fd1c83325d33a043c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Feb 2025 13:37:02 +0100 Subject: [PATCH 181/194] relax max_workers (now multiprocessing instead of multithreading) --- ocrd_cis/ocropy/recognize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 97bec8a7..55d91cc5 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -68,8 +68,6 
@@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): network: Any pad: int - # lstm is not thread-safe (.outputs, .last_n as side effects etc) - max_workers = 1 @property def executable(self): From c022bba6ee4f0dc322cf2a1ac1f09ad7ee8490ab Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Feb 2025 13:54:02 +0100 Subject: [PATCH 182/194] ocrd-tool.json: add dockerhub=ocrd/cis --- ocrd_cis/ocrd-tool.json | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index c2e20268..378d73ac 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -1,6 +1,7 @@ { "git_url": "https://github.com/cisocrgroup/ocrd_cis", "version": "0.1.5", + "dockerhub": "ocrd/cis", "tools": { "ocrd-cis-ocropy-binarize": { "executable": "ocrd-cis-ocropy-binarize", From ed8082c539c294385254f9455c4165c9b2a5c458 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 4 Mar 2025 07:32:18 +0100 Subject: [PATCH 183/194] make test: better stats --- .github/workflows/test.yml | 2 +- Makefile | 14 +++++++++++--- ...ssing_test.bash => run_preprocessing_test.bash} | 0 3 files changed, 12 insertions(+), 4 deletions(-) rename tests/{run_image_preprocessing_test.bash => run_preprocessing_test.bash} (100%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f95a09a4..c50810f0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,4 +30,4 @@ jobs: - name: Install ocrd_cis run: make install - name: Test ocrd_cis - run: make test V="" + run: make test V= diff --git a/Makefile b/Makefile index d1991df0..3583e0d0 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis TAG = flobar/ocrd_cis +SHELL = bash install: ${PIP} install --upgrade pip . 
@@ -20,18 +21,25 @@ docker-push: docker-build docker push $(TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) +INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX +indent = `WHAT=$1; WITH=$(INDENT); echo $$WHAT$${WITH:$${\#WHAT}}` +format_tr = "$(call indent,$1):\t%U\t%S\t%E\t%P\t(%Mk)" +format_th = "$(call indent)\tuser\tsystem\telapsed\tCPU\tmaxRSS" + .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V - OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f $(call format_tr,$(@F)) bash -x $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f $(call format_tr,$(@F)) bash -x $@ $V test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG test: export OCRD_MISSING_OUTPUT=ABORT test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo =====single-threaded test results===== + @echo =====single-processing test results===== + @echo -e $(call format_th) @cat test_serially.log @echo =====4-page-parallel test results===== + @echo -e $(call format_th) @cat test_parallel.log @$(RM) test_serially.log test_parallel.log diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_preprocessing_test.bash similarity index 100% rename from tests/run_image_preprocessing_test.bash rename to tests/run_preprocessing_test.bash From 2854820d0abf07b89707a97848cfff65b0817b8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 14:42:31 +0100 Subject: [PATCH 184/194] Docker: shortcut ocrd-all-tool.json via ocrd-tool.json --- .circleci/config.yml | 43 ------------------------------ .github/workflows/docker.yml | 51 ++++++++++++++++++++++++++++++++++++ Dockerfile 
| 30 ++++++++++++++------- Makefile | 19 +++++++++----- 4 files changed, 84 insertions(+), 59 deletions(-) delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/docker.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 5825a4e0..00000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,43 +0,0 @@ -version: 2.1 -jobs: - test-python3: - docker: - - image: ocrd/core - resource_class: large - environment: - PIP: pip3 - PYTHON: python3 - steps: - - checkout - - run: apt-get update && apt-get -y install default-jre-headless - - run: make install - - run: make -j test V="" - - deploy-docker: - docker: - - image: circleci/buildpack-deps:stretch - environment: - DOCKER_TAG: ocrd/cis - steps: - - checkout - - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ - docker_layer_caching: true - - run: make docker-build TAG=$DOCKER_TAG - - run: - name: Login to Docker Hub - command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin - - run: docker push $DOCKER_TAG - -workflows: - version: 2 - build-and-test: - jobs: - - test-python3 - deploy: - jobs: - - deploy-docker: - filters: - branches: - only: - - master - - fix-alpha-shape diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000..a9f766d5 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,51 @@ +name: Docker Image CI + +on: + workflow_dispatch: + push: + branches: [ "master", "fix-alpha-shape" ] + +env: + REPO_NAME: ${{ github.repository }} + +jobs: + + build: + + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + # we need tags for docker version tagging + fetch-tags: true + fetch-depth: 0 + - # Activate cache export feature to reduce build time of images + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to GitHub Container 
Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERIO_USERNAME }} + password: ${{ secrets.DOCKERIO_PASSWORD }} + - name: define image name from repo name + run: echo "IMAGE_NAME=ghcr.io/${REPO_NAME,,}" >> $GITHUB_ENV + - name: Build the Docker image + # build both tags at the same time + run: make docker-build DOCKER_TAG="docker.io/ocrd/cis -t ${{ env.IMAGE_NAME }}" + - name: Test the Docker image + run: docker run --rm ${{ env.IMAGE_NAME }} ocrd-cis-ocropy-segment -h + - name: Push to Dockerhub + run: docker push docker.io/ocrd/cis + - name: Push to Github Container Registry + run: docker push ${{ env.IMAGE_NAME }} + diff --git a/Dockerfile b/Dockerfile index e7b2249a..cffb7475 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,22 @@ -FROM ocrd/core:v2.67.2 AS base +ARG DOCKER_BASE_IMAGE +FROM $DOCKER_BASE_IMAGE AS base ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://github.com/OCR-D/ocrd_cis/issues" \ + maintainer="https://github.com/cisocrgroup/ocrd_cis/issues" \ org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_cis" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.vcs-url="https://github.com/cisocrgroup/ocrd_cis" \ + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="ocrd_cis" \ + org.opencontainers.image.description="Ocropy OCR and CIS post-correction bindings" \ + org.opencontainers.image.source="https://github.com/cisocrgroup/ocrd_cis" \ + org.opencontainers.image.documentation="https://github.com/cisocrgroup/ocrd_cis/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + 
org.opencontainers.image.base.name=ocrd/core ENV GITURL="https://github.com/cisocrgroup" -ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" SHELL ["/bin/bash", "-c"] @@ -51,19 +59,23 @@ RUN apt-get update \ FROM base AS postcorrection # install ocrd_cis (python) -VOLUME ["/data"] +WORKDIR /build/ocrd_cis COPY --from=languagemodel /etc/profiler/languages /etc/profiler/languages COPY --from=profiler /apps/profiler /apps/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ -COPY . /build/ocrd_cis +COPY . . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd_cis/ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# install everything and reduce image size RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ - && pushd /build/ocrd_cis \ && make install \ # test always fail, resources not available for download. Resources should be made available # somewhere else, e.g. github.com/OCR-D/assets # && make test \ - && popd \ && rm -rf /build/ocrd_cis + +WORKDIR /data +VOLUME /data diff --git a/Makefile b/Makefile index 3583e0d0..1d3e9930 100644 --- a/Makefile +++ b/Makefile @@ -2,23 +2,28 @@ PY ?= python3 PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis -TAG = flobar/ocrd_cis +DOCKER_TAG = ocrd/cis +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 SHELL = bash install: - ${PIP} install --upgrade pip . -install-devel: - ${PIP} install --upgrade pip -e . + ${PIP} install . + +install-devel install-dev: + ${PIP} install -e . 
+ uninstall: ${PIP} uninstall ${PKG} docker-build: Dockerfile docker build \ + --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - -t $(TAG):latest . + -t $(DOCKER_TAG):latest . + docker-push: docker-build - docker push $(TAG):latest + docker push $(DOCKER_TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX @@ -43,4 +48,4 @@ test: $(TEST_SCRIPTS) @cat test_parallel.log @$(RM) test_serially.log test_parallel.log -.PHONY: install install-devel uninstall test docker-build docker-push +.PHONY: install install-dev install-devel uninstall test docker-build docker-push From 6ee159230f48341d75a2fe0a6eb8e5ab0f441c97 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 15:15:16 +0100 Subject: [PATCH 185/194] =?UTF-8?q?setup=E2=86=92pyproject?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++ pyproject.toml | 107 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 10 +++++ setup.py | 74 -------------------------------- 4 files changed, 120 insertions(+), 74 deletions(-) create mode 100644 pyproject.toml create mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index fb28879b..aca5a739 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ env-dir/* /venv* /build /dist +TAGS +*.log +download/ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..133bbf51 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,107 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] + +[project] +name = "ocrd_cis" +authors = [ + {name = "Florian Fink", email = "finkf@cis.lmu.de"}, + {name = "Tobias 
Englmeier", email = "englmeier@cis.lmu.de"}, + {name = "Christoph Weber", email = "web_chris@msn.com"}, + {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, +] +description = "CIS OCR-D post-correction tools and improved Ocropy1" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.8" +keywords = ["ocr", "ocr-d", "ocropus-ocr", "post-correction"] + +dynamic = ["version", "dependencies"] + +# https://pypi.org/classifiers/ +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Intended Audience :: Other Audience", + "License :: OSI Approved :: MIT Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing", +] + +[project.scripts] +ocrd-cis-align = "ocrd_cis.align.cli:ocrd_cis_align" +ocrd-cis-postcorrect = "ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect" +ocrd-cis-data = "ocrd_cis.data.__main__:main" +ocrd-cis-ocropy-train = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_train" +ocrd-cis-ocropy-recognize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_recognize" +ocrd-cis-ocropy-segment = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_segment" +ocrd-cis-ocropy-resegment = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_resegment" +ocrd-cis-ocropy-clip = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_clip" +ocrd-cis-ocropy-dewarp = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_dewarp" +ocrd-cis-ocropy-deskew = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_deskew" +ocrd-cis-ocropy-denoise = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_denoise" +ocrd-cis-ocropy-binarize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize" + +[project.urls] +Homepage = "https://github.com/cisocrgroup/ocrd_cis" +Repository = "https://github.com/cisocrgroup/ocrd_cis.git" + +[project.optional-dependencies] +debug = ["matplotlib>3.0.0"] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools] +packages = ["ocrd_cis", 
"ocrd_cis.postcorrect", "ocrd_cis.aio", "ocrd_cis.data", "ocrd_cis.wer", "ocrd_cis.ocropy", "ocrd_cis.ocropy.ocrolib", "ocrd_cis.div", "ocrd_cis.align"] +package-data = {"*" = ["*.json", "*.jar", "model.zip", "3gs.csv.gz"]} + +[tool.pytest.ini_options] +minversion = 6.0 +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] + + +[tool.mypy] +plugins = ["numpy.typing.mypy_plugin"] + +ignore_missing_imports = true + + +strict = true + +disallow_subclassing_any = false +# ❗ error: Class cannot subclass "Processor" (has type "Any") +disallow_any_generics = false +disallow_untyped_defs = false +disallow_untyped_calls = false + + +[tool.ruff.lint] +select = ["E", "F", "I"] + + +[tool.coverage.run] +branch = true +source = [ + "ocrd_cis" +] +concurrency = [ + "thread", + "multiprocessing" +] + +[tool.coverage.report] +exclude_also = [ + "if self\\.debug", + "pragma: no cover", + "raise NotImplementedError", + "if __name__ == .__main__.:", +] +ignore_errors = true +omit = [ + "ocrd_cis/*/cli" +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..a57112af --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +ocrd>=3.0.2 +click +scipy +numpy>=1.17.0 +pillow>=7.1.2 +shapely>=2.0.0 +scikit-image +networkx +opencv-python-headless +rapidfuzz diff --git a/setup.py b/setup.py deleted file mode 100644 index 25dce03e..00000000 --- a/setup.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Installs: - - ocrd-cis-align - - ocrd-cis-postcorrect - - ocrd-cis-data - - ocrd-cis-ocropy-clip - - ocrd-cis-ocropy-denoise - - ocrd-cis-ocropy-deskew - - ocrd-cis-ocropy-binarize - - ocrd-cis-ocropy-resegment - - ocrd-cis-ocropy-segment - - ocrd-cis-ocropy-dewarp - - ocrd-cis-ocropy-recognize - - ocrd-cis-ocropy-train -""" - -import codecs -import json -from setuptools import setup -from setuptools import find_packages - -with codecs.open('README.md', encoding='utf-8') as f: - README = f.read() - -with open('./ocrd-tool.json', 'r') as f: - 
version = json.load(f)['version'] - -setup( - name='ocrd_cis', - version=version, - description='CIS OCR-D command line tools', - long_description=README, - long_description_content_type='text/markdown', - author='Florian Fink, Tobias Englmeier, Christoph Weber', - author_email='finkf@cis.lmu.de, englmeier@cis.lmu.de, web_chris@msn.com', - url='https://github.com/cisocrgroup/ocrd_cis', - license='MIT', - packages=find_packages(), - include_package_data=True, - install_requires=[ - 'ocrd>=3.0.2', - 'click', - 'scipy', - 'numpy>=1.17.0', - 'pillow>=7.1.2', - 'shapely>=1.7.1', - 'scikit-image', - 'networkx', - 'opencv-python-headless', - 'rapidfuzz' - ], - extras_require={ - 'debug': ['matplotlib>3.0.0'], - }, - package_data={ - '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar', '*.zip'], - }, - entry_points={ - 'console_scripts': [ - 'ocrd-cis-align=ocrd_cis.align.cli:ocrd_cis_align', - 'ocrd-cis-postcorrect=ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect', - 'ocrd-cis-data=ocrd_cis.data.__main__:main', - 'ocrd-cis-ocropy-binarize=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize', - 'ocrd-cis-ocropy-clip=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_clip', - 'ocrd-cis-ocropy-denoise=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_denoise', - 'ocrd-cis-ocropy-deskew=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_deskew', - 'ocrd-cis-ocropy-dewarp=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_dewarp', - 'ocrd-cis-ocropy-recognize=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_recognize', - 'ocrd-cis-ocropy-resegment=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_resegment', - 'ocrd-cis-ocropy-segment=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_segment', - 'ocrd-cis-ocropy-train=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_train', - ] - }, -) From 89e046cd1ab55e007882d0ff3d68a7e2024267b2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 15:25:12 +0100 Subject: [PATCH 186/194] add 'build' and 'help' targets --- Makefile | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile 
index 1d3e9930..1f97072c 100644 --- a/Makefile +++ b/Makefile @@ -6,16 +6,36 @@ DOCKER_TAG = ocrd/cis DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 SHELL = bash +help: + @echo "" + @echo " Targets" + @echo "" + @echo " install Install ocrd_cis" + @echo " install-dev Install in editable mode" + @echo " build Build source and binary distribution" + @echo " docker Build Docker image" + @echo " test Run unit tests" + @echo "" + @echo " Variables" + @echo "" + @echo " DOCKER_TAG '$(DOCKER_TAG)'" + @echo " PY '$(PY)'" + @echo " PIP '$(PIP)'" + install: ${PIP} install . install-devel install-dev: ${PIP} install -e . +build: + ${PIP} install build + ${PY} -m build . + uninstall: ${PIP} uninstall ${PKG} -docker-build: Dockerfile +docker-build docker: Dockerfile docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ @@ -48,4 +68,4 @@ test: $(TEST_SCRIPTS) @cat test_parallel.log @$(RM) test_serially.log test_parallel.log -.PHONY: install install-dev install-devel uninstall test docker-build docker-push +.PHONY: install install-dev install-devel build uninstall test docker docker-build docker-push From f31917b70dd54b7bdb0d6dbbefac0e4125c48544 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 15:25:34 +0100 Subject: [PATCH 187/194] :package: 0.2.0 --- ocrd_cis/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 378d73ac..472ea5ab 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/cisocrgroup/ocrd_cis", - "version": "0.1.5", + "version": "0.2.0", "dockerhub": "ocrd/cis", "tools": { "ocrd-cis-ocropy-binarize": { From 61ed15a3c558e8038d55cc4beb5e4bed716ca020 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 17:11:10 +0100 Subject: [PATCH 188/194] add PyPI CD --- .github/workflows/pypi.yml | 29 +++++++++++++++++++++++++++++ 1 
file changed, 29 insertions(+) create mode 100644 .github/workflows/pypi.yml diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 00000000..1b239c0c --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,29 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: PyPI CD + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel build twine + pip install -r requirements.txt + - name: Build and publish + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: twine upload --verbose dist/ocrd*${{ github.ref_name }}*{tar.gz,whl} From 2cf3c85ced72df0de28573db9f2ea531beaf0a42 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 17:31:00 +0100 Subject: [PATCH 189/194] PyPI CD: strip 'v' prefix from git tag --- .github/workflows/pypi.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 1b239c0c..17860add 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -26,4 +26,6 @@ jobs: env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - run: twine upload --verbose dist/ocrd*${{ github.ref_name }}*{tar.gz,whl} + run: | + version=${{ github.ref_name }} + twine upload --verbose dist/ocrd*${version:1}*{tar.gz,whl} From a8210ed3592107072ec91d6a07d89fa211e7d1f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 20:27:48 +0100 Subject: [PATCH 190/194] PyPI CD: use whatever is in dist --- 
.github/workflows/pypi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 17860add..79309e74 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -27,5 +27,5 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | - version=${{ github.ref_name }} - twine upload --verbose dist/ocrd*${version:1}*{tar.gz,whl} + ls -l dist + twine upload --verbose dist/ocrd*{tar.gz,whl} From f4a41ce8be655a80bd36ab5b9a9d199e29a931f3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 20:30:53 +0100 Subject: [PATCH 191/194] PyPI CD: forgot the actual build! --- .github/workflows/pypi.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 79309e74..54c46713 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -27,5 +27,6 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | + python -m build . 
ls -l dist twine upload --verbose dist/ocrd*{tar.gz,whl} From 5cf22f5baa093ffaf0049e3c9756094116273598 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 22:10:08 +0100 Subject: [PATCH 192/194] fix license classifier for PyPI --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 133bbf51..6432dd27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Environment :: Console", "Intended Audience :: Science/Research", "Intended Audience :: Other Audience", - "License :: OSI Approved :: MIT Software License", + "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Topic :: Text Processing", From 027ebe3614da172b2ed7ebc1279e6d3088d9728a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 May 2025 00:26:01 +0200 Subject: [PATCH 193/194] docker: prepackage ocrd-all-module-dir.json --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index cffb7475..0fa98fb4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,8 +66,11 @@ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linu COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ COPY . . +COPY ocrd-tool.json . 
# prepackage ocrd-tool.json as ocrd-all-tool.json -RUN ocrd ocrd-tool ocrd_cis/ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# prepackage ocrd-all-module-dir.json +RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json # install everything and reduce image size RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ From 025409becd6dc01c42770f218f561507887d7ff5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 May 2025 00:25:57 +0200 Subject: [PATCH 194/194] docker: use latest core base stage --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 1f97072c..ac2edacc 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,9 @@ PY ?= python3 PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis -DOCKER_TAG = ocrd/cis -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 +DOCKER_TAG ?= ocrd/cis +DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest +DOCKER ?= docker SHELL = bash help: @@ -36,14 +37,14 @@ uninstall: ${PIP} uninstall ${PKG} docker-build docker: Dockerfile - docker build \ + $(DOCKER) build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG):latest . docker-push: docker-build - docker push $(DOCKER_TAG):latest + $(DOCKER) push $(DOCKER_TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX