From 12668a020888cd5914f353b82ba800120bb5657b Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Mon, 12 May 2025 11:05:28 +0200
Subject: [PATCH 1/8] Small comments and diagnosis.

---
 python/eflomal/eflomal.pyx |  4 ++++
 src/eflomal.c              | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx
index f759184..b2323fe 100644
--- a/python/eflomal/eflomal.pyx
+++ b/python/eflomal/eflomal.pyx
@@ -61,6 +61,9 @@ cpdef tuple read_text(pyfile, bool lowercase, int prefix_len, int suffix_len):
 cpdef write_text(pyfile, tuple sents, int voc_size):
     """Write a sequence of sentences in the format expected by eflomal
 
+    NOTE(token-limit): if a sentence has 1024 (0x400) or more tokens, an empty
+    sentence is written instead of that sentence.
+
     Arguments:
     pyfile -- Python file object to write to
     sents -- tuple of sentences, each encoded as np.ndarray(uint32)
@@ -74,6 +77,7 @@ cpdef write_text(pyfile, tuple sents, int voc_size):
     fprintf(f, '%d %d\n', len(sents), voc_size)
     for sent in sents:
         n = len(sent)
+        # NOTE(token-limit).
         if n < 0x400:
             i = 0
             fprintf(f, '%d', n)
diff --git a/src/eflomal.c b/src/eflomal.c
index bb504d3..1bfda6e 100644
--- a/src/eflomal.c
+++ b/src/eflomal.c
@@ -1136,6 +1136,20 @@ struct text* text_read(const char *filename) {
     return text;
 }
 
+void check_openmp() {
+    int n_threads = 0;
+#pragma omp parallel
+    {
+#pragma omp atomic
+        n_threads += 1;
+    }
+    if (n_threads > 1) {
+        fprintf(stderr, "OpenMP is active! Number of threads: %d\n", n_threads);
+    } else {
+        fprintf(stderr, "Running without OpenMP concurrency?\n");
+    }
+}
+
 static void align(
         int reverse,
         const struct text *source,
@@ -1342,6 +1356,10 @@ int main(int argc, char *argv[]) {
         return 1;
     }
 
+    if (!quiet) {
+        check_openmp();
+    }
+
     if (score_model == -1) score_model = model;
 
     t0 = seconds();

From 8974e0b52a0b399750a1cf67e30f756f1e19ec4c Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Tue, 13 May 2025 22:08:06 +0200
Subject: [PATCH 2/8] Add more config params to server call.

---
 .gitignore                    |  16 +++
 devscripts/ctags.sh           |   1 +
 devscripts/curl_server.sh     |   1 +
 python/eflomal/__init__.py    | 205 +++++++++++++++++++++++++++++++---
 python/eflomal/eflomal.pyx    |   2 +-
 python/eflomal/server.py      | 126 +++++++++++++++++++++
 python/scripts/eflomal-server |   9 ++
 server_config.json.example    |   8 ++
 setup.py                      |   5 +-
 9 files changed, 357 insertions(+), 16 deletions(-)
 create mode 100644 devscripts/ctags.sh
 create mode 100644 devscripts/curl_server.sh
 create mode 100644 python/eflomal/server.py
 create mode 100755 python/scripts/eflomal-server
 create mode 100644 server_config.json.example

diff --git a/.gitignore b/.gitignore
index ecefa7f..3612c56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,19 @@ python/eflomal/bin
 # Debug files
 *.dSYM/
 *.su
+
+# ctags
+tags
+*.TAG
+
+# Generated
+python/eflomal/eflomal.c
+
+# Virtualenv and build
+pyvenv.cfg
+bin
+build
+lib
+lib64
+**/*.egg-info
+**/__pycache__
diff --git a/devscripts/ctags.sh b/devscripts/ctags.sh
new file mode 100644
index 0000000..a3cd85d
--- /dev/null
+++ b/devscripts/ctags.sh
@@ -0,0 +1 @@
+ctags --exclude=@.gitignore -R python/ src/
diff --git a/devscripts/curl_server.sh b/devscripts/curl_server.sh
new file mode 100644
index 0000000..b28687d
--- /dev/null
+++ b/devscripts/curl_server.sh
@@ -0,0 +1 @@
+curl -X POST "$@" localhost:5000/api/align/v1 -H 'Content-type: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "sents":[{"s":"Die Kuh", "t":"The cow"}]}'  # extra curl flags are passed through verbatim via "$@"
diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py
index e0fb3a5..dd58fdd 100644
--- a/python/eflomal/__init__.py
+++ b/python/eflomal/__init__.py
@@ -6,11 +6,11 @@
 from tempfile import NamedTemporaryFile
 
 from .cython import align, read_text, write_text
-
+import time
+#import shutil
 
 logger = logging.getLogger(__name__)
 
-
 class Aligner:
     """Aligner class"""
 
@@ -27,12 +27,80 @@ def __init__(self, model=3, score_model=0,
         self.null_prior = null_prior
         self.source_prefix_len = source_prefix_len
         self.source_suffix_len = source_suffix_len
+        self.source_lowercase = True
         self.target_prefix_len = target_prefix_len
         self.target_suffix_len = target_suffix_len
+        self.target_lowercase = True
+        #
+        self._preloaded_priors = None
+        # Note(preloaded-priors,development): Set to True when developing to
+        # ensure consistency between normal and preloaded priors.
+        self._assert_preloaded_prior_eq = False
+
+    def preload_priors(self, priors_input):
+        """
+        Preloads the priors into quick to index structures. Useful in server
+        mode, where individual requests typically use a small part of the
+        prior words, so iterating the full prior would be wasteful.
+
+        Note that the preprocessing performs the same text transform operations
+        that the sentence word transformer would do. So the preprocessed prior
+        is already in terms of transformed words, and so is only suitable to
+        use with sentence words using the same transformation (which, for the
+        same Aligner, is always true).
+
+        """
+        t0 = time.time()
+        priors = read_priors(priors_input)
+        priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors = priors
+        src_tf = TextIndex({}, self.source_prefix_len, self.source_suffix_len,
+                           self.source_lowercase)
+        trg_tf = TextIndex({}, self.target_prefix_len, self.target_suffix_len,
+                           self.target_lowercase)
+
+        priors_tree = {}
+        # TODO(NULL): <NULL> is not supported yet; support could be added.
+        for src_word, trg_word, alpha in priors_list:
+            src_word = src_tf.transform(src_word)
+            trg_word = trg_tf.transform(trg_word)
+
+            if src_word not in priors_tree:
+                priors_tree[src_word] = {}
+            trg_tree = priors_tree[src_word]
+
+            trg_tree[trg_word] = trg_tree.get(trg_word, 0.0) + alpha
+
+        ferf_map = {}
+        for src_word, fert, alpha in ferf_priors:
+            # Note(preloaded-priors,development): e.g. comment out the following
+            # line to make the original-vs-preloaded prior check fail.
+            src_word = src_tf.transform(src_word)
+
+            if src_word not in ferf_map:
+                ferf_map[src_word] = {}
+            smap = ferf_map[src_word]
+            smap[fert] = smap.get(fert, 0.0) + alpha
+
+        ferr_map = {}
+        for trg_word, fert, alpha in ferr_priors:
+            trg_word = trg_tf.transform(trg_word)
+
+            if trg_word not in ferr_map:
+                ferr_map[trg_word] = {}
+            smap = ferr_map[trg_word]
+            smap[fert] = smap.get(fert, 0.0) + alpha
+
+        dt = time.time() - t0
+        logger.info(f"Prior preprocessing took {dt} seconds")
+
+        preloaded = (priors_tree, ferf_map, ferr_map)
+
+        self._preloaded_priors = (priors, preloaded)
 
     def prepare_files(self, src_input_file, src_output_file,
                       trg_input_file, trg_output_file,
-                      priors_input_file, priors_output_file):
+                      priors_input_file,
+                      priors_output_file, orig_priors_output_file=None):
         """Convert text files to formats used by eflomal
 
         Inputs should be file objects or any iterables over lines. Outputs
@@ -51,7 +119,18 @@ def prepare_files(self, src_input_file, src_output_file,
                 n_src_sents, n_trg_sents)
             raise ValueError('Mismatched file sizes')
         logger.info('Prepared %d sentences for alignment', n_src_sents)
-        if priors_input_file:
+        if self._preloaded_priors:
+            t0 = time.time()
+            (priors, _) = self._preloaded_priors
+            preloaded_to_eflomal_priors_file(self._preloaded_priors, src_index,
+                                             trg_index, priors_output_file)
+            dt = time.time() - t0
+            logger.info(f"Prior calculation took {dt} seconds using preloaded")
+            if orig_priors_output_file is not None:
+                # output normal processing-based priors for comparison
+                to_eflomal_priors_file(
+                    priors, src_index, trg_index, orig_priors_output_file)
+        elif priors_input_file:
             logger.info('Reading lexical priors...')
             priors = read_priors(priors_input_file)
             to_eflomal_priors_file(
@@ -64,19 +143,46 @@ def align(self, src_input, trg_input,
         """Run alignment for the input"""
         with NamedTemporaryFile('wb') as srcf, \
              NamedTemporaryFile('wb') as trgf, \
-             NamedTemporaryFile('w', encoding='utf-8') as priorsf:
-            # Write input files for the eflomal binary
-            self.prepare_files(
-                src_input, srcf, trg_input, trgf, priors_input, priorsf)
+             NamedTemporaryFile('w', encoding='utf-8',
+                                delete_on_close=False) as priorsf:
+
+            use_prior = self._preloaded_priors or priors_input
+            if self._preloaded_priors and self._assert_preloaded_prior_eq:
+                with NamedTemporaryFile('w', encoding='utf-8',
+                                        delete_on_close=False) as orig_priorsf:
+                    self.prepare_files(
+                        src_input, srcf, trg_input, trgf, priors_input,
+                        priorsf, orig_priorsf)
+                    # Note: re-opening a NamedTemporaryFile by name is safe as
+                    #  long as 1) it happens inside its context manager, and
+                    #  2) delete_on_close was set to False, as above.
+                    with open(orig_priorsf.name, 'r') as of, \
+                         open(priorsf.name, 'r') as f:
+                        orig = of.read()
+                        pre = f.read()
+                        if orig != pre:
+                            #shutil.copy(orig_priorsf.name, "/tmp/prior.orig")
+                            #shutil.copy(priorsf.name, "/tmp/prior.preloaded")
+                            raise Exception("===== ERROR! Preloaded prior leads to differing processed prior! ======")
+            else:
+                # Write input files for the eflomal binary
+                #
+                # Note(preloaded-priors): if priors were preloaded, then
+                # priors_input is ignored by prepare_files (and callers will
+                # most likely not pass it in that case anyway).
+                #
+                self.prepare_files(
+                    src_input, srcf, trg_input, trgf, priors_input, priorsf)
+
             # Run wrapper for the eflomal binary
+            t0 = time.time()
             align(srcf.name, trgf.name,
                   links_filename_fwd=links_filename_fwd,
                   links_filename_rev=links_filename_rev,
                   statistics_filename=None,
                   scores_filename_fwd=scores_filename_fwd,
                   scores_filename_rev=scores_filename_rev,
-                  priors_filename=(None if priors_input is None
-                                   else priorsf.name),
+                  priors_filename=(priorsf.name if use_prior else None),
                   model=self.model,
                   score_model=self.score_model,
                   n_iterations=self.n_iterations,
@@ -85,25 +191,33 @@ def align(self, src_input, trg_input,
                   rel_iterations=self.rel_iterations,
                   null_prior=self.null_prior,
                   use_gdb=use_gdb)
+            dt = time.time() - t0
+            logger.info(f"Align call took {dt} seconds")
 
 
 class TextIndex:
     """Word to index mapping with lowercasing and prefix/suffix removal"""
 
-    def __init__(self, index, prefix_len=0, suffix_len=0):
+    def __init__(self, index, prefix_len=0, suffix_len=0, lowercase=True):
         self.index = index
         self.prefix_len = prefix_len
         self.suffix_len = suffix_len
+        self.lowercase = lowercase
 
     def __len__(self):
         return len(self.index)
 
-    def __getitem__(self, word):
-        word = word.lower()
+    def transform(self, word):
+        if self.lowercase:
+            word = word.lower()
         if self.prefix_len != 0:
             word = word[:self.prefix_len]
         if self.suffix_len != 0:
             word = word[-self.suffix_len:]
+        return word
+
+    def __getitem__(self, word):
+        word = self.transform(word)
         e = self.index.get(word)
         if e is not None:
             e = e + 1
@@ -315,3 +429,68 @@ def to_eflomal_priors_file(priors, src_index, trg_index, outfile):
     for (f, fert), alpha in sorted(ferr_indexed.items()):
         print('%d %d %g' % (f, fert, alpha), file=outfile)
     outfile.flush()
+
+def preloaded_to_eflomal_priors_file(pp, src_index, trg_index, outfile):
+    """Write priors to a file read by eflomal binary
+
+    Arguments:
+
+    pp - pair (priors, preloaded) as built by Aligner.preload_priors; priors
+         is (priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors)
+    src_index - vocabulary index for source text
+    trg_index - vocabulary index for target text
+    outfile - file object for output
+
+    """
+    (priors, preloaded_priors) = pp
+    priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors = priors
+    (priors_tree, ferf_map, ferr_map) = preloaded_priors
+
+    priors_indexed = {}
+    # TODO(NULL): not yet supported.
+    for src_word, e in src_index.index.items():
+        e = e + 1
+        trg_tree = priors_tree.get(src_word)
+        if trg_tree is None: continue
+        for trg_word, f in trg_index.index.items():
+            f = f + 1
+            alpha = trg_tree.get(trg_word)
+            if alpha is not None:
+                priors_indexed[(e, f)] = priors_indexed.get((e, f), 0.0) + alpha
+
+    logger.info('%d (of %d) pairs of lexical priors used',
+                len(priors_indexed), len(priors_list))
+
+    ferf_indexed = {}
+    for src_word, e in src_index.index.items():
+        e = e + 1
+        falphas = ferf_map.get(src_word)
+        if falphas is None: continue
+        for fert, alpha in falphas.items():
+            ferf_indexed[(e, fert)] = ferf_indexed.get((e, fert), 0.0) + alpha
+
+    ferr_indexed = {}
+    for trg_word, f in trg_index.index.items():
+        f = f + 1
+        falphas = ferr_map.get(trg_word)
+        if falphas is None: continue
+        for fert, alpha in falphas.items():
+            ferr_indexed[(f, fert)] = ferr_indexed.get((f, fert), 0.0) + alpha
+
+    print('%d %d %d %d %d %d %d' % (
+        len(src_index)+1, len(trg_index)+1, len(priors_indexed),
+        len(hmmf_priors), len(hmmr_priors),
+        len(ferf_indexed), len(ferr_indexed)),
+          file=outfile)
+    for (e, f), alpha in sorted(priors_indexed.items()):
+        print('%d %d %g' % (e, f, alpha), file=outfile)
+    for jump, alpha in sorted(hmmf_priors.items()):
+        print('%d %g' % (jump, alpha), file=outfile)
+    for jump, alpha in sorted(hmmr_priors.items()):
+        print('%d %g' % (jump, alpha), file=outfile)
+    for (e, fert), alpha in sorted(ferf_indexed.items()):
+        print('%d %d %g' % (e, fert, alpha), file=outfile)
+    for (f, fert), alpha in sorted(ferr_indexed.items()):
+        print('%d %d %g' % (f, fert, alpha), file=outfile)
+    outfile.flush()
+
diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx
index b2323fe..25de18b 100644
--- a/python/eflomal/eflomal.pyx
+++ b/python/eflomal/eflomal.pyx
@@ -7,7 +7,6 @@ import os
 import sys
 import math
 import subprocess
-from tempfile import NamedTemporaryFile
 
 import numpy as np
 
@@ -164,3 +163,4 @@ def align(
     if use_gdb: args = ['gdb', '-ex=run', '--args'] + args
     subprocess.run(args, check=True)
 
+
diff --git a/python/eflomal/server.py b/python/eflomal/server.py
new file mode 100644
index 0000000..ac40db2
--- /dev/null
+++ b/python/eflomal/server.py
@@ -0,0 +1,126 @@
+from flask import Flask, request, make_response
+
+import json
+import os
+import functools
+import time
+
+from eflomal import Aligner, sentences_from_joint_file
+from tempfile import TemporaryDirectory
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_LOG_FORMAT = "[%(asctime)s] [%(process)d] [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s"
+
+ACCEPT_LOG_LEVELS = ["error", "info", "debug"]
+
+
+class InputFormatException(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+
+def create_app():
+    app = Flask(__name__, instance_relative_config=True)  # why?
+
+    app_config_path = os.environ.get('FLASK_APP_CONFIG')
+    with open(app_config_path) as f:
+        cfg = json.load(f)
+
+    log_format = cfg.get("log_format", DEFAULT_LOG_FORMAT)
+
+    log_level = cfg.get("log_level", "info")
+    if not log_level in ACCEPT_LOG_LEVELS:
+        raise Exception(f"log_level not one of {ACCEPT_LOG_LEVELS}")
+
+    ll = None
+    if log_level == "error":
+        ll = logging.ERROR
+    elif log_level == "info":
+        ll = logging.INFO
+    elif log_level == "debug":
+        ll = logging.DEBUG
+
+    logging.basicConfig( level=ll, format=log_format)
+
+    logger.info("Read application config: %s", cfg)
+
+    aligners = {}
+    for acfg in cfg["aligners"]:
+        name = acfg["name"]
+        pri = acfg["priors"]
+        logger.info(f"Loading aligner {name} with priors {pri}")
+        aligners[name] = create_aligner(pri)
+
+    @app.route('/api/align/v1', methods=['POST'])
+    def alignV1():
+        req = request.get_json()
+        aligner = aligners[req['aligner']]
+
+        iters = [32, 32, 32]
+        if 'iters' in req:
+            req_iters = req['iters']
+            if "1" in req_iters: iters[0] = req_iters["1"]
+            if "2" in req_iters: iters[1] = req_iters["2"]
+            if "3" in req_iters: iters[2] = req_iters["3"]
+        iters = tuple(iters)
+
+        samplers = 3   # copied default
+        if 'samplers' in req:
+            samplers = int(req['samplers'])
+
+        num_sents = len(req['sents'])
+        def input_iter(field):
+            for sent in req['sents']:
+                f = sent[field]
+                if type(f) == list:
+                    f = ' '.join(f)
+                if type(f) != str:
+                    raise InputFormatException("Sentence should be string")
+                yield f
+        src_iter = input_iter("s")
+        trg_iter = input_iter("t")
+
+        t10 = time.time()
+        with TemporaryDirectory() as td:
+            fwd_fp = os.path.join(td, "req.fwd")
+            rev_fp = os.path.join(td, "req.rev")
+
+            aligner.n_iterations = iters
+            aligner.n_samplers = samplers
+            try:
+                aligner.align(src_iter, trg_iter,
+                              links_filename_fwd=fwd_fp,
+                              links_filename_rev=rev_fp,
+                              quiet=log_level != "debug")
+            except InputFormatException as e:
+                return make_response(e.msg, 400)
+
+            with open(fwd_fp, 'r') as fwdf, open(rev_fp, 'r') as revf:
+                fr_pairs = []
+                for f, r in zip(fwdf, revf):
+                    fr_pairs.append({ "fwd": f.strip(), "rev": r.strip() })
+            if len(fr_pairs) != num_sents:
+                raise Exception(f'Number of alignments differ from inputs: {len(fr_pairs)} != {num_sents}')
+            res = { "aligns": fr_pairs }
+            return res
+
+    # Don't forget this.
+    return app
+
+def create_aligner(prior_path):
+    # TODO(config) more config if needed
+    aligner = Aligner()
+    with open(prior_path, 'r', encoding='utf-8') as priors_input:
+        aligner.preload_priors(priors_input)
+    return aligner
+
+
+def main():
+    app = create_app()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/scripts/eflomal-server b/python/scripts/eflomal-server
new file mode 100755
index 0000000..56d4146
--- /dev/null
+++ b/python/scripts/eflomal-server
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -eu
+# Env overrides: FLASK_APP_CONFIG, FLASK_HOST, FLASK_PORT, WORKERS.
+FLASK_APP_CONFIG="${FLASK_APP_CONFIG:-$(realpath server_config.json)}" gunicorn \
+    "eflomal.server:create_app()" \
+    -b "${FLASK_HOST:-127.0.0.1}:${FLASK_PORT:-5000}" \
+    --access-logfile \
+    - \
+    --workers="${WORKERS:-2}"
diff --git a/server_config.json.example b/server_config.json.example
new file mode 100644
index 0000000..48faaf9
--- /dev/null
+++ b/server_config.json.example
@@ -0,0 +1,8 @@
+{
+  "aligners": [
+    { "name": "some-name"
+    , "priors": "/path/to/prior"
+    }
+  ],
+  "log_level": "info"
+}
diff --git a/setup.py b/setup.py
index 284ccd4..fc6b5a6 100755
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ def run(self):
 
 setup(
     name='eflomal',
-    version='1.0.0-beta2',
+    version='1.0.1',
     author='Robert Östling',
     url='https://github.com/robertostling/eflomal',
     license='GNU GPLv3',
@@ -36,6 +36,7 @@ def run(self):
     long_description_content_type='text/markdown',
     install_requires=install_requires,
     tests_require=tests_require,
+    python_requires='>=3.12',
     extras_require={'test': tests_require},
     packages=['eflomal'],
     package_dir={'': 'python'},
@@ -43,6 +44,6 @@ def run(self):
         'eflomal': ['bin/eflomal']
     },
     ext_modules=cythonize(cyalign_ext, language_level='3'),
-    scripts=['python/scripts/eflomal-align', 'python/scripts/eflomal-makepriors'],
+    scripts=['python/scripts/eflomal-align', 'python/scripts/eflomal-makepriors', 'python/scripts/eflomal-server'],
     cmdclass={'build_py': build_py}
 )

From a46c1276e6108a2c62b94ba93967d1cdcfefe261 Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Fri, 16 May 2025 09:07:11 +0200
Subject: [PATCH 3/8] Distinguish forward/reverse pass in logs.

---
 python/eflomal/server.py |  2 +-
 src/eflomal.c            | 30 +++++++++++++++++++-----------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/python/eflomal/server.py b/python/eflomal/server.py
index ac40db2..3db9348 100644
--- a/python/eflomal/server.py
+++ b/python/eflomal/server.py
@@ -60,7 +60,7 @@ def alignV1():
         aligner = aligners[req['aligner']]
 
         iters = [32, 32, 32]
-        if 'iters' in req:
+        if 'iters' in req and req['iters']:
             req_iters = req['iters']
             if "1" in req_iters: iters[0] = req_iters["1"]
             if "2" in req_iters: iters[1] = req_iters["2"]
diff --git a/src/eflomal.c b/src/eflomal.c
index 1bfda6e..7f7583f 100644
--- a/src/eflomal.c
+++ b/src/eflomal.c
@@ -1166,6 +1166,7 @@ static void align(
         const char *priors_filename)
 {
     double t0;
+    const char fr = reverse ? 'R' : 'F';
     random_state state;
     struct text_alignment *tas[n_samplers];
 
@@ -1188,8 +1189,8 @@ static void align(
         }
     }
     if (!quiet)
-        fprintf(stderr, "Created alignment structures: %.3f s\n",
-                seconds() - t0);
+        fprintf(stderr, "[%c] Created alignment structures: %.3f s\n",
+                fr, seconds() - t0);
 
     t0 = seconds();
 #pragma omp parallel for
@@ -1202,13 +1203,15 @@ static void align(
         text_alignment_randomize(tas[i], &local_state);
     }
     if (!quiet)
-        fprintf(stderr, "Randomized alignment: %.3f s\n", seconds() - t0);
+        fprintf(stderr, "[%c] Randomized alignment: %.3f s\n", fr,
+                seconds() - t0);
 
     for (int m=1; m<=model; m++) {
         if (n_iters[m-1]) {
             if (!quiet)
-                fprintf(stderr, "Aligning with model %d (%d iterations)\n",
-                        m, n_iters[m-1]);
+                fprintf(stderr,
+                        "[%c] Aligning with model %d (%d iterations)\n", fr, m,
+                        n_iters[m-1]);
             t0 = seconds();
 
 #pragma omp parallel for
@@ -1227,20 +1230,21 @@ static void align(
                 }
             }
             if (!quiet)
-                fprintf(stderr, "Done: %.3f s\n", seconds() - t0);
+                fprintf(stderr, "[%c] Done: %.3f s\n", fr, seconds() - t0);
         }
     }
 
     t0 = seconds();
     text_alignment_sample(tas[0], &state, NULL, tas, n_samplers);
     if (!quiet)
-        fprintf(stderr, "Final argmax iteration: %.3f s\n", seconds() - t0);
+        fprintf(stderr, "[%c] Final argmax iteration: %.3f s\n", fr,
+                seconds() - t0);
 
     struct text_alignment *ta = tas[0];
 
     if (stats_filename != NULL) {
         if (!quiet)
-            fprintf(stderr, "Writing alignment statistics to %s\n",
+            fprintf(stderr, "[%c] Writing alignment statistics to %s\n", fr,
                     stats_filename);
         FILE *file = (!strcmp(stats_filename, "-"))? stdout
                      : fopen(stats_filename, "w");
@@ -1251,7 +1255,8 @@ static void align(
 
     if (links_filename != NULL) {
         if (!quiet)
-            fprintf(stderr, "Writing alignments to %s for %Zu sentencess\n",
+            fprintf(stderr,
+                    "[%c] Writing alignments to %s for %Zu sentencess\n", fr,
                     links_filename, ta->target->n_sentences);
         FILE *file = (!strcmp(links_filename, "-"))? stdout
                      : fopen(links_filename, "w");
@@ -1267,10 +1272,11 @@ static void align(
         FILE *file = (!strcmp(scores_filename, "-"))? stdout
                      : fopen(scores_filename, "w");
 
+        t0 = seconds();
         if (!quiet)
             fprintf(stderr,
-                    "Computing scores using model %d for %Zu sentences\n",
-                    score_model, ta->source->n_sentences);
+                    "[%c] Computing scores using model %d for %Zu sentences\n",
+                    fr, score_model, ta->source->n_sentences);
 
         // Switch to whatever model is specified for scoring
         ta->model = score_model;
@@ -1281,6 +1287,8 @@ static void align(
 
         if (file != stdout) fclose(file);
         free(scores);
+        if (!quiet)
+            fprintf(stderr, "[%c] Scoring took: %.3f s\n", fr, seconds() - t0);
     }
 
 

From 8dc4c5ceb7daf878233549888cb2e0aed246f5f8 Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Mon, 19 May 2025 22:36:48 +0200
Subject: [PATCH 4/8] Add options to skip/limit processed lines.

---
 python/eflomal/__init__.py   | 14 ++++++++-
 python/scripts/eflomal-align | 55 ++++++++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py
index dd58fdd..ce28c2d 100644
--- a/python/eflomal/__init__.py
+++ b/python/eflomal/__init__.py
@@ -196,7 +196,13 @@ def align(self, src_input, trg_input,
 
 
 class TextIndex:
-    """Word to index mapping with lowercasing and prefix/suffix removal"""
+    """
+    Word to index mapping with lowercasing and prefix/suffix removal.
+
+    Note that the returned indices are one larger than the indices in the
+    passed-in index, because output index 0 is reserved for the <NULL> token.
+
+    """
 
     def __init__(self, index, prefix_len=0, suffix_len=0, lowercase=True):
         self.index = index
@@ -266,6 +272,10 @@ def calculate_priors(src_sentences, trg_sentences,
 
     If `reverse` is True, compute priors for the opposite alignment
     direction.
+
+    Note: stored priors are agnostic of the word transform used during
+    alignment, and are in terms of the original sentence words.
+
     """
     priors = Counter()
     hmmf_priors = Counter()
@@ -274,6 +284,8 @@ def calculate_priors(src_sentences, trg_sentences,
     ferr_priors = Counter()
     for lineno, (src_sent, trg_sent, fwd_line, rev_line) in enumerate(
             zip(src_sentences, trg_sentences, fwd_alignments, rev_alignments)):
+        if lineno % 10000 == 0:
+            logger.info('processing line #%d', lineno)
         src_sent = src_sent.strip().split()
         trg_sent = trg_sent.strip().split()
         fwd_links = [tuple(map(int, s.split('-'))) for s in fwd_line.split()]
diff --git a/python/scripts/eflomal-align b/python/scripts/eflomal-align
index 873b480..5a1f4a6 100755
--- a/python/scripts/eflomal-align
+++ b/python/scripts/eflomal-align
@@ -11,6 +11,32 @@ import sys, argparse, os
 logger = logging.getLogger(__name__)
 
 
+class LineSkipIterator:
+    """Iterate `wrapped`, skipping `skip` items and stopping after `limit`."""
+    def __init__(self, wrapped, skip=None, limit=None):
+        self._wrapped = wrapped
+        self._left = limit
+        for _ in range(skip or 0):
+            next(wrapped, None)  # default: no StopIteration on short input
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self._left is not None:
+            if self._left <= 0:
+                raise StopIteration
+            self._left -= 1
+        return next(self._wrapped)
+
+    def __enter__(self):
+        self._wrapped.__enter__()
+        return self
+
+    def __exit__(self, et, ev, tb):
+        return self._wrapped.__exit__(et, ev, tb)
+
+
 def main():
     parser = argparse.ArgumentParser(
         description='eflomal: efficient low-memory aligner')
@@ -70,6 +96,14 @@ def main():
     parser.add_argument(
         '-i', '--input', dest='joint_filename', type=str, metavar='filename',
         help='fast_align style ||| separated file')
+    parser.add_argument(
+        '-k', '--skip-lines', dest='skip_lines', default=None, metavar='X',
+        type=int,
+        help='Number of initial lines to skip in input')
+    parser.add_argument(
+        '-n', '--n-lines', dest='keep_lines', default=None, metavar='X',
+        type=int,
+        help='Number of lines to process at most (after optional skipping)')
     parser.add_argument(
         '-f', '--forward-links', dest='links_filename_fwd', type=str,
         metavar='filename',
@@ -137,17 +171,28 @@ def main():
             logger.info('Reading source/target sentences from %s...',
                         args.joint_filename)
             src_in_f = stack.enter_context(
-                open(args.joint_filename, 'r', encoding='utf-8'))
+                LineSkipIterator(
+                    open(args.joint_filename, 'r', encoding='utf-8'),
+                    skip = args.skip_lines,
+                    limit = args.keep_lines))
             src_input = sentences_from_joint_file(src_in_f, 0)
             trg_in_f = stack.enter_context(
-                open(args.joint_filename, 'r', encoding='utf-8'))
+                LineSkipIterator(
+                    open(args.joint_filename, 'r', encoding='utf-8'),
+                    skip = args.skip_lines,
+                    limit = args.keep_lines))
             trg_input = sentences_from_joint_file(trg_in_f, 1)
         else:
             src_input = stack.enter_context(
-                open(args.source_filename, 'r', encoding='utf-8'))
+                LineSkipIterator(
+                    open(args.source_filename, 'r', encoding='utf-8'),
+                    skip = args.skip_lines,
+                    limit = args.keep_lines))
             trg_input = stack.enter_context(
-                open(args.target_filename, 'r', encoding='utf-8'))
-
+                LineSkipIterator(
+                    open(args.target_filename, 'r', encoding='utf-8'),
+                    skip = args.skip_lines,
+                    limit = args.keep_lines))
         aligner.align(src_input, trg_input,
                       links_filename_fwd=args.links_filename_fwd,
                       links_filename_rev=args.links_filename_rev,

From 1c3ddae6ddf9e601c903a18ad94fdaafb66c7a4c Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Fri, 23 May 2025 13:04:28 +0200
Subject: [PATCH 5/8] Fix reverse prior LEX direction?

Not sure, but sounds logical.
---
 python/eflomal/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py
index ce28c2d..2a18999 100644
--- a/python/eflomal/__init__.py
+++ b/python/eflomal/__init__.py
@@ -295,7 +295,9 @@ def calculate_priors(src_sentences, trg_sentences,
                 logger.error('alignment out of bounds in line %d: '
                              '(%d, %d)', lineno + 1, i, j)
                 raise ValueError('Invalid input on line %d' % lineno + 1)
-            priors[(src_sent[i], trg_sent[j])] += 1
+            s, t = src_sent[i], trg_sent[j]
+            k = (t,s) if rev_alignments else (s,t)
+            priors[k] += 1
 
         last_j = -1
         last_i = -1

From ed239707ac70fab5acb0431fabacdbe24ec99ccc Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Fri, 23 May 2025 13:59:44 +0200
Subject: [PATCH 6/8] Server option trust_sents to disable n_clean.

Passing trust_sents=False will set n_clean=0, which (after this change)
means no sentences are trusted for statistics, so prior updates don't
happen. Useful for batched sending of sentences of dubious quality.
---
 python/eflomal/__init__.py |  4 +++-
 python/eflomal/eflomal.pyx |  4 ++++
 python/eflomal/server.py   | 11 ++++++++++-
 src/eflomal.c              | 21 ++++++++++++++-------
 4 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py
index 2a18999..1ef5399 100644
--- a/python/eflomal/__init__.py
+++ b/python/eflomal/__init__.py
@@ -139,7 +139,8 @@ def prepare_files(self, src_input_file, src_output_file,
     def align(self, src_input, trg_input,
               links_filename_fwd=None, links_filename_rev=None,
               scores_filename_fwd=None, scores_filename_rev=None,
-              priors_input=None, quiet=True, use_gdb=False):
+              priors_input=None, trust_sents=True,
+              quiet=True, use_gdb=False):
         """Run alignment for the input"""
         with NamedTemporaryFile('wb') as srcf, \
              NamedTemporaryFile('wb') as trgf, \
@@ -187,6 +188,7 @@ def align(self, src_input, trg_input,
                   score_model=self.score_model,
                   n_iterations=self.n_iterations,
                   n_samplers=self.n_samplers,
+                  n_clean=-1 if trust_sents else 0,
                   quiet=quiet,
                   rel_iterations=self.rel_iterations,
                   null_prior=self.null_prior,
diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx
index 25de18b..f396661 100644
--- a/python/eflomal/eflomal.pyx
+++ b/python/eflomal/eflomal.pyx
@@ -103,6 +103,7 @@ def align(
         int score_model=0,
         tuple n_iterations=None,
         int n_samplers=1,
+        int n_clean=-1,
         bool quiet=True,
         double rel_iterations=1.0,
         double null_prior=0.2,
@@ -123,6 +124,8 @@ def align(
                     not given the numbers will be computed automatically based
                     on rel_iterations
     n_samplers -- number of independent samplers to run
+    n_clean -- number of first N sentences to deem clean and use for stats
+               update (-1 = all).
     quiet -- if True, suppress output
     rel_iterations -- number of iterations relative to the default
     """
@@ -147,6 +150,7 @@ def align(
             '-s', source_filename,
             '-t', target_filename,
             '-n', str(n_samplers),
+            '-c', str(n_clean),
             '-N', str(null_prior),
             '-1', str(n_iterations[0])]
     if quiet: args.append('-q')
diff --git a/python/eflomal/server.py b/python/eflomal/server.py
index 3db9348..362820c 100644
--- a/python/eflomal/server.py
+++ b/python/eflomal/server.py
@@ -67,6 +67,14 @@ def alignV1():
             if "3" in req_iters: iters[2] = req_iters["3"]
         iters = tuple(iters)
 
+        trust_sents = True
+        if 'trust_sents' in req:
+            f = req['trust_sents']
+            if type(f) == bool:
+                trust_sents = f
+            else:
+                raise InputFormatException("trust_sents should be bool")
+
         samplers = 3   # copied default
         if 'samplers' in req:
             samplers = int(req['samplers'])
@@ -78,7 +86,7 @@ def input_iter(field):
                 if type(f) == list:
                     f = ' '.join(f)
                 if type(f) != str:
-                    raise InputFormatException("Sentence should be string")
+                    raise InputFormatException("Sentence should be string or list of strings")
                 yield f
         src_iter = input_iter("s")
         trg_iter = input_iter("t")
@@ -94,6 +102,7 @@ def input_iter(field):
                 aligner.align(src_iter, trg_iter,
                               links_filename_fwd=fwd_fp,
                               links_filename_rev=rev_fp,
+                              trust_sents=trust_sents,
                               quiet=log_level != "debug")
             except InputFormatException as e:
                 return make_response(e.msg, 400)
diff --git a/src/eflomal.c b/src/eflomal.c
index 7f7583f..dac532d 100644
--- a/src/eflomal.c
+++ b/src/eflomal.c
@@ -100,7 +100,7 @@ struct text_alignment {
     // this number of sentences contain clean parallel data and should
     // contribute to the statistics (anything after this should still be
     // aligned, but don't trust the statistics):
-    size_t n_clean; // 0 (the default) means all sentences should be used
+    int32_t n_clean; // -1 (the default) means all sentences should be used
     count null_prior;
 };
 
@@ -230,7 +230,7 @@ void text_alignment_sample(
     count *jump_counts = ta->jump_counts;
     count *fert_counts = ta->fert_counts;
     const size_t n_sentences =
-        ta->n_clean? ta->n_clean: ta->target->n_sentences;
+        ta->n_clean >= 0 ? ta->n_clean: ta->target->n_sentences;
 
     // the fertility distributions (unlike the jump and lexical distributions)
     // are sampled explicitly, and the categorical distributions are fixed
@@ -683,7 +683,7 @@ void text_alignment_make_counts(struct text_alignment *ta) {
         }
     }
     const size_t n_sentences =
-        ta->n_clean? ta->n_clean: ta->target->n_sentences;
+        ta->n_clean >= 0 ? ta->n_clean: ta->target->n_sentences;
     for (size_t sent=0; sent<n_sentences; sent++) {
         link_t *links = ta->sentence_links[sent];
         if (links == NULL) continue;
@@ -971,7 +971,7 @@ struct text_alignment *text_alignment_create(
     ta->model = 1;
     ta->source = source;
     ta->target = target;
-    ta->n_clean = 0;
+    ta->n_clean = -1;
 
     // These should be initialized with text_alignment_load_priors()
     ta->source_prior = NULL;
@@ -1158,6 +1158,7 @@ static void align(
         int score_model,
         double null_prior,
         int n_samplers,
+        int n_clean,
         int quiet,
         const int *n_iters,
         const char *links_filename,
@@ -1176,6 +1177,7 @@ static void align(
     for (int i=0; i<n_samplers; i++) {
         tas[i] = text_alignment_create(
                 (reverse? target: source), (reverse? source: target));
+        tas[i]->n_clean = n_clean;
         tas[i]->null_prior = null_prior;
         if (priors_filename != NULL) {
             // TODO: since read-only, could use the pointer from tas[0]
@@ -1227,7 +1229,11 @@ static void align(
 
                 for (int j=0; j<n_iters[m-1]; j++) {
                     text_alignment_sample(tas[i], &local_state, NULL, NULL, 1);
+                    //if (!quiet)
+                    //  fprintf(stderr, ".%c%d", fr, i);
                 }
+                //if (!quiet)
+                //  fprintf(stderr, "\n");  // Racy but better than nothing.
             }
             if (!quiet)
                 fprintf(stderr, "[%c] Done: %.3f s\n", fr, seconds() - t0);
@@ -1314,14 +1320,14 @@ int main(int argc, char *argv[]) {
          *stats_filename = NULL,
          *scores_filename_fwd = NULL, *scores_filename_rev = NULL;
     int n_iters[3];
-    int n_samplers = 1, quiet = 0, model = -1, score_model = -1;
+    int n_samplers = 1, n_clean = -1, quiet = 0, model = -1, score_model = -1;
     double null_prior = 0.2;
 
     n_iters[0] = 1; n_iters[1] = 1; n_iters[2] = 1;
 
     omp_set_nested(1);
 
-    while ((opt = getopt(argc, argv, "s:t:p:f:r:S:F:R:1:2:3:n:qm:M:N:h"))
+    while ((opt = getopt(argc, argv, "s:t:p:f:r:S:F:R:1:2:3:n:c:qm:M:N:h"))
             != -1)
     {
         switch(opt) {
@@ -1337,6 +1343,7 @@ int main(int argc, char *argv[]) {
             case '2': n_iters[1] = atoi(optarg); break;
             case '3': n_iters[2] = atoi(optarg); break;
             case 'n': n_samplers = atoi(optarg); break;
+            case 'c': n_clean = atoi(optarg); break;
             case 'q': quiet = 1; break;
             case 'm': model = atoi(optarg);
                       if (model < 1 || model > 3) {
@@ -1397,7 +1404,7 @@ int main(int argc, char *argv[]) {
                 (!reverse && links_filename_fwd == NULL &&
                  links_filename_rev == NULL))
             align(reverse, source, target, model, score_model, null_prior,
-                  n_samplers,
+                  n_samplers, n_clean,
                   quiet, n_iters, links_filename, stats_filename,
                   scores_filename, priors_filename);
     }

From ea15c412c6a2bef1011a6ed407f35d5f3f34f440 Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Thu, 29 May 2025 13:01:06 +0200
Subject: [PATCH 7/8] Support returning scores through server.

---
 python/eflomal/server.py | 45 +++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/python/eflomal/server.py b/python/eflomal/server.py
index 362820c..7d95311 100644
--- a/python/eflomal/server.py
+++ b/python/eflomal/server.py
@@ -4,6 +4,7 @@
 import os
 import functools
 import time
+import math
 
 from eflomal import Aligner, sentences_from_joint_file
 from tempfile import TemporaryDirectory
@@ -79,22 +80,37 @@ def alignV1():
         if 'samplers' in req:
             samplers = int(req['samplers'])
 
+        scoring = True
+        if 'scoring' in req:
+            f = req['scoring']
+            if type(f) == bool:
+                scoring = f
+            else:
+                raise InputFormatException("scoring should be bool")
+
         num_sents = len(req['sents'])
-        def input_iter(field):
-            for sent in req['sents']:
+        sent_stoks = [0] * num_sents
+        sent_ttoks = [0] * num_sents
+        def input_iter(field, toks):
+            for n, sent in enumerate(req['sents']):
                 f = sent[field]
                 if type(f) == list:
+                    toks[n] = len(f)
                     f = ' '.join(f)
-                if type(f) != str:
+                elif type(f) == str:
+                    toks[n] = len(f.split())
+                else:
                     raise InputFormatException("Sentence should be string or list of strings")
                 yield f
-        src_iter = input_iter("s")
-        trg_iter = input_iter("t")
+        src_iter = input_iter("s", sent_stoks)
+        trg_iter = input_iter("t", sent_ttoks)
 
         t10 = time.time()
         with TemporaryDirectory() as td:
             fwd_fp = os.path.join(td, "req.fwd")
             rev_fp = os.path.join(td, "req.rev")
+            fsc_fp = os.path.join(td, "rsc.fwd") if scoring else None
+            rsc_fp = os.path.join(td, "rsc.rev") if scoring else None
 
             aligner.n_iterations = iters
             aligner.n_samplers = samplers
@@ -102,15 +118,32 @@ def input_iter(field):
                 aligner.align(src_iter, trg_iter,
                               links_filename_fwd=fwd_fp,
                               links_filename_rev=rev_fp,
+                              scores_filename_fwd=fsc_fp,
+                              scores_filename_rev=rsc_fp,
                               trust_sents=trust_sents,
                               quiet=log_level != "debug")
             except InputFormatException as e:
                 return make_response(e.msg, 400)
 
+            scores = []
+            if scoring:
+                with open(fsc_fp, 'r') as fscf, open(rsc_fp, 'r') as rscf:
+                    for fs, rs in zip(fscf, rscf):
+                        scores.append((float(fs), float(rs)))
+
             with open(fwd_fp, 'r') as fwdf, open(rev_fp, 'r') as revf:
                 fr_pairs = []
-                for f, r in zip(fwdf, revf):
-                    fr_pairs.append({ "fwd": f.strip(), "rev": r.strip() })
+                for n, (f, r) in enumerate(zip(fwdf, revf)):
+                    res = { "fwd": f.strip(), "rev": r.strip() }
+                    if scoring:
+                        fs, rs = scores[n]
+                        res["score_fwd"] = fs
+                        res["score_rev"] = rs
+                        # Clamp the token count to 1: an empty sentence has 0
+                        # tokens and math.log(0) raises ValueError.
+                        res["norm_score_fwd"] = fs - math.log(max(sent_ttoks[n], 1))
+                        res["norm_score_rev"] = rs - math.log(max(sent_stoks[n], 1))
+                    fr_pairs.append(res)
             if len(fr_pairs) != num_sents:
                 raise Exception(f'Number of alignments differ from inputs: {len(fr_pairs)} != {num_sents}')
             res = { "aligns": fr_pairs }

From a77b95e867628889e93af259c0706a8bca383051 Mon Sep 17 00:00:00 2001
From: Robin Palotai <robinp@juremy.com>
Date: Tue, 15 Jul 2025 20:58:32 +0200
Subject: [PATCH 8/8] Add small test server data example.

---
 devscripts/curl_server.sh           |  9 ++++-
 devscripts/server_config.json       |  8 +++++
 devscripts/testdata/create-prior.sh |  3 ++
 devscripts/testdata/my-align.fwd    |  6 ++++
 devscripts/testdata/my-align.pri    | 56 +++++++++++++++++++++++++++++
 devscripts/testdata/my-align.rev    |  6 ++++
 devscripts/testdata/my-align.txt    |  6 ++++
 7 files changed, 93 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 devscripts/curl_server.sh
 create mode 100644 devscripts/server_config.json
 create mode 100755 devscripts/testdata/create-prior.sh
 create mode 100644 devscripts/testdata/my-align.fwd
 create mode 100644 devscripts/testdata/my-align.pri
 create mode 100644 devscripts/testdata/my-align.rev
 create mode 100644 devscripts/testdata/my-align.txt

diff --git a/devscripts/curl_server.sh b/devscripts/curl_server.sh
old mode 100644
new mode 100755
index b28687d..99bbcf8
--- a/devscripts/curl_server.sh
+++ b/devscripts/curl_server.sh
@@ -1 +1,10 @@
-curl -X POST $@ localhost:5000/api/align/v1 -H 'Content-type: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "sents":[{"s":"Die Kuh", "t":"The cow"}]}'
+#!/usr/bin/env sh
+# Send a sample alignment request to an eflomal-server on localhost:5000.
+# Any arguments given to this script are passed through to curl.
+
+echo "Run eflomal-server in this directory, so it picks up server_config.json, then perform the sample request:"
+
+curl -X POST "$@" localhost:5000/api/align/v1 -H 'Content-type: application/json' -H 'Accept: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "trust_sents": false, "sents":[{"s":"The cow and grass", "t":"Die Kuh und das Gras"}]}'
+
+# Approximate response:
+# {"aligns":[{"fwd":"0-0 0-1 1-2 2-3 3-4","norm_score_fwd":3.954102087565899,"norm_score_rev":2.250345638880109,"rev":"0-0 1-1 2-3 3-4","score_fwd":5.56354,"score_rev":3.63664}]}
diff --git a/devscripts/server_config.json b/devscripts/server_config.json
new file mode 100644
index 0000000..52a21b3
--- /dev/null
+++ b/devscripts/server_config.json
@@ -0,0 +1,8 @@
+{
+  "aligners": [
+    { "name": "my-align",
+      "priors": "testdata/my-align.pri"
+    }
+  ],
+  "log_level": "debug"
+}
diff --git a/devscripts/testdata/create-prior.sh b/devscripts/testdata/create-prior.sh
new file mode 100755
index 0000000..91eb82f
--- /dev/null
+++ b/devscripts/testdata/create-prior.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env sh
+# Align my-align.txt, then build my-align.pri from the resulting link files.
+# Paths are relative: run from the directory containing my-align.txt.
+set -e  # do not build priors from stale links if the alignment step fails
+eflomal-align -i my-align.txt -f my-align.fwd -r my-align.rev
+eflomal-makepriors -i my-align.txt  -f my-align.fwd -r my-align.rev -p my-align.pri
diff --git a/devscripts/testdata/my-align.fwd b/devscripts/testdata/my-align.fwd
new file mode 100644
index 0000000..c97e1a8
--- /dev/null
+++ b/devscripts/testdata/my-align.fwd
@@ -0,0 +1,6 @@
+1-0 2-2 3-3
+0-0 1-1 2-2 4-3 6-4
+1-0 2-2 3-3
+0-0 2-1 4-2
+0-0 2-1 3-2 4-3
+1-0 2-2 3-3 4-4 5-5 6-6
diff --git a/devscripts/testdata/my-align.pri b/devscripts/testdata/my-align.pri
new file mode 100644
index 0000000..93af577
--- /dev/null
+++ b/devscripts/testdata/my-align.pri
@@ -0,0 +1,56 @@
+LEX	Auf	There	1
+LEX	Die	cow	3
+LEX	Freunde	friends	1
+LEX	Gras	grass	1
+LEX	Häschen	rabbit	2
+LEX	Lass	Let	2
+LEX	Wiese	a	1
+LEX	das	the	2
+LEX	der	is	1
+LEX	die	cow	1
+LEX	eine	grass	1
+LEX	frisst	eats	1
+LEX	ist	is	1
+LEX	mich	see	2
+LEX	schläfrig	sleepy	1
+LEX	sind	are	1
+LEX	steht	on	1
+LEX	und	and	1
+FERF	Let	1	2
+FERF	There	1	1
+FERF	a	1	1
+FERF	and	1	1
+FERF	are	1	1
+FERF	cow	1	4
+FERF	eats	1	1
+FERF	friends	1	1
+FERF	grass	1	2
+FERF	is	1	2
+FERF	on	1	1
+FERF	rabbit	1	2
+FERF	see	1	2
+FERF	sleepy	1	1
+FERF	the	1	2
+FERR	Auf	1	1
+FERR	Die	1	3
+FERR	Freunde	1	1
+FERR	Gras	1	1
+FERR	Häschen	1	2
+FERR	Kuh	1	5
+FERR	Lass	1	2
+FERR	Wiese	1	1
+FERR	das	1	2
+FERR	der	1	1
+FERR	die	1	1
+FERR	eine	1	1
+FERR	frisst	1	1
+FERR	ist	1	1
+FERR	mich	1	2
+FERR	schläfrig	1	1
+FERR	sind	1	1
+FERR	steht	1	1
+FERR	und	1	1
+HMMF	1	22
+HMMF	2	8
+HMMR	1	33
+HMMR	2	2
diff --git a/devscripts/testdata/my-align.rev b/devscripts/testdata/my-align.rev
new file mode 100644
index 0000000..85f02b9
--- /dev/null
+++ b/devscripts/testdata/my-align.rev
@@ -0,0 +1,6 @@
+0-0 1-1 2-2 3-3
+0-0 1-1 2-2 3-3 4-4 5-5
+0-0 1-1 2-2 3-3
+0-0 1-1 3-2 4-3
+0-0 1-1 3-2 4-3
+0-0 1-1 2-2 3-3 4-4 5-5 6-6
diff --git a/devscripts/testdata/my-align.txt b/devscripts/testdata/my-align.txt
new file mode 100644
index 0000000..c55f1d2
--- /dev/null
+++ b/devscripts/testdata/my-align.txt
@@ -0,0 +1,6 @@
+The cow eats grass ||| Die Kuh frisst Gras
+There is a cow on the grass ||| Auf der Wiese steht eine Kuh
+The cow is sleepy ||| Die Kuh ist schläfrig
+Let me see the cow ||| Lass mich die Kuh sehen
+Let me see the rabbit ||| Lass mich das Häschen sehen
+The cow and the rabbit are friends ||| Die Kuh und das Häschen sind Freunde