From 12668a020888cd5914f353b82ba800120bb5657b Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Mon, 12 May 2025 11:05:28 +0200 Subject: [PATCH 1/8] Small comments and diagnosis. --- python/eflomal/eflomal.pyx | 4 ++++ src/eflomal.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx index f759184..b2323fe 100644 --- a/python/eflomal/eflomal.pyx +++ b/python/eflomal/eflomal.pyx @@ -61,6 +61,9 @@ cpdef tuple read_text(pyfile, bool lowercase, int prefix_len, int suffix_len): cpdef write_text(pyfile, tuple sents, int voc_size): """Write a sequence of sentences in the format expected by eflomal + NOTE(token-limit): if more than 1024 tokens are in a sentence, an empty + sentence is written instead of that sentence. + Arguments: pyfile -- Python file object to write to sents -- tuple of sentences, each encoded as np.ndarray(uint32) @@ -74,6 +77,7 @@ cpdef write_text(pyfile, tuple sents, int voc_size): fprintf(f, '%d %d\n', len(sents), voc_size) for sent in sents: n = len(sent) + # NOTE(token-limit). if n < 0x400: i = 0 fprintf(f, '%d', n) diff --git a/src/eflomal.c b/src/eflomal.c index bb504d3..1bfda6e 100644 --- a/src/eflomal.c +++ b/src/eflomal.c @@ -1136,6 +1136,20 @@ struct text* text_read(const char *filename) { return text; } +void check_openmp() { + int n_threads = 0; +#pragma omp parallel + { +#pragma omp atomic + n_threads += 1; + } + if (n_threads > 1) { + fprintf(stderr, "OpenMP is active! Number of threads: %d\n", n_threads); + } else { + fprintf(stderr, "Running without OpenMP concurrency?\n"); + } +} + static void align( int reverse, const struct text *source, @@ -1342,6 +1356,10 @@ int main(int argc, char *argv[]) { return 1; } + if (!quiet) { + check_openmp(); + } + if (score_model == -1) score_model = model; t0 = seconds(); From 8974e0b52a0b399750a1cf67e30f756f1e19ec4c Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Tue, 13 May 2025 22:08:06 +0200 Subject: [PATCH 2/8] Add more config params to server call. --- .gitignore | 16 +++ devscripts/ctags.sh | 1 + devscripts/curl_server.sh | 1 + python/eflomal/__init__.py | 205 +++++++++++++++++++++++++++++++--- python/eflomal/eflomal.pyx | 2 +- python/eflomal/server.py | 126 +++++++++++++++++++++ python/scripts/eflomal-server | 9 ++ server_config.json.example | 8 ++ setup.py | 5 +- 9 files changed, 357 insertions(+), 16 deletions(-) create mode 100644 devscripts/ctags.sh create mode 100644 devscripts/curl_server.sh create mode 100644 python/eflomal/server.py create mode 100755 python/scripts/eflomal-server create mode 100644 server_config.json.example diff --git a/.gitignore b/.gitignore index ecefa7f..3612c56 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,19 @@ python/eflomal/bin # Debug files *.dSYM/ *.su + +# ctags +tags +*.TAG + +# Generated +python/eflomal/eflomal.c + +# Virtualenv and build +pyvenv.cfg +bin +build +lib +lib64 +**/*.egg-info +**/__pycache__ diff --git a/devscripts/ctags.sh b/devscripts/ctags.sh new file mode 100644 index 0000000..a3cd85d --- /dev/null +++ b/devscripts/ctags.sh @@ -0,0 +1 @@ +ctags --exclude=@.gitignore -R python/ src/ diff --git a/devscripts/curl_server.sh b/devscripts/curl_server.sh new file mode 100644 index 0000000..b28687d --- /dev/null +++ b/devscripts/curl_server.sh @@ -0,0 +1 @@ +curl -X POST $@ localhost:5000/api/align/v1 -H 'Content-type: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "sents":[{"s":"Die Kuh", "t":"The cow"}]}' diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py index e0fb3a5..dd58fdd 100644 --- a/python/eflomal/__init__.py +++ b/python/eflomal/__init__.py @@ -6,11 +6,11 @@ from tempfile import NamedTemporaryFile from .cython import align, read_text, write_text - +import time +#import shutil logger = logging.getLogger(__name__) - class Aligner: """Aligner class""" @@ -27,12 +27,80 @@ def __init__(self, model=3, score_model=0, self.null_prior = null_prior self.source_prefix_len = source_prefix_len self.source_suffix_len = source_suffix_len + self.source_lowercase = True self.target_prefix_len = target_prefix_len self.target_suffix_len = target_suffix_len + self.target_lowercase = True + # + self._preloaded_priors = None + # Note(preloaded-priors,development): Set to True when developing to + # ensure consistency between normal and preloaded priors. + self._assert_preloaded_prior_eq = False + + def preload_priors(self, priors_input): + """ + Preloads the priors into quick to index structures. Useful in server + mode, where individual requests typically use a small part of the + prior words, so iterating the full prior would be wasteful. + + Note that the preprocessing performs the same text transform operations + that the sentence word transformer would do. So the preprocessed prior + is already in terms of transformed words, and so is only suitable to + use with sentence words using the same transformation (which, for the + same Aligner, is always true). + + """ + t0 = time.time() + priors = read_priors(priors_input) + priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors = priors + src_tf = TextIndex({}, self.source_prefix_len, self.source_suffix_len, + self.source_lowercase) + trg_tf = TextIndex({}, self.target_prefix_len, self.target_suffix_len, + self.target_lowercase) + + priors_tree = {} + # TODO(NULL): is not supported. Could. + for src_word, trg_word, alpha in priors_list: + src_word = src_tf.transform(src_word) + trg_word = trg_tf.transform(trg_word) + + if src_word not in priors_tree: + priors_tree[src_word] = {} + trg_tree = priors_tree[src_word] + + trg_tree[trg_word] = trg_tree.get(trg_word, 0.0) + alpha + + ferf_map = {} + for src_word, fert, alpha in ferf_priors: + # Note(preloaded-priors,development): for example comment following + # line to trigger an orig vs preloaded prior difference check. + src_word = src_tf.transform(src_word) + + if src_word not in ferf_map: + ferf_map[src_word] = {} + smap = ferf_map[src_word] + smap[fert] = smap.get(fert, 0.0) + alpha + + ferr_map = {} + for trg_word, fert, alpha in ferr_priors: + trg_word = trg_tf.transform(trg_word) + + if trg_word not in ferr_map: + ferr_map[trg_word] = {} + smap = ferr_map[trg_word] + smap[fert] = smap.get(fert, 0.0) + alpha + + dt = time.time() - t0 + logger.info(f"Prior preprocessing took {dt} seconds") + + preloaded = (priors_tree, ferf_map, ferr_map) + + self._preloaded_priors = (priors, preloaded) def prepare_files(self, src_input_file, src_output_file, trg_input_file, trg_output_file, - priors_input_file, priors_output_file): + priors_input_file, + priors_output_file, orig_priors_output_file=None): """Convert text files to formats used by eflomal Inputs should be file objects or any iterables over lines. Outputs @@ -51,7 +119,18 @@ def prepare_files(self, src_input_file, src_output_file, n_src_sents, n_trg_sents) raise ValueError('Mismatched file sizes') logger.info('Prepared %d sentences for alignment', n_src_sents) - if priors_input_file: + if self._preloaded_priors: + t0 = time.time() + (priors, _) = self._preloaded_priors + preloaded_to_eflomal_priors_file(self._preloaded_priors, src_index, + trg_index, priors_output_file) + dt = time.time() - t0 + logger.info(f"Prior calculation took {dt} seconds using preloaded") + if orig_priors_output_file is not None: + # output normal processing-based priors for comparison + to_eflomal_priors_file( + priors, src_index, trg_index, orig_priors_output_file) + elif priors_input_file: logger.info('Reading lexical priors...') priors = read_priors(priors_input_file) to_eflomal_priors_file( @@ -64,19 +143,46 @@ def align(self, src_input, trg_input, """Run alignment for the input""" with NamedTemporaryFile('wb') as srcf, \ NamedTemporaryFile('wb') as trgf, \ - NamedTemporaryFile('w', encoding='utf-8') as priorsf: - # Write input files for the eflomal binary - self.prepare_files( - src_input, srcf, trg_input, trgf, priors_input, priorsf) + NamedTemporaryFile('w', encoding='utf-8', + delete_on_close=False) as priorsf: + + use_prior = self._preloaded_priors or priors_input + if self._preloaded_priors and self._assert_preloaded_prior_eq: + with NamedTemporaryFile('w', encoding='utf-8', + delete_on_close=False) as orig_priorsf: + self.prepare_files( + src_input, srcf, trg_input, trgf, priors_input, + priorsf, orig_priorsf) + # Note: opening NamedTemporaryFile-s is safe as long as + # 1) happens using context-manager, and 2) delete_on_close + # was set to False, as above. + with open(orig_priorsf.name, 'r') as of, \ + open(priorsf.name, 'r') as f: + orig = of.read() + pre = f.read() + if orig != pre: + #shutil.copy(orig_priorsf.name, "/tmp/prior.orig") + #shutil.copy(priorsf.name, "/tmp/prior.preloaded") + raise Exception("===== ERROR! Preloaded prior leads to differing processed prior! ======") + else: + # Write input files for the eflomal binary + # + # Note(preloaded-priors): if priors were preloaded, then + # priors_input is not used at this point (but then likely they + # are not passed either). + # + self.prepare_files( + src_input, srcf, trg_input, trgf, priors_input, priorsf) + # Run wrapper for the eflomal binary + t0 = time.time() align(srcf.name, trgf.name, links_filename_fwd=links_filename_fwd, links_filename_rev=links_filename_rev, statistics_filename=None, scores_filename_fwd=scores_filename_fwd, scores_filename_rev=scores_filename_rev, - priors_filename=(None if priors_input is None - else priorsf.name), + priors_filename=(priorsf.name if use_prior else None), model=self.model, score_model=self.score_model, n_iterations=self.n_iterations, @@ -85,25 +191,33 @@ def align(self, src_input, trg_input, rel_iterations=self.rel_iterations, null_prior=self.null_prior, use_gdb=use_gdb) + dt = time.time() - t0 + logger.info(f"Align call took {dt} seconds") class TextIndex: """Word to index mapping with lowercasing and prefix/suffix removal""" - def __init__(self, index, prefix_len=0, suffix_len=0): + def __init__(self, index, prefix_len=0, suffix_len=0, lowercase=True): self.index = index self.prefix_len = prefix_len self.suffix_len = suffix_len + self.lowercase = lowercase def __len__(self): return len(self.index) - def __getitem__(self, word): - word = word.lower() + def transform(self, word): + if self.lowercase: + word = word.lower() if self.prefix_len != 0: word = word[:self.prefix_len] if self.suffix_len != 0: word = word[-self.suffix_len:] + return word + + def __getitem__(self, word): + word = self.transform(word) e = self.index.get(word) if e is not None: e = e + 1 @@ -315,3 +429,68 @@ def to_eflomal_priors_file(priors, src_index, trg_index, outfile): for (f, fert), alpha in sorted(ferr_indexed.items()): print('%d %d %g' % (f, fert, alpha), file=outfile) outfile.flush() + +def preloaded_to_eflomal_priors_file(pp, src_index, trg_index, outfile): + """Write priors to a file read by eflomal binary + + Arguments: + + priors - tuple of priors (priors_list, hmmf_priors, hmmr_priors, + ferf_priors, ferr_priors) + src_index - vocabulary index for source text + tgt_index - vocabulary index for target text + outfile - file object for output + + """ + (priors, preloaded_priors) = pp + priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors = priors + (priors_tree, ferf_map, ferr_map) = preloaded_priors + + priors_indexed = {} + # TODO(NULL): not yet supported. + for src_word, e in src_index.index.items(): + e = e + 1 + trg_tree = priors_tree.get(src_word) + if trg_tree is None: continue + for trg_word, f in trg_index.index.items(): + f = f + 1 + alpha = trg_tree.get(trg_word) + if alpha is not None: + priors_indexed[(e, f)] = priors_indexed.get((e, f), 0.0) + alpha + + logger.info('%d (of %d) pairs of lexical priors used', + len(priors_indexed), len(priors_list)) + + ferf_indexed = {} + for src_word, e in src_index.index.items(): + e = e + 1 + falphas = ferf_map.get(src_word) + if falphas is None: continue + for fert, alpha in falphas.items(): + ferf_indexed[(e, fert)] = ferf_indexed.get((e, fert), 0.0) + alpha + + ferr_indexed = {} + for trg_word, f in trg_index.index.items(): + f = f + 1 + falphas = ferr_map.get(trg_word) + if falphas is None: continue + for fert, alpha in falphas.items(): + ferr_indexed[(f, fert)] = ferr_indexed.get((f, fert), 0.0) + alpha + + print('%d %d %d %d %d %d %d' % ( + len(src_index)+1, len(trg_index)+1, len(priors_indexed), + len(hmmf_priors), len(hmmr_priors), + len(ferf_indexed), len(ferr_indexed)), + file=outfile) + for (e, f), alpha in sorted(priors_indexed.items()): + print('%d %d %g' % (e, f, alpha), file=outfile) + for jump, alpha in sorted(hmmf_priors.items()): + print('%d %g' % (jump, alpha), file=outfile) + for jump, alpha in sorted(hmmr_priors.items()): + print('%d %g' % (jump, alpha), file=outfile) + for (e, fert), alpha in sorted(ferf_indexed.items()): + print('%d %d %g' % (e, fert, alpha), file=outfile) + for (f, fert), alpha in sorted(ferr_indexed.items()): + print('%d %d %g' % (f, fert, alpha), file=outfile) + outfile.flush() + diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx index b2323fe..25de18b 100644 --- a/python/eflomal/eflomal.pyx +++ b/python/eflomal/eflomal.pyx @@ -7,7 +7,6 @@ import os import sys import math import subprocess -from tempfile import NamedTemporaryFile import numpy as np @@ -164,3 +163,4 @@ def align( if use_gdb: args = ['gdb', '-ex=run', '--args'] + args subprocess.run(args, check=True) + diff --git a/python/eflomal/server.py b/python/eflomal/server.py new file mode 100644 index 0000000..ac40db2 --- /dev/null +++ b/python/eflomal/server.py @@ -0,0 +1,126 @@ +from flask import Flask, request, make_response + +import json +import os +import functools +import time + +from eflomal import Aligner, sentences_from_joint_file +from tempfile import TemporaryDirectory + +import logging +logger = logging.getLogger(__name__) + + +DEFAULT_LOG_FORMAT = "[%(asctime)s] [%(process)d] [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s" + +ACCEPT_LOG_LEVELS = ["error", "info", "debug"] + + +class InputFormatException(Exception): + def __init__(self, msg): + self.msg = msg + + +def create_app(): + app = Flask(__name__, instance_relative_config=True) # why? + + app_config_path = os.environ.get('FLASK_APP_CONFIG') + with open(app_config_path) as f: + cfg = json.load(f) + + log_format = cfg.get("log_format", DEFAULT_LOG_FORMAT) + + log_level = cfg.get("log_level", "info") + if not log_level in ACCEPT_LOG_LEVELS: + raise Exception(f"log_level not one of {ACCEPT_LOG_LEVELS}") + + ll = None + if log_level == "error": + ll = logging.ERROR + elif log_level == "info": + ll = logging.INFO + elif log_level == "debug": + ll = logging.DEBUG + + logging.basicConfig( level=ll, format=log_format) + + logger.info("Read application config: %s", cfg) + + aligners = {} + for acfg in cfg["aligners"]: + name = acfg["name"] + pri = acfg["priors"] + logger.info(f"Loading aligner {name} with priors {pri}") + aligners[name] = create_aligner(pri) + + @app.route('/api/align/v1', methods=['POST']) + def alignV1(): + req = request.get_json() + aligner = aligners[req['aligner']] + + iters = [32, 32, 32] + if 'iters' in req: + req_iters = req['iters'] + if "1" in req_iters: iters[0] = req_iters["1"] + if "2" in req_iters: iters[1] = req_iters["2"] + if "3" in req_iters: iters[2] = req_iters["3"] + iters = tuple(iters) + + samplers = 3 # copied default + if 'samplers' in req: + samplers = int(req['samplers']) + + num_sents = len(req['sents']) + def input_iter(field): + for sent in req['sents']: + f = sent[field] + if type(f) == list: + f = ' '.join(f) + if type(f) != str: + raise InputFormatException("Sentence should be string") + yield f + src_iter = input_iter("s") + trg_iter = input_iter("t") + + t10 = time.time() + with TemporaryDirectory() as td: + fwd_fp = os.path.join(td, "req.fwd") + rev_fp = os.path.join(td, "req.rev") + + aligner.n_iterations = iters + aligner.n_samplers = samplers + try: + aligner.align(src_iter, trg_iter, + links_filename_fwd=fwd_fp, + links_filename_rev=rev_fp, + quiet=log_level != "debug") + except InputFormatException as e: + return make_response(e.msg, 400) + + with open(fwd_fp, 'r') as fwdf, open(rev_fp, 'r') as revf: + fr_pairs = [] + for f, r in zip(fwdf, revf): + fr_pairs.append({ "fwd": f.strip(), "rev": r.strip() }) + if len(fr_pairs) != num_sents: + raise Exception(f'Number of alignments differ from inputs: {len(fr_pairs)} != {num_sents}') + res = { "aligns": fr_pairs } + return res + + # Don't forget this. + return app + +def create_aligner(prior_path): + # TODO(config) more config if needed + aligner = Aligner() + with open(prior_path, 'r', encoding='utf-8') as priors_input: + aligner.preload_priors(priors_input) + return aligner + + +def main(): + app = create_app() + + +if __name__ == '__main__': + main() diff --git a/python/scripts/eflomal-server b/python/scripts/eflomal-server new file mode 100755 index 0000000..56d4146 --- /dev/null +++ b/python/scripts/eflomal-server @@ -0,0 +1,9 @@ +#!/bin/env bash +set -eu + +FLASK_APP_CONFIG=${FLASK_APP_CONFIG:-$(realpath server_config.json)} gunicorn \ + "eflomal.server:create_app()" \ + -b ${FLASK_HOST:-127.0.0.1}:${FLASK_PORT:-5000} \ + --access-logfile \ + - \ + --workers=${WORKERS:-2} diff --git a/server_config.json.example b/server_config.json.example new file mode 100644 index 0000000..48faaf9 --- /dev/null +++ b/server_config.json.example @@ -0,0 +1,8 @@ +{ + "aligners": [ + { "name": "some-name" + , "priors": "/path/to/prior" + } + ], + "log_level": "info" +} diff --git a/setup.py b/setup.py index 284ccd4..fc6b5a6 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def run(self): setup( name='eflomal', - version='1.0.0-beta2', + version='1.0.1', author='Robert Östling', url='https://github.com/robertostling/eflomal', license='GNU GPLv3', @@ -36,6 +36,7 @@ def run(self): long_description_content_type='text/markdown', install_requires=install_requires, tests_require=tests_require, + python_requires='>=3.12', extras_require={'test': tests_require}, packages=['eflomal'], package_dir={'': 'python'}, @@ -43,6 +44,6 @@ def run(self): 'eflomal': ['bin/eflomal'] }, ext_modules=cythonize(cyalign_ext, language_level='3'), - scripts=['python/scripts/eflomal-align', 'python/scripts/eflomal-makepriors'], + scripts=['python/scripts/eflomal-align', 'python/scripts/eflomal-makepriors', 'python/scripts/eflomal-server'], cmdclass={'build_py': build_py} ) From a46c1276e6108a2c62b94ba93967d1cdcfefe261 Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Fri, 16 May 2025 09:07:11 +0200 Subject: [PATCH 3/8] Distinguish forward/reverse pass in logs. --- python/eflomal/server.py | 2 +- src/eflomal.c | 30 +++++++++++++++++++----------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/python/eflomal/server.py b/python/eflomal/server.py index ac40db2..3db9348 100644 --- a/python/eflomal/server.py +++ b/python/eflomal/server.py @@ -60,7 +60,7 @@ def alignV1(): aligner = aligners[req['aligner']] iters = [32, 32, 32] - if 'iters' in req: + if 'iters' in req and req['iters']: req_iters = req['iters'] if "1" in req_iters: iters[0] = req_iters["1"] if "2" in req_iters: iters[1] = req_iters["2"] diff --git a/src/eflomal.c b/src/eflomal.c index 1bfda6e..7f7583f 100644 --- a/src/eflomal.c +++ b/src/eflomal.c @@ -1166,6 +1166,7 @@ static void align( const char *priors_filename) { double t0; + const char fr = reverse ? 'R' : 'F'; random_state state; struct text_alignment *tas[n_samplers]; @@ -1188,8 +1189,8 @@ static void align( } } if (!quiet) - fprintf(stderr, "Created alignment structures: %.3f s\n", - seconds() - t0); + fprintf(stderr, "[%c] Created alignment structures: %.3f s\n", + fr, seconds() - t0); t0 = seconds(); #pragma omp parallel for @@ -1202,13 +1203,15 @@ static void align( text_alignment_randomize(tas[i], &local_state); } if (!quiet) - fprintf(stderr, "Randomized alignment: %.3f s\n", seconds() - t0); + fprintf(stderr, "[%c] Randomized alignment: %.3f s\n", fr, + seconds() - t0); for (int m=1; m<=model; m++) { if (n_iters[m-1]) { if (!quiet) - fprintf(stderr, "Aligning with model %d (%d iterations)\n", - m, n_iters[m-1]); + fprintf(stderr, + "[%c] Aligning with model %d (%d iterations)\n", fr, m, + n_iters[m-1]); t0 = seconds(); #pragma omp parallel for @@ -1227,20 +1230,21 @@ static void align( } } if (!quiet) - fprintf(stderr, "Done: %.3f s\n", seconds() - t0); + fprintf(stderr, "[%c] Done: %.3f s\n", fr, seconds() - t0); } } t0 = seconds(); text_alignment_sample(tas[0], &state, NULL, tas, n_samplers); if (!quiet) - fprintf(stderr, "Final argmax iteration: %.3f s\n", seconds() - t0); + fprintf(stderr, "[%c] Final argmax iteration: %.3f s\n", fr, + seconds() - t0); struct text_alignment *ta = tas[0]; if (stats_filename != NULL) { if (!quiet) - fprintf(stderr, "Writing alignment statistics to %s\n", + fprintf(stderr, "[%c] Writing alignment statistics to %s\n", fr, stats_filename); FILE *file = (!strcmp(stats_filename, "-"))? stdout : fopen(stats_filename, "w"); @@ -1251,7 +1255,8 @@ static void align( if (links_filename != NULL) { if (!quiet) - fprintf(stderr, "Writing alignments to %s for %Zu sentencess\n", + fprintf(stderr, + "[%c] Writing alignments to %s for %Zu sentencess\n", fr, links_filename, ta->target->n_sentences); FILE *file = (!strcmp(links_filename, "-"))? stdout : fopen(links_filename, "w"); @@ -1267,10 +1272,11 @@ static void align( FILE *file = (!strcmp(scores_filename, "-"))? stdout : fopen(scores_filename, "w"); + t0 = seconds(); if (!quiet) fprintf(stderr, - "Computing scores using model %d for %Zu sentences\n", - score_model, ta->source->n_sentences); + "[%c] Computing scores using model %d for %Zu sentences\n", + fr, score_model, ta->source->n_sentences); // Switch to whatever model is specified for scoring ta->model = score_model; @@ -1281,6 +1287,8 @@ static void align( if (file != stdout) fclose(file); free(scores); + if (!quiet) + fprintf(stderr, "[%c] Scoring took: %.3f s\n", fr, seconds() - t0); } From 8dc4c5ceb7daf878233549888cb2e0aed246f5f8 Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Mon, 19 May 2025 22:36:48 +0200 Subject: [PATCH 4/8] Add options to skip/limit processed lines. --- python/eflomal/__init__.py | 14 ++++++++- python/scripts/eflomal-align | 55 ++++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py index dd58fdd..ce28c2d 100644 --- a/python/eflomal/__init__.py +++ b/python/eflomal/__init__.py @@ -196,7 +196,13 @@ def align(self, src_input, trg_input, class TextIndex: - """Word to index mapping with lowercasing and prefix/suffix removal""" + """ + Word to index mapping with lowercasing and prefix/suffix removal. + + Note that the returned indices are one larger than the indices in the + passed-in index, due to reserving output index 0 to the token. + + """ def __init__(self, index, prefix_len=0, suffix_len=0, lowercase=True): self.index = index @@ -266,6 +272,10 @@ def calculate_priors(src_sentences, trg_sentences, If `reverse` is True, compute priors for the opposite alignment direction. + + Note: stored priors are agnostic of the word transform used during + alignment, and is in terms of the original sentence words. + """ priors = Counter() hmmf_priors = Counter() @@ -274,6 +284,8 @@ def calculate_priors(src_sentences, trg_sentences, ferr_priors = Counter() for lineno, (src_sent, trg_sent, fwd_line, rev_line) in enumerate( zip(src_sentences, trg_sentences, fwd_alignments, rev_alignments)): + if lineno % 10000 == 0: + logger.info('processing line #%d', lineno) src_sent = src_sent.strip().split() trg_sent = trg_sent.strip().split() fwd_links = [tuple(map(int, s.split('-'))) for s in fwd_line.split()] diff --git a/python/scripts/eflomal-align b/python/scripts/eflomal-align index 873b480..5a1f4a6 100755 --- a/python/scripts/eflomal-align +++ b/python/scripts/eflomal-align @@ -11,6 +11,32 @@ import sys, argparse, os logger = logging.getLogger(__name__) +class LineSkipIterator: + def __init__(self, wrapped, skip=None, limit=None): + self._wrapped = wrapped + self._left = limit + if skip: + for i in range(0, skip): next(wrapped) + + def __iter__(self): + return self + + def __next__(self): + if self._left is not None: + if self._left <= 0: + raise StopIteration + else: + self._left -= 1 + return next(self._wrapped) + + def __enter__(self): + self._ctx = self._wrapped.__enter__() + return self + + def __exit__(self, et, ev, tb): + return self._wrapped.__exit__(et, ev, tb) + + def main(): parser = argparse.ArgumentParser( description='eflomal: efficient low-memory aligner') @@ -70,6 +96,14 @@ def main(): parser.add_argument( '-i', '--input', dest='joint_filename', type=str, metavar='filename', help='fast_align style ||| separated file') + parser.add_argument( + '-k', '--skip-lines', dest='skip_lines', default=None, metavar='X', + type=int, + help='Number of initial lines to skip in input') + parser.add_argument( + '-n', '--n-lines', dest='keep_lines', default=None, metavar='X', + type=int, + help='Number of lines to process at most (after optional skipping)') parser.add_argument( '-f', '--forward-links', dest='links_filename_fwd', type=str, metavar='filename', @@ -137,17 +171,28 @@ def main(): logger.info('Reading source/target sentences from %s...', args.joint_filename) src_in_f = stack.enter_context( - open(args.joint_filename, 'r', encoding='utf-8')) + LineSkipIterator( + open(args.joint_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) src_input = sentences_from_joint_file(src_in_f, 0) trg_in_f = stack.enter_context( - open(args.joint_filename, 'r', encoding='utf-8')) + LineSkipIterator( + open(args.joint_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) trg_input = sentences_from_joint_file(trg_in_f, 1) else: src_input = stack.enter_context( - open(args.source_filename, 'r', encoding='utf-8')) + LineSkipIterator( + open(args.source_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) trg_input = stack.enter_context( - open(args.target_filename, 'r', encoding='utf-8')) - + LineSkipIterator( + open(args.target_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) aligner.align(src_input, trg_input, links_filename_fwd=args.links_filename_fwd, links_filename_rev=args.links_filename_rev, From 1c3ddae6ddf9e601c903a18ad94fdaafb66c7a4c Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Fri, 23 May 2025 13:04:28 +0200 Subject: [PATCH 5/8] Fix reverse prior LEX direction? Not sure, but sounds logical. --- python/eflomal/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py index ce28c2d..2a18999 100644 --- a/python/eflomal/__init__.py +++ b/python/eflomal/__init__.py @@ -295,7 +295,9 @@ def calculate_priors(src_sentences, trg_sentences, logger.error('alignment out of bounds in line %d: ' '(%d, %d)', lineno + 1, i, j) raise ValueError('Invalid input on line %d' % lineno + 1) - priors[(src_sent[i], trg_sent[j])] += 1 + s, t = src_sent[i], trg_sent[j] + k = (t,s) if rev_alignments else (s,t) + priors[k] += 1 last_j = -1 last_i = -1 From ed239707ac70fab5acb0431fabacdbe24ec99ccc Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Fri, 23 May 2025 13:59:44 +0200 Subject: [PATCH 6/8] Server option trust_sents to disable n_clean. Passing trust_sents=False will set n_clean=0, which (after this change) means no sentences are trusted for statistics, so prior updates don't happen. Useful for batched sending of sentences of dubious quality. --- python/eflomal/__init__.py | 4 +++- python/eflomal/eflomal.pyx | 4 ++++ python/eflomal/server.py | 11 ++++++++++- src/eflomal.c | 21 ++++++++++++++------- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py index 2a18999..1ef5399 100644 --- a/python/eflomal/__init__.py +++ b/python/eflomal/__init__.py @@ -139,7 +139,8 @@ def prepare_files(self, src_input_file, src_output_file, def align(self, src_input, trg_input, links_filename_fwd=None, links_filename_rev=None, scores_filename_fwd=None, scores_filename_rev=None, - priors_input=None, quiet=True, use_gdb=False): + priors_input=None, trust_sents=True, + quiet=True, use_gdb=False): """Run alignment for the input""" with NamedTemporaryFile('wb') as srcf, \ NamedTemporaryFile('wb') as trgf, \ @@ -187,6 +188,7 @@ def align(self, src_input, trg_input, score_model=self.score_model, n_iterations=self.n_iterations, n_samplers=self.n_samplers, + n_clean=-1 if trust_sents else 0, quiet=quiet, rel_iterations=self.rel_iterations, null_prior=self.null_prior, diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx index 25de18b..f396661 100644 --- a/python/eflomal/eflomal.pyx +++ b/python/eflomal/eflomal.pyx @@ -103,6 +103,7 @@ def align( int score_model=0, tuple n_iterations=None, int n_samplers=1, + int n_clean=-1, bool quiet=True, double rel_iterations=1.0, double null_prior=0.2, @@ -123,6 +124,8 @@ def align( not given the numbers will be computed automatically based on rel_iterations n_samplers -- number of independent samplers to run + n_clean -- number of first N sentences to deem clean and use for stats + update (-1 = all). quiet -- if True, suppress output rel_iterations -- number of iterations relative to the default """ @@ -147,6 +150,7 @@ def align( '-s', source_filename, '-t', target_filename, '-n', str(n_samplers), + '-c', str(n_clean), '-N', str(null_prior), '-1', str(n_iterations[0])] if quiet: args.append('-q') diff --git a/python/eflomal/server.py b/python/eflomal/server.py index 3db9348..362820c 100644 --- a/python/eflomal/server.py +++ b/python/eflomal/server.py @@ -67,6 +67,14 @@ def alignV1(): if "3" in req_iters: iters[2] = req_iters["3"] iters = tuple(iters) + trust_sents = True + if 'trust_sents' in req: + f = req['trust_sents'] + if type(f) == bool: + trust_sents = f + else: + raise InputFormatException("trust_sents should be bool") + samplers = 3 # copied default if 'samplers' in req: samplers = int(req['samplers']) @@ -78,7 +86,7 @@ def input_iter(field): if type(f) == list: f = ' '.join(f) if type(f) != str: - raise InputFormatException("Sentence should be string") + raise InputFormatException("Sentence should be string or list of strings") yield f src_iter = input_iter("s") trg_iter = input_iter("t") @@ -94,6 +102,7 @@ def input_iter(field): aligner.align(src_iter, trg_iter, links_filename_fwd=fwd_fp, links_filename_rev=rev_fp, + trust_sents=trust_sents, quiet=log_level != "debug") except InputFormatException as e: return make_response(e.msg, 400) diff --git a/src/eflomal.c b/src/eflomal.c index 7f7583f..dac532d 100644 --- a/src/eflomal.c +++ b/src/eflomal.c @@ -100,7 +100,7 @@ struct text_alignment { // this number of sentences contain clean parallel data and should // contribute to the statistics (anything after this should still be // aligned, but don't trust the statistics): - size_t n_clean; // 0 (the default) means all sentences should be used + int32_t n_clean; // -1 (the default) means all sentences should be used count null_prior; }; @@ -230,7 +230,7 @@ void text_alignment_sample( count *jump_counts = ta->jump_counts; count *fert_counts = ta->fert_counts; const size_t n_sentences = - ta->n_clean? ta->n_clean: ta->target->n_sentences; + ta->n_clean >= 0 ? ta->n_clean: ta->target->n_sentences; // the fertility distributions (unlike the jump and lexical distributions) // are sampled explicitly, and the categorical distributions are fixed @@ -683,7 +683,7 @@ void text_alignment_make_counts(struct text_alignment *ta) { } } const size_t n_sentences = - ta->n_clean? ta->n_clean: ta->target->n_sentences; + ta->n_clean >= 0 ? ta->n_clean: ta->target->n_sentences; for (size_t sent=0; sentsentence_links[sent]; if (links == NULL) continue; @@ -971,7 +971,7 @@ struct text_alignment *text_alignment_create( ta->model = 1; ta->source = source; ta->target = target; - ta->n_clean = 0; + ta->n_clean = -1; // These should be initialized with text_alignment_load_priors() ta->source_prior = NULL; @@ -1158,6 +1158,7 @@ static void align( int score_model, double null_prior, int n_samplers, + int n_clean, int quiet, const int *n_iters, const char *links_filename, @@ -1176,6 +1177,7 @@ static void align( for (int i=0; in_clean = n_clean; tas[i]->null_prior = null_prior; if (priors_filename != NULL) { // TODO: since read-only, could use the pointer from tas[0] @@ -1227,7 +1229,11 @@ static void align( for (int j=0; j 3) { @@ -1397,7 +1404,7 @@ int main(int argc, char *argv[]) { (!reverse && links_filename_fwd == NULL && links_filename_rev == NULL)) align(reverse, source, target, model, score_model, null_prior, - n_samplers, + n_samplers, n_clean, quiet, n_iters, links_filename, stats_filename, scores_filename, priors_filename); } From ea15c412c6a2bef1011a6ed407f35d5f3f34f440 Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Thu, 29 May 2025 13:01:06 +0200 Subject: [PATCH 7/8] Support returning scores through server. --- python/eflomal/server.py | 45 +++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/python/eflomal/server.py b/python/eflomal/server.py index 362820c..7d95311 100644 --- a/python/eflomal/server.py +++ b/python/eflomal/server.py @@ -4,6 +4,7 @@ import os import functools import time +import math from eflomal import Aligner, sentences_from_joint_file from tempfile import TemporaryDirectory @@ -79,22 +80,37 @@ def alignV1(): if 'samplers' in req: samplers = int(req['samplers']) + scoring = True + if 'scoring' in req: + f = req['scoring'] + if type(f) == bool: + scoring = f + else: + raise InputFormatException("scoring should be bool") + num_sents = len(req['sents']) - def input_iter(field): - for sent in req['sents']: + sent_stoks = [0] * num_sents + sent_ttoks = [0] * num_sents + def input_iter(field, toks): + for n, sent in enumerate(req['sents']): f = sent[field] if type(f) == list: + toks[n] = len(f) f = ' '.join(f) - if type(f) != str: + elif type(f) == str: + toks[n] = len(f.split()) + else: raise InputFormatException("Sentence should be string or list of strings") yield f - src_iter = input_iter("s") - trg_iter = input_iter("t") + src_iter = input_iter("s", sent_stoks) + trg_iter = input_iter("t", sent_ttoks) t10 = time.time() with TemporaryDirectory() as td: fwd_fp = os.path.join(td, "req.fwd") rev_fp = os.path.join(td, "req.rev") + fsc_fp = os.path.join(td, "rsc.fwd") if scoring else None + rsc_fp = os.path.join(td, "rsc.rev") if scoring else None aligner.n_iterations = iters aligner.n_samplers = samplers @@ -102,15 +118,30 @@ def input_iter(field): aligner.align(src_iter, trg_iter, links_filename_fwd=fwd_fp, links_filename_rev=rev_fp, + scores_filename_fwd=fsc_fp, + scores_filename_rev=rsc_fp, trust_sents=trust_sents, quiet=log_level != "debug") except InputFormatException as e: return make_response(e.msg, 400) + scores = [] + if scoring: + with open(fsc_fp, 'r') as fscf, open(rsc_fp, 'r') as rscf: + for fs, rs in zip(fscf, rscf): + scores.append((float(fs), float(rs))) + with open(fwd_fp, 'r') as fwdf, open(rev_fp, 'r') as revf: fr_pairs = [] - for f, r in zip(fwdf, revf): - fr_pairs.append({ "fwd": f.strip(), "rev": r.strip() }) + for n, (f, r) in enumerate(zip(fwdf, revf)): + res = { "fwd": f.strip(), "rev": r.strip() } + if scoring: + fs, rs = scores[n] + res["score_fwd"] = fs + res["score_rev"] = rs + res["norm_score_fwd"] = fs - math.log(sent_ttoks[n]) + res["norm_score_rev"] = rs - math.log(sent_stoks[n]) + fr_pairs.append(res) if len(fr_pairs) != num_sents: raise Exception(f'Number of alignments differ from inputs: {len(fr_pairs)} != {num_sents}') res = { "aligns": fr_pairs } From a77b95e867628889e93af259c0706a8bca383051 Mon Sep 17 00:00:00 2001 From: Robin Palotai Date: Tue, 15 Jul 2025 20:58:32 +0200 Subject: [PATCH 8/8] Add small test server data example. --- devscripts/curl_server.sh | 9 ++++- devscripts/server_config.json | 8 +++++ devscripts/testdata/create-prior.sh | 3 ++ devscripts/testdata/my-align.fwd | 6 ++++ devscripts/testdata/my-align.pri | 56 +++++++++++++++++++++++++++++ devscripts/testdata/my-align.rev | 6 ++++ devscripts/testdata/my-align.txt | 6 ++++ 7 files changed, 93 insertions(+), 1 deletion(-) mode change 100644 => 100755 devscripts/curl_server.sh create mode 100644 devscripts/server_config.json create mode 100755 devscripts/testdata/create-prior.sh create mode 100644 devscripts/testdata/my-align.fwd create mode 100644 devscripts/testdata/my-align.pri create mode 100644 devscripts/testdata/my-align.rev create mode 100644 devscripts/testdata/my-align.txt diff --git a/devscripts/curl_server.sh b/devscripts/curl_server.sh old mode 100644 new mode 100755 index b28687d..99bbcf8 --- a/devscripts/curl_server.sh +++ b/devscripts/curl_server.sh @@ -1 +1,8 @@ -curl -X POST $@ localhost:5000/api/align/v1 -H 'Content-type: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "sents":[{"s":"Die Kuh", "t":"The cow"}]}' +#!/bin/env sh + +echo "Run eflomal-server in this directory, so it picks up server_config.json, then perform the sample request:" + +curl -X POST $@ localhost:5000/api/align/v1 -H 'Content-type: application/json' -H 'Accept: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "trust_sents": false, "sents":[{"s":"The cow and grass", "t":"Die Kuh und das Gras"}]}' + +# Approximate response: +# {"aligns":[{"fwd":"0-0 0-1 1-2 2-3 3-4","norm_score_fwd":3.954102087565899,"norm_score_rev":2.250345638880109,"rev":"0-0 1-1 2-3 3-4","score_fwd":5.56354,"score_rev":3.63664}]} diff --git a/devscripts/server_config.json b/devscripts/server_config.json new file mode 100644 index 0000000..52a21b3 --- /dev/null +++ b/devscripts/server_config.json @@ -0,0 +1,8 @@ +{ + "aligners": [ + { "name": "my-align", + "priors": "testdata/my-align.pri" + } + ], + "log_level": "debug" +} diff --git a/devscripts/testdata/create-prior.sh b/devscripts/testdata/create-prior.sh new file mode 100755 index 0000000..91eb82f --- /dev/null +++ b/devscripts/testdata/create-prior.sh @@ -0,0 +1,3 @@ +#!/bin/env sh +eflomal-align -i my-align.txt -f my-align.fwd -r my-align.rev +eflomal-makepriors -i my-align.txt -f my-align.fwd -r my-align.rev -p my-align.pri diff --git a/devscripts/testdata/my-align.fwd b/devscripts/testdata/my-align.fwd new file mode 100644 index 0000000..c97e1a8 --- /dev/null +++ b/devscripts/testdata/my-align.fwd @@ -0,0 +1,6 @@ +1-0 2-2 3-3 +0-0 1-1 2-2 4-3 6-4 +1-0 2-2 3-3 +0-0 2-1 4-2 +0-0 2-1 3-2 4-3 +1-0 2-2 3-3 4-4 5-5 6-6 diff --git a/devscripts/testdata/my-align.pri b/devscripts/testdata/my-align.pri new file mode 100644 index 0000000..93af577 --- /dev/null +++ b/devscripts/testdata/my-align.pri @@ -0,0 +1,56 @@ +LEX Auf There 1 +LEX Die cow 3 +LEX Freunde friends 1 +LEX Gras grass 1 +LEX Häschen rabbit 2 +LEX Lass Let 2 +LEX Wiese a 1 +LEX das the 2 +LEX der is 1 +LEX die cow 1 +LEX eine grass 1 +LEX frisst eats 1 +LEX ist is 1 +LEX mich see 2 +LEX schläfrig sleepy 1 +LEX sind are 1 +LEX steht on 1 +LEX und and 1 +FERF Let 1 2 +FERF There 1 1 +FERF a 1 1 +FERF and 1 1 +FERF are 1 1 +FERF cow 1 4 +FERF eats 1 1 +FERF friends 1 1 +FERF grass 1 2 +FERF is 1 2 +FERF on 1 1 +FERF rabbit 1 2 +FERF see 1 2 +FERF sleepy 1 1 +FERF the 1 2 +FERR Auf 1 1 +FERR Die 1 3 +FERR Freunde 1 1 +FERR Gras 1 1 +FERR Häschen 1 2 +FERR Kuh 1 5 +FERR Lass 1 2 +FERR Wiese 1 1 +FERR das 1 2 +FERR der 1 1 +FERR die 1 1 +FERR eine 1 1 +FERR frisst 1 1 +FERR ist 1 1 +FERR mich 1 2 +FERR schläfrig 1 1 +FERR sind 1 1 +FERR steht 1 1 +FERR und 1 1 +HMMF 1 22 +HMMF 2 8 +HMMR 1 33 +HMMR 2 2 diff --git a/devscripts/testdata/my-align.rev b/devscripts/testdata/my-align.rev new file mode 100644 index 0000000..85f02b9 --- /dev/null +++ b/devscripts/testdata/my-align.rev @@ -0,0 +1,6 @@ +0-0 1-1 2-2 3-3 +0-0 1-1 2-2 3-3 4-4 5-5 +0-0 1-1 2-2 3-3 +0-0 1-1 3-2 4-3 +0-0 1-1 3-2 4-3 +0-0 1-1 2-2 3-3 4-4 5-5 6-6 diff --git a/devscripts/testdata/my-align.txt b/devscripts/testdata/my-align.txt new file mode 100644 index 0000000..c55f1d2 --- /dev/null +++ b/devscripts/testdata/my-align.txt @@ -0,0 +1,6 @@ +The cow eats grass ||| Die Kuh frisst Gras +There is a cow on the grass ||| Auf der Wiese steht eine Kuh +The cow is sleepy ||| Die Kuh ist schläfrig +Let me see the cow ||| Lass mich die Kuh sehen +Let me see the rabbit ||| Lass mich das Häschen sehen +The cow and the rabbit are friends ||| Die Kuh und das Häschen sind Freunde