diff --git a/.gitignore b/.gitignore index f805e81..e8a21ac 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,59 @@ # Debug files *.dSYM/ *.su + +#binary +eflomal + +#Python stuff + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +# Sphinx documentation +docs/_build/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e69de29 diff --git a/Makefile b/Makefile index b467c1e..2b89b59 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,12 @@ CFLAGS=-Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g -fopenmp # This is more suitable for debugging: #CFLAGS=-Og -Wall --std=gnu99 -Wno-unused-function -g -fopenmp -LDFLAGS=-lm -lrt -lgomp -fopenmp +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + LDFLAGS=-lm -lgomp -fopenmp +else + LDFLAGS=-lm -lrt -lgomp -fopenmp +endif all: eflomal @@ -11,7 +16,7 @@ eflomal.o: eflomal.c natmap.c hash.c random.c simd_math_prims.h eflomal: eflomal.o install: eflomal - install -t /usr/local/bin eflomal + install eflomal /usr/local/bin/eflomal clean: rm -f eflomal eflomal.o diff --git a/README.md b/README.md index ced3b31..adaa081 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ # eflomal Efficient Low-Memory Aligner -This is a word alignment tool based on +This work is a fork of Robert Ösling's [eflomal](https://github.com/robertostling/eflomal) with a few fixes and additional features: +* when builing on Mac OS, remove `-lrt` from `LDFLAGS` +* add `mkmodel.py` script for computing translation probabilities directly 
class IBM1():
    """Implement IBM1 translation model with translation table and estimator.

    Class members:
        voc_s (dict{str -> int}): source vocabulary index
        voc_t (dict{str -> int}): target vocabulary index
        p (scipy.sparse.lil_matrix): source-to-target translation probabilities

    Class methods:
        __init__: instantiate an IBM1 object from a matrix and two vocabulary hash tables
        get: translation probability look-up
        estimate: compute phrase translation probability
        estimate_normalized: phrase translation probability normalized by target phrase length
        dump: write out human-readable serialization of the translation table
    """

    def __init__(self, p, voc_s, voc_t):
        """Instantiate an IBM1 object.

        :param p (scipy.sparse.lil_matrix): translation table stored as a sparse matrix
        :param voc_s (dict{str -> int}): source vocabulary index
        :param voc_t (dict{str -> int}): target vocabulary index
        """
        self.p = p
        self.voc_s = voc_s
        self.voc_t = voc_t

    def get(self, word_s, word_t):
        """Look up translation probability. Parameters can be strings or indexes.

        :param word_s (str or int): source word (surface form or vocabulary index)
        :param word_t (str or int): target word (surface form or vocabulary index)
        :return (float): translation probability P(word_t|word_s); 0.0 for unknown words
        """
        # isinstance() is the idiomatic type test; ints are taken as ready-made
        # indexes, strings are resolved through the vocabulary (-1 = unknown).
        s_index = word_s if isinstance(word_s, int) else self.voc_s.get(word_s, -1)
        t_index = word_t if isinstance(word_t, int) else self.voc_t.get(word_t, -1)

        if s_index < 0 or t_index < 0:
            return 0.0

        return self.p[s_index, t_index]

    def dump(self, file):
        """Write out human-readable serialization of the translation table as a TSV file.

        :param file: file object opened for writing (works with convenience.XFiles)
        """
        # Invert the vocabulary indexes (index -> word) to label matrix entries.
        voc_s_rev = {i: w for w, i in self.voc_s.items()}
        voc_t_rev = {i: w for w, i in self.voc_t.items()}
        rows, cols = self.p.nonzero()
        for s, t in zip(rows, cols):
            file.write("{}\t{}\t{}\n".format(voc_s_rev[s], voc_t_rev[t], self.p[s, t]))

    def estimate(self, S, T):
        r"""Compute phrase translation probability according to the IBM1 model.

        Returns (\prod_t \sum_s P(t|s)) / len(S).
        NOTE(review): textbook IBM1 divides by len(S) once per target word;
        here the product is divided by len(S) only once — confirm this is the
        intended scoring variant before changing it.

        :param S: list of source words (str or int)
        :param T: list of target words (str or int)
        :return (float): estimated P(T|S); 0.0 if S is empty
        """
        if not S:
            return 0.0
        p = 1.0
        for t in T:
            p *= sum(self.get(s, t) for s in S)
        return p / len(S)

    def estimate_normalized(self, S, T):
        """Compute phrase translation probability according to the IBM1 model,
        normalized in order not to penalize longer target phrases.

        Pnorm(T|S) = P(T|S)^(1/len(T))

        :param S: list of source words (str or int)
        :param T: list of target words (str or int)
        :return (float): P(T|S)^(1/len(T)); the unnormalized estimate if T is empty
        """
        p = self.estimate(S, T)
        if len(T) == 0:
            return p
        return pow(p, 1 / len(T))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Print basic statistics (vocabulary sizes, number of entries) for an IBM1 model file."""

import argparse
import pickle

from convenience import xopen
from convenience import Logger
from convenience import header, blue, green, yellow, orange, red, bold, underline

if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input",
                        help="Input model file. Pickle or text, gzipped or not. "
                             "Automatically processed extensions are .pickle, .pickle.gz, .gz. Required.")
    parser.add_argument("--delimiter", "-d", type=str, dest="delimiter", default="\t",
                        help="Delimiter used in model file, if in text format. "
                             "Use plain string between quotes. Default=TAB.")
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="increase verbosity")
    args = parser.parse_args()
    logger = Logger(args.verbosity)

    logger.info("Loading model")

    if args.input.endswith((".pickle.gz", ".pickle")):
        logger.debug("Pickle detected")
        # SECURITY: pickle.load executes arbitrary code from the file —
        # only run this on model files you trust.
        with xopen(args.input, "rb") as f:
            model = pickle.load(f)
        print(blue("Source vocabulary size:\t" + bold(str(len(model.voc_s)))))
        print(blue("Target vocabulary size:\t" + bold(str(len(model.voc_t)))))
        print(green("Number of entries:\t" + bold(str(model.p.count_nonzero()))))
    else:
        logger.debug("Text format detected")
        with xopen(args.input, "r") as f:
            n_entries = 0
            voc_s = set()
            voc_t = set()
            # Iterate the file lazily instead of readlines(): a large model
            # file no longer has to fit in memory all at once.
            for line in f:
                fields = line.split(args.delimiter, maxsplit=2)
                if len(fields) < 2:
                    # Skip blank or malformed lines instead of crashing
                    # with an IndexError.
                    continue
                voc_s.add(fields[0])
                voc_t.add(fields[1])
                n_entries += 1

        print(blue("Source vocabulary size:\t" + bold(str(len(voc_s)))))
        print(blue("Target vocabulary size:\t" + bold(str(len(voc_t)))))
        print(green("Number of entries:\t" + bold(str(n_entries))))
def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase):
    """Accumulate forward (source -> target) alignment co-occurrence counts.

    Reads one line of "i-j" alignment pairs per sentence pair from
    `alignment_filename` and counts how often each source word is linked to
    each target word.

    :param voc_s (dict{str -> int}): source vocabulary index
    :param voc_t (dict{str -> int}): target vocabulary index
    :param src_sents: list of tokenized source sentences (lists of str)
    :param trg_sents: list of tokenized target sentences (lists of str)
    :param alignment_filename (str): alignment file, one "i-j ..." line per sentence pair
    :param lowercase: accepted for interface compatibility but currently unused —
        TODO confirm whether tokens were meant to be lowercased here
    :return: (counts, s_counts) where counts[s, t] is the number of links between
        source word s and target word t, and s_counts[s] is the total number of
        links involving source word s
    """
    # len(voc) is equivalent to len(voc.items()) without building the view.
    counts = lil_matrix((len(voc_s), len(voc_t)))
    s_counts = zeros(len(voc_s))

    with xopen(alignment_filename, "r") as afile:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(src_sents)).start()
        # zip() pairs each alignment line with its sentence pair and stops
        # cleanly at the shorter input, replacing the manual read/index cursor
        # which could step past the sentence lists at end of file.
        for i, (aline, s, t) in enumerate(zip(afile, src_sents, trg_sents), start=1):
            for apair in aline.split():
                x, y = apair.split("-")
                token_s_id = voc_s[s[int(x)]]
                token_t_id = voc_t[t[int(y)]]
                counts[token_s_id, token_t_id] += 1
                s_counts[token_s_id] += 1
            pbar.update(i)
        pbar.finish()

    return counts, s_counts
reverse probabilities...") + p = compute_p(voc_t, voc_s, counts, t_counts) + logger.info("Saving reverse probabilities...") + model = IBM1(p, voc_t, voc_s) + save_p(model, args.p_filename_rev) + if args.p_filename_rev_h is not None: + with xopen(args.p_filename_rev_h, "w") as f: + model.dump(f) + + fwd_alignment_file.close() + rev_alignment_file.close() + + +if __name__ == '__main__': main() + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..adcd978 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +convenience