Changes from all commits
29 commits
81f99c4
add executable and python stuff
Jun 8, 2018
d2436f1
fix linking and installing on mac
Jun 8, 2018
76efda2
add description of this fork; add instructions to build on mac
Jun 8, 2018
80eca9a
autodetect Mac OS in Makefile; add precision to README
Jun 8, 2018
3580c3e
[WIP] start implementing mkmodel (see README)
Jun 8, 2018
d7697ba
implement voc extraction
Jun 8, 2018
70d9842
improve efficiency by preprocessing sentences once and for all
Jun 8, 2018
e46172b
add module defining an IBM1 model as a class with data and a getter
Jun 8, 2018
63c04d0
v0 implemented. Now to be tested
Jun 8, 2018
ea5bd82
various fixes
Jun 8, 2018
dd53d4f
make things faster by only iterating on non-zero counts
Jun 8, 2018
0570d22
don't force conversion of iterator zip to list
Jun 8, 2018
379f10d
add human readable dump method
Jun 8, 2018
3fa0279
DO force conversion of iterator zip to list since we'll be using len()
Jun 8, 2018
2aeedb1
fix human readable dump
Jun 8, 2018
b2e1125
fix human readable dump
Jun 8, 2018
1dee3a3
allow to read txt data from gz file
Jun 12, 2018
38df9a9
add method for computing IBM-1 sentence translation probability
Jun 12, 2018
9064129
cosmetic
Jun 12, 2018
7b0cd01
fix xopen wrapper handling of byte mode
Jun 12, 2018
67bec78
add convenience functions and package __init__
Jun 12, 2018
9626253
remove local implementation of convenience, import it as submodule
Jun 12, 2018
e7daa10
put convenience functions into convenience submodule
Jun 15, 2018
e9b239b
update convenience submodule
Jun 15, 2018
7a0884c
populate __init__.py
Jun 15, 2018
8c99921
simple script for getting model stats
Jun 16, 2018
f39b53c
Removed convenience submodule, now installed via pip and listed as a re…
Jun 18, 2018
ddfd90a
add docstrings
Jun 18, 2018
ecc664c
print out simple stats about the phrase table
Jun 18, 2018
56 changes: 56 additions & 0 deletions .gitignore
@@ -31,3 +31,59 @@
# Debug files
*.dSYM/
*.su

#binary
eflomal

#Python stuff

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/
Empty file added .gitmodules
Empty file.
9 changes: 7 additions & 2 deletions Makefile
@@ -1,7 +1,12 @@
CFLAGS=-Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g -fopenmp
# This is more suitable for debugging:
#CFLAGS=-Og -Wall --std=gnu99 -Wno-unused-function -g -fopenmp
LDFLAGS=-lm -lrt -lgomp -fopenmp
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
LDFLAGS=-lm -lgomp -fopenmp
else
LDFLAGS=-lm -lrt -lgomp -fopenmp
endif

all: eflomal

@@ -11,7 +16,7 @@ eflomal.o: eflomal.c natmap.c hash.c random.c simd_math_prims.h
eflomal: eflomal.o

install: eflomal
install -t /usr/local/bin eflomal
install eflomal /usr/local/bin/eflomal

clean:
rm -f eflomal eflomal.o
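For reference, the `ifeq` branch added above selects the link flags from the kernel name that `make` obtains via `uname -s`; you can check what it will see on your machine:

```shell
# The Makefile conditional keys off the kernel name reported by uname:
# "Darwin" on Mac OS (drop -lrt), "Linux" on Linux (keep -lrt).
uname -s
```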
12 changes: 11 additions & 1 deletion README.md
@@ -1,7 +1,11 @@
# eflomal
Efficient Low-Memory Aligner

This is a word alignment tool based on
This work is a fork of Robert Östling's [eflomal](https://github.com/robertostling/eflomal) with a few fixes and additional features:
* when building on Mac OS, remove `-lrt` from `LDFLAGS`
* add `mkmodel.py` script for computing translation probabilities directly from a parallel corpus; this first computes alignments using `eflomal`, then derives probabilities from them

`eflomal` is a word alignment tool based on
[efmaral](https://github.com/robertostling/efmaral), with the following main
differences:
* More compact data structures are used, so memory requirements are much
@@ -26,6 +30,12 @@ default `/usr/local/bin`. Note that the `align.py` script now uses the
`eflomal` executable in the same directory as `align.py`, rather than in
`$PATH`.

On Mac OS you will need to compile with `gcc`, because the system `clang` does not support OpenMP:
```
brew install gcc
export CC=/usr/local/bin/gcc-8
```
Change `CC` to match your settings if necessary. Then proceed to build and install normally.

## Using

1 change: 1 addition & 0 deletions __init__.py
@@ -0,0 +1 @@
from .ibm1 import IBM1
89 changes: 89 additions & 0 deletions ibm1.py
@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
"""Implement IBM1 translation model with translation table and estimator."""
from math import pow


class IBM1:
    r"""Implement IBM1 translation model with translation table and estimator.

    Class members:
        voc_s (dict{str -> int}): source vocabulary index
        voc_t (dict{str -> int}): target vocabulary index
        p (scipy.sparse.lil_matrix): source to target translation probabilities

    Class methods:
        __init__(self, p, voc_s, voc_t): instantiate an IBM1 object from a matrix and source and target vocabulary hash tables
        get: translation probability lookup
        estimate: compute phrase translation probability
        estimate_normalized: compute phrase translation probability, normalized by target phrase length
        dump: write out a human-readable serialization of the translation table
    """

    def __init__(self, p, voc_s, voc_t):
        """Instantiate an IBM1 object.

        :param p (scipy.sparse.lil_matrix): translation table stored as a sparse matrix
        :param voc_s (dict{str -> int}): source vocabulary index
        :param voc_t (dict{str -> int}): target vocabulary index
        """
        self.p = p
        self.voc_s = voc_s
        self.voc_t = voc_t

    def get(self, word_s, word_t):
        """Look up a translation probability. Parameters can be strings or indexes.

        :param word_s (str or int): source word
        :param word_t (str or int): target word
        :return: translation probability P(word_t|word_s)
        """
        s_index = word_s if isinstance(word_s, int) else self.voc_s.get(word_s, -1)
        t_index = word_t if isinstance(word_t, int) else self.voc_t.get(word_t, -1)

        if s_index < 0 or t_index < 0:
            return 0.0

        return self.p[s_index, t_index]

    def dump(self, file):
        """Write out a human-readable serialization of the translation table as a TSV file.

        :param file: file object opened for writing (works with convenience.XFiles)
        """
        voc_s_rev = {i: w for w, i in self.voc_s.items()}
        voc_t_rev = {i: w for w, i in self.voc_t.items()}
        X, Y = self.p.nonzero()
        for s, t in zip(X, Y):
            file.write("{}\t{}\t{}\n".format(voc_s_rev[s], voc_t_rev[t], self.p[s, t]))

    def estimate(self, S, T):
        r"""Compute the phrase translation probability according to the IBM1 model:
        P(T|S) = (1/len(S)) \prod_t \sum_s P(t|s)

        :param S: list of source words (str or int)
        :param T: list of target words (str or int)
        :return (float): P(T|S)
        """
        if len(S) == 0:
            return 0.0
        p = 1.0
        for t in T:
            partial = 0.0
            for s in S:
                partial += self.get(s, t)
            p = p * partial
        return p / len(S)

    def estimate_normalized(self, S, T):
        r"""Compute the phrase translation probability according to the IBM1 model, normalized in order not to penalize longer sentences.

        Pnorm(T|S) = P(T|S)^(1/len(T))

        :param S: list of source words (str or int)
        :param T: list of target words (str or int)
        :return (float): P(T|S)^(1/len(T))
        """
        p = self.estimate(S, T)
        if len(T) == 0:
            return p
        else:
            return pow(p, 1 / len(T))
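As a rough illustration of the scoring above (a minimal standalone sketch with invented words and probabilities, using a plain dict in place of the scipy sparse matrix):

```python
# Toy translation table: table[(s, t)] = P(t|s); all words and values invented
table = {("chat", "cat"): 0.9, ("noir", "black"): 0.8}

def get(s, t):
    # Missing pairs score 0.0, as in IBM1.get
    return table.get((s, t), 0.0)

def estimate(S, T):
    # (1/len(S)) * prod over t of (sum over s of P(t|s)), as in IBM1.estimate
    if len(S) == 0:
        return 0.0
    p = 1.0
    for t in T:
        p *= sum(get(s, t) for s in S)
    return p / len(S)

score = estimate(["chat", "noir"], ["cat", "black"])
print(round(score, 6))  # (0.9 * 0.8) / 2 = 0.36
```

Each target word contributes the sum of its translation probabilities over all source words, so unaligned words simply add zero rather than zeroing out the product.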
44 changes: 44 additions & 0 deletions ibm1stats.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import pickle

from convenience import xopen
from convenience import Logger
from convenience import header, blue, green, yellow, orange, red, bold, underline

if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input", help="Input model file. Pickle or text, gzipped or not. Automatically processed extensions are .pickle, .pickle.gz, .gz. Required.")
    parser.add_argument("--delimiter", "-d", type=str, dest="delimiter", default="\t",
                        help="Delimiter used in model file, if in text format. Use a plain string between quotes. Default=<tab>.")
    parser.add_argument("-v", "--verbosity", action="count", default=0, help="increase verbosity")
    args = parser.parse_args()
    logger = Logger(args.verbosity)

    logger.info("Loading model")

    if args.input.endswith(".pickle.gz") or args.input.endswith(".pickle"):
        logger.debug("Pickle detected")
        with xopen(args.input, "rb") as f:
            model = pickle.load(f)
        print(blue("Source vocabulary size:\t" + bold(str(len(model.voc_s)))))
        print(blue("Target vocabulary size:\t" + bold(str(len(model.voc_t)))))
        print(green("Number of entries:\t" + bold(str(model.p.count_nonzero()))))

    else:
        logger.debug("Text format detected")
        with xopen(args.input, "r") as f:
            n_entries = 0
            voc_s = set()
            voc_t = set()
            for line in f:
                entry = line.split(args.delimiter, maxsplit=2)
                voc_s.add(entry[0])
                voc_t.add(entry[1])
                n_entries += 1

        print(blue("Source vocabulary size:\t" + bold(str(len(voc_s)))))
        print(blue("Target vocabulary size:\t" + bold(str(len(voc_t)))))
        print(green("Number of entries:\t" + bold(str(n_entries))))
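The text branch simply counts distinct first and second columns. On a toy TSV in the dump format (entries invented for illustration) it behaves like this:

```python
# Toy TSV lines in the dump format: source \t target \t probability
lines = ["chat\tcat\t0.9\n", "chat\tkitty\t0.1\n", "noir\tblack\t0.8\n"]

voc_s, voc_t, n_entries = set(), set(), 0
for line in lines:
    entry = line.split("\t", maxsplit=2)  # maxsplit=2 keeps the probability field intact
    voc_s.add(entry[0])
    voc_t.add(entry[1])
    n_entries += 1

print(len(voc_s), len(voc_t), n_entries)  # 2 3 3
```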