From 81f99c432f4f79382e2412f8c5054e4c50c0e2f4 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 11:44:10 +0200 Subject: [PATCH 01/29] add executable and python stuff --- .gitignore | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/.gitignore b/.gitignore index f805e81..e8a21ac 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,59 @@ # Debug files *.dSYM/ *.su + +#binary +eflomal + +#Python stuff + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +# Sphinx documentation +docs/_build/ From d2436f1f87d024d5335378e13bb63bd4b19e6b4c Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 11:44:23 +0200 Subject: [PATCH 02/29] fix linking and installing on mac --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b467c1e..6307576 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,8 @@ CFLAGS=-Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g -fopenmp # This is more suitable for debugging: #CFLAGS=-Og -Wall --std=gnu99 -Wno-unused-function -g -fopenmp -LDFLAGS=-lm -lrt -lgomp -fopenmp +#LDFLAGS=-lm -lrt -lgomp -fopenmp +LDFLAGS=-lm -lgomp -fopenmp all: eflomal @@ -11,7 +12,7 @@ eflomal.o: eflomal.c natmap.c hash.c random.c simd_math_prims.h eflomal: eflomal.o install: eflomal - install -t /usr/local/bin eflomal + install eflomal /usr/local/bin/eflomal clean: rm -f eflomal eflomal.o From 76efda267aa88c99a8ddcba534f6e99073d10606 Mon Sep 17 
00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 11:44:48 +0200 Subject: [PATCH 03/29] add description of this fork; add instructions to build on mac --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ced3b31..2d3411d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ # eflomal Efficient Low-Memory Aligner -This is a word alignment tool based on +This work is a fork of Robert Ösling's [eflomal](https://github.com/robertostling/eflomal) with a few fixes and additional features: +* removed `-lrt` from `LDFLAGS` to allow linking on Max OS using `gcc` (***not*** `clang`) +* `mkmodel.py` script for computing translation probabilities directly from a parallel corpus; this first computes alignment using `eflomal` then derives probabilities from it + +`eflomal` is a word alignment tool based on [efmaral](https://github.com/robertostling/efmaral), with the following main differences: * More compact data structures are used, so memory requirements are much @@ -26,6 +30,12 @@ default `/usr/local/bin`. Note that the `align.py` script now uses the `eflomal` executable in the same directory as `align.py`, rather than in `$PATH`. +On mac you will need to compile using `gcc`: +``` + brew install gcc + export CC=/usr/local/bin/gcc-8 +``` +Then proceed to build and install normally. 
## Using From 80eca9a130b822e1b98776cad5efa9ef384cd1f9 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 11:55:50 +0200 Subject: [PATCH 04/29] autodetect Mac OS in Makefile; add precision to README --- Makefile | 8 ++++++-- README.md | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6307576..2b89b59 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,12 @@ CFLAGS=-Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g -fopenmp # This is more suitable for debugging: #CFLAGS=-Og -Wall --std=gnu99 -Wno-unused-function -g -fopenmp -#LDFLAGS=-lm -lrt -lgomp -fopenmp -LDFLAGS=-lm -lgomp -fopenmp +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + LDFLAGS=-lm -lgomp -fopenmp +else + LDFLAGS=-lm -lrt -lgomp -fopenmp +endif all: eflomal diff --git a/README.md b/README.md index 2d3411d..adaa081 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ Efficient Low-Memory Aligner This work is a fork of Robert Ösling's [eflomal](https://github.com/robertostling/eflomal) with a few fixes and additional features: -* removed `-lrt` from `LDFLAGS` to allow linking on Max OS using `gcc` (***not*** `clang`) -* `mkmodel.py` script for computing translation probabilities directly from a parallel corpus; this first computes alignment using `eflomal` then derives probabilities from it +* when builing on Mac OS, remove `-lrt` from `LDFLAGS` +* add `mkmodel.py` script for computing translation probabilities directly from a parallel corpus; this first computes alignment using `eflomal` then derives probabilities from it `eflomal` is a word alignment tool based on [efmaral](https://github.com/robertostling/efmaral), with the following main @@ -30,12 +30,12 @@ default `/usr/local/bin`. Note that the `align.py` script now uses the `eflomal` executable in the same directory as `align.py`, rather than in `$PATH`. 
-On mac you will need to compile using `gcc`: +On mac you will need to compile using `gcc` because `clang` does not support `openmp`: ``` brew install gcc export CC=/usr/local/bin/gcc-8 ``` -Then proceed to build and install normally. +Change `CC` to match your settings if necessary. Then proceed to build and install normally. ## Using From 3580c3efc97db0ea7c0468e76fc285d2a022fcb0 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 13:29:54 +0200 Subject: [PATCH 05/29] [WIP] start implementing mkmodel (see README) --- mkmodel.py | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100755 mkmodel.py diff --git a/mkmodel.py b/mkmodel.py new file mode 100755 index 0000000..36bc8e1 --- /dev/null +++ b/mkmodel.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +from eflomal import read_text, write_text, align + +import sys, argparse, random, os +from tempfile import NamedTemporaryFile + +from scipy.sparse import lil_matrix + +def compute_counts(): + fwd_alignment_file.name + +def main(): + parser = argparse.ArgumentParser( + description='mkmodel.py: compute IBM-1 translation probabilties using eflomal, the efficient low-memory aligner') + parser.add_argument( + '-v', '--verbose', dest='verbose', + action='store_true', help='Enable verbose output') + parser.add_argument( + '--debug', dest='debug', + action='store_true', help='Enable gdb debugging of eflomal binary') + parser.add_argument( + '--overwrite', dest='overwrite', + action='store_true', help='Overwrite existing output files') + parser.add_argument( + '--null-prior', dest='null_prior', default=0.2, metavar='X', + type=float, help='Prior probability of NULL alignment') + parser.add_argument( + '-m', '--model', dest='model', default=3, metavar='N', + type=int, help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)') + parser.add_argument( + '--source-prefix', dest='source_prefix_len', default=0, metavar='N', + type=int, help='Length of prefix 
for stemming (source)') + parser.add_argument( + '--source-suffix', dest='source_suffix_len', default=0, metavar='N', + type=int, help='Length of suffix for stemming (source)') + parser.add_argument( + '--target-prefix', dest='target_prefix_len', default=0, metavar='N', + type=int, help='Length of prefix for stemming (target)') + parser.add_argument( + '--target-suffix', dest='target_suffix_len', default=0, metavar='N', + type=int, help='Length of suffix for stemming (target)') + parser.add_argument( + '-l', '--length', dest='length', default=1.0, metavar='X', + type=float, help='Relative number of sampling iterations') + parser.add_argument( + '-1', '--ibm1-iters', dest='iters1', default=None, metavar='X', + type=int, help='Number of IBM1 iterations (overrides --length)') + parser.add_argument( + '-2', '--hmm-iters', dest='iters2', default=None, metavar='X', + type=int, help='Number of HMM iterations (overrides --length)') + parser.add_argument( + '-3', '--fert-iters', dest='iters3', default=None, metavar='X', + type=int, + help='Number of HMM+fertility iterations (overrides --length)') + parser.add_argument( + '--n-samplers', dest='n_samplers', default=3, metavar='X', + type=int, help='Number of independent samplers to run') + parser.add_argument( + '-s', '--source', dest='source_filename', type=str, metavar='filename', + help='Source text filename', required=True) + parser.add_argument( + '-t', '--target', dest='target_filename', type=str, metavar='filename', + help='Target text filename', required=True) + parser.add_argument( + '-f', '--forward-probabilities', dest='p_filename_fwd', type=str, + metavar='filename', + help='Filename to write forward direction probabilities to') + parser.add_argument( + '-r', '--reverse-probabilities', dest='p_filename_rev', type=str, + metavar='filename', + help='Filename to write reverse direction probabilities to') + + args = parser.parse_args() + + if args.p_filename_fwd is None and args.p_filename_rev is None: + print('ERROR: 
no file to save probabilities (-f/-r), will do nothing.', + file=sys.stderr, flush=True) + sys.exit(1) + + for filename in (args.source_filename, args.target_filename): + if not os.path.exists(filename): + print('ERROR: input file %s does not exist!' % filename, + file=sys.stderr, flush=True) + sys.exit(1) + + for filename in (args.p_filename_fwd, args.p_filename_rev): + if (not args.overwrite) and (filename is not None) \ + and os.path.exists(filename): + print('ERROR: output file %s exists, will not overwrite!' % \ + filename, + file=sys.stderr, flush=True) + sys.exit(1) + + if args.verbose: + print('Reading source text from %s...' % args.source_filename, + file=sys.stderr, flush=True) + with open(args.source_filename, 'r', encoding='utf-8') as f: + src_sents, src_index = read_text( + f, True, args.source_prefix_len, args.source_suffix_len) + n_src_sents = len(src_sents) + src_voc_size = len(src_index) + src_index = None + srcf = NamedTemporaryFile('wb') + write_text(srcf, tuple(src_sents), src_voc_size) + + if args.verbose: + print('Reading target text from %s...' % args.target_filename, + file=sys.stderr, flush=True) + with open(args.target_filename, 'r', encoding='utf-8') as f: + trg_sents, trg_index = read_text( + f, True, args.target_prefix_len, args.target_suffix_len) + trg_voc_size = len(trg_index) + n_trg_sents = len(trg_sents) + trg_index = None + trgf = NamedTemporaryFile('wb') + write_text(trgf, tuple(trg_sents), trg_voc_size) + + if n_src_sents != n_trg_sents: + print('ERROR: number of sentences differ in input files (%d vs %d)' % ( + n_src_sents, n_trg_sents), + file=sys.stderr, flush=True) + sys.exit(1) + + iters = (args.iters1, args.iters2, args.iters3) + if any(x is None for x in iters[:args.model]): + iters = None + + if args.verbose: + print('Aligning %d sentences...' 
% n_src_sents, + file=sys.stderr, flush=True) + + fwd_alignment_file = NamedTemporaryFile('w') + rev_alignment_file = NamedTemporaryFile('w') + + align(srcf.name, trgf.name, + links_filename_fwd=fwd_alignment_file.name, + links_filename_rev=rev_alignment_file.name, + statistics_filename=None, + scores_filename=None, + model=args.model, + n_iterations=iters, + n_samplers=args.n_samplers, + quiet=not args.verbose, + rel_iterations=args.length, + null_prior=args.null_prior, + use_gdb=args.debug) + + srcf.close() + trgf.close() + + voc_s = make_voc(src_sents) + voc_t = make_voc(trg_sents) + + if args.p_filename_fwd is not None: + counts = compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name) + p = compute_p(voc_s, voc_t, counts) + save_p(p, voc_s, voc_t, args.p_filename_fwd) + + if args.p_filename_rev is not None: + counts = compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, rev_alignment_file.name) + p = compute_p(voc_t, voc_s, counts) + save_p(p, voc_t, voc_s, args.p_filename_rev) + + fwd_alignment_file.close() + rev_alignment_file.close() + + +if __name__ == '__main__': main() + From d7697baf967de51c7b2151133a60256c54c73a9a Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 13:52:33 +0200 Subject: [PATCH 06/29] implement voc extraction --- mkmodel.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 36bc8e1..35d7905 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -7,8 +7,24 @@ from scipy.sparse import lil_matrix -def compute_counts(): - fwd_alignment_file.name +def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): + counts = lil_matrix(len(voc_s.items()), len(voc_t.items())) + return counts + +def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): + counts = lil_matrix(len(voc_t.items()), len(voc_s.items())) + return counts + +def make_voc(sentences, lowercase=False): + 
voc = {} + index = 0 + for sent in sentences: + if lowercase: sent = sent.lower() + for token in sent.split(): + if token not in voc: + voc[token] = index + index += 1 + return voc def main(): parser = argparse.ArgumentParser( @@ -19,6 +35,9 @@ def main(): parser.add_argument( '--debug', dest='debug', action='store_true', help='Enable gdb debugging of eflomal binary') + parser.add_argument( + '--no-lowercase', dest='lowercase', + action='store_false', default=True, help='Do not lowercase input text') parser.add_argument( '--overwrite', dest='overwrite', action='store_true', help='Overwrite existing output files') @@ -97,7 +116,7 @@ def main(): file=sys.stderr, flush=True) with open(args.source_filename, 'r', encoding='utf-8') as f: src_sents, src_index = read_text( - f, True, args.source_prefix_len, args.source_suffix_len) + f, args.lowercase, args.source_prefix_len, args.source_suffix_len) n_src_sents = len(src_sents) src_voc_size = len(src_index) src_index = None @@ -109,7 +128,7 @@ def main(): file=sys.stderr, flush=True) with open(args.target_filename, 'r', encoding='utf-8') as f: trg_sents, trg_index = read_text( - f, True, args.target_prefix_len, args.target_suffix_len) + f, args.lowercase, args.target_prefix_len, args.target_suffix_len) trg_voc_size = len(trg_index) n_trg_sents = len(trg_sents) trg_index = None @@ -149,16 +168,16 @@ def main(): srcf.close() trgf.close() - voc_s = make_voc(src_sents) - voc_t = make_voc(trg_sents) + voc_s = make_voc(src_sents, args.lowercase) + voc_t = make_voc(trg_sents, args.lowercase) if args.p_filename_fwd is not None: - counts = compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name) + counts = compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name, args.lowercase) p = compute_p(voc_s, voc_t, counts) save_p(p, voc_s, voc_t, args.p_filename_fwd) if args.p_filename_rev is not None: - counts = compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, rev_alignment_file.name) + 
counts = compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, rev_alignment_file.name, args.lowercase) p = compute_p(voc_t, voc_s, counts) save_p(p, voc_t, voc_s, args.p_filename_rev) From 70d98427a71d684d5d9185c43b4afc48226551a5 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 13:56:40 +0200 Subject: [PATCH 07/29] improve efficiency by preprocessing sentences once and for all --- mkmodel.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 35d7905..2c07e05 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -15,12 +15,18 @@ def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l counts = lil_matrix(len(voc_t.items()), len(voc_s.items())) return counts -def make_voc(sentences, lowercase=False): +def preprocess(sentences, lowercase): + processed = [] + for sent in sentences: + if lowercase: sent = sent.lower() + processed.append(sent.split()) + return processed + +def make_voc(sentences): voc = {} index = 0 for sent in sentences: - if lowercase: sent = sent.lower() - for token in sent.split(): + for token in sent: if token not in voc: voc[token] = index index += 1 @@ -167,9 +173,14 @@ def main(): srcf.close() trgf.close() + + # split and, if requested, lowercase tokens + src_sents = preprocess(src_sents, args.lowercase) + trg_sents = preprocess(trg_sents, args.lowercase) - voc_s = make_voc(src_sents, args.lowercase) - voc_t = make_voc(trg_sents, args.lowercase) + # extract token --> index hash table + voc_s = make_voc(src_sents) + voc_t = make_voc(trg_sents) if args.p_filename_fwd is not None: counts = compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name, args.lowercase) From e46172badcacc222f69fe33276aaa1f7c7435ccd Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 14:45:40 +0200 Subject: [PATCH 08/29] add module defining an IBM1 model as a class with data and a getter --- ibm1.py | 16 ++++++++++++++++ 1 file 
changed, 16 insertions(+) create mode 100644 ibm1.py diff --git a/ibm1.py b/ibm1.py new file mode 100644 index 0000000..db36203 --- /dev/null +++ b/ibm1.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- + + +class IBM1(): + def __init__(self, p, voc_s, voc_t): + self.p = p + self.voc_s = voc_s + self.voc_t = voc_t + + def get(self, word_s, word_t): + s_index = word_s if type(word_s) == int else self.voc_s.get(word_s, -1) + t_index = word_t if type(word_t) == int else self.voc_t.get(word_t, -1) + + if s_index < 0 or t_index < 0: return 0.0 + + return self.p[s_index, t_index] From 63c04d0b6a69560061ec2235a07e0c284d70bea6 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 14:46:13 +0200 Subject: [PATCH 09/29] v0 implemented. Now to be tested --- mkmodel.py | 135 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 9 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 2c07e05..01ebf67 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -1,19 +1,131 @@ #!/usr/bin/env python3 from eflomal import read_text, write_text, align +from ibm1 import IBM1 -import sys, argparse, random, os +import sys, argparse, os from tempfile import NamedTemporaryFile +import pickle from scipy.sparse import lil_matrix +from numpy import zeros -def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): - counts = lil_matrix(len(voc_s.items()), len(voc_t.items())) - return counts +class XFile(): + def __init__(self, f, encoding="utf8"): + self.encoding = encoding + self.file = f + + def __enter__(self): + return self + + def __exit__(self, arg1, arg2, arg3): + return self.file.__exit__(arg1, arg2, arg3) + + def close(self): + self.file.close() + + def write(self, line): + if isinstance(self.file, gzip.GzipFile): + return self.file.write(line.encode(self.encoding)) + else: + return self.file.write(line) + + def read(self, size=-1): + line = f.read(size) + try: + return line.decode() if type(line)==bytes else line + 
except: + return line + + def readline(self, size): + line = f.readline(size) + try: + return line.decode(self.encoding) if type(line)==bytes else line + except: + return line + + def readlines(self, hint=-1): + if isinstance(self.file, gzip.GzipFile): + return [l.decode(self.encoding) for l in self.file.readlines(hint)] + else: + return self.file.readlines(hint) + + +def xopen(fname, mode="r", encoding="utf8"): + if fname.endswith(".gz"): + return XFile(gzip.open(fname, mode=mode), encoding) + else: + return XFile(open(fname, mode, encoding=encoding), encoding) def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): + + counts = lil_matrix((len(voc_s.items()), len(voc_t.items()))) + s_counts = zeros(len(voc_s.items())) + + with xopen(alignment_filename , "r") as f: + i = 0 + s = src_sents[i] + t = trg_sents[i] + aline = f.read() + while aline != "": + a = [(int(x), int(y)) for x,y in [apair.split("-") for apair in aline.split()]] + + for s_i, t_i in a: + token_s = s[s_i] + token_t = t[t_i] + token_s_id = voc_s[token_s] + token_t_id = voc_s[token_t] + counts[token_s_id, token_t_id] += 1 + s_counts[token_s_id] += 1 + + i += 1 + if i < len(src_sents): + s = src_sents[i] + t = trg_sents[i] + aline = f.read() + + return counts, s_counts + +def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): + counts = lil_matrix(len(voc_t.items()), len(voc_s.items())) - return counts + t_counts = zeros(len(voc_t.items())) + + with xopen(alignment_filename, "r") as f: + i = 0 + s = src_sents[i] + t = trg_sents[i] + aline = f.read() + while aline!="": + a = [(int(x), int(y)) for x, y in [apair.split("-") for apair in aline.split()]] + + for s_i, t_i in a: + token_s = s[s_i] + token_t = t[t_i] + token_s_id = voc_s[token_s] + token_t_id = voc_s[token_t] + counts[token_t_id, token_s_id] += 1 + t_counts[token_t_id] += 1 + + i += 1 + if i 0: + for t_id in range(len(voc_t.items())): + p[s_id, t_id] = counts[s_id, t_id] / 
word_counts[s_id] + else: + p[s_id,:] = zeros(len(voc_t.items())) + + return p def preprocess(sentences, lowercase): processed = [] @@ -32,6 +144,11 @@ def make_voc(sentences): index += 1 return voc +def save_p(p, voc_t, voc_s, fname): + model = IBM1(p, voc_t, voc_s) + with xopen(fname, "w") as f: + pickle.dump(model, f) + def main(): parser = argparse.ArgumentParser( description='mkmodel.py: compute IBM-1 translation probabilties using eflomal, the efficient low-memory aligner') @@ -183,13 +300,13 @@ def main(): voc_t = make_voc(trg_sents) if args.p_filename_fwd is not None: - counts = compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name, args.lowercase) - p = compute_p(voc_s, voc_t, counts) + counts, s_counts = compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name, args.lowercase) + p = compute_p(voc_s, voc_t, counts, s_counts) save_p(p, voc_s, voc_t, args.p_filename_fwd) if args.p_filename_rev is not None: - counts = compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, rev_alignment_file.name, args.lowercase) - p = compute_p(voc_t, voc_s, counts) + counts, t_counts = compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, rev_alignment_file.name, args.lowercase) + p = compute_p(voc_t, voc_s, counts, t_counts) save_p(p, voc_t, voc_s, args.p_filename_rev) fwd_alignment_file.close() From ea5bd823b7fbe27249f3750587f3dde5e79cc226 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 16:01:59 +0200 Subject: [PATCH 10/29] various fixes --- mkmodel.py | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 01ebf67..70092d6 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -3,9 +3,8 @@ from eflomal import read_text, write_text, align from ibm1 import IBM1 -import sys, argparse, os +import sys, argparse, os, gzip, pickle from tempfile import NamedTemporaryFile -import pickle from scipy.sparse import lil_matrix from numpy 
import zeros @@ -25,20 +24,20 @@ def close(self): self.file.close() def write(self, line): - if isinstance(self.file, gzip.GzipFile): + if isinstance(self.file, gzip.GzipFile) and hasattr(line, "encode"): return self.file.write(line.encode(self.encoding)) else: return self.file.write(line) def read(self, size=-1): - line = f.read(size) + line = self.file.read(size) try: return line.decode() if type(line)==bytes else line except: return line - def readline(self, size): - line = f.readline(size) + def readline(self, size=-1): + line = self.file.readline(size) try: return line.decode(self.encoding) if type(line)==bytes else line except: @@ -52,7 +51,7 @@ def readlines(self, hint=-1): def xopen(fname, mode="r", encoding="utf8"): - if fname.endswith(".gz"): + if fname.endswith(".gz") or mode.endswith("b"): return XFile(gzip.open(fname, mode=mode), encoding) else: return XFile(open(fname, mode, encoding=encoding), encoding) @@ -62,19 +61,18 @@ def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l counts = lil_matrix((len(voc_s.items()), len(voc_t.items()))) s_counts = zeros(len(voc_s.items())) - with xopen(alignment_filename , "r") as f: + with xopen(alignment_filename , "r") as afile: i = 0 s = src_sents[i] t = trg_sents[i] - aline = f.read() + aline = afile.readline() while aline != "": a = [(int(x), int(y)) for x,y in [apair.split("-") for apair in aline.split()]] - for s_i, t_i in a: token_s = s[s_i] token_t = t[t_i] token_s_id = voc_s[token_s] - token_t_id = voc_s[token_t] + token_t_id = voc_t[token_t] counts[token_s_id, token_t_id] += 1 s_counts[token_s_id] += 1 @@ -82,20 +80,20 @@ def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l if i < len(src_sents): s = src_sents[i] t = trg_sents[i] - aline = f.read() + aline = afile.readline() return counts, s_counts def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): - counts = lil_matrix(len(voc_t.items()), len(voc_s.items())) + 
counts = lil_matrix((len(voc_t.items()), len(voc_s.items()))) t_counts = zeros(len(voc_t.items())) - with xopen(alignment_filename, "r") as f: + with xopen(alignment_filename, "r") as afile: i = 0 s = src_sents[i] t = trg_sents[i] - aline = f.read() + aline = afile.readline() while aline!="": a = [(int(x), int(y)) for x, y in [apair.split("-") for apair in aline.split()]] @@ -103,7 +101,7 @@ def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l token_s = s[s_i] token_t = t[t_i] token_s_id = voc_s[token_s] - token_t_id = voc_s[token_t] + token_t_id = voc_t[token_t] counts[token_t_id, token_s_id] += 1 t_counts[token_t_id] += 1 @@ -111,19 +109,17 @@ def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l if i 0: for t_id in range(len(voc_t.items())): p[s_id, t_id] = counts[s_id, t_id] / word_counts[s_id] - else: - p[s_id,:] = zeros(len(voc_t.items())) return p @@ -146,7 +142,7 @@ def make_voc(sentences): def save_p(p, voc_t, voc_s, fname): model = IBM1(p, voc_t, voc_s) - with xopen(fname, "w") as f: + with xopen(fname, "wb") as f: pickle.dump(model, f) def main(): @@ -245,6 +241,7 @@ def main(): src_index = None srcf = NamedTemporaryFile('wb') write_text(srcf, tuple(src_sents), src_voc_size) + src_sents = None if args.verbose: print('Reading target text from %s...' 
% args.target_filename, @@ -257,6 +254,7 @@ def main(): trg_index = None trgf = NamedTemporaryFile('wb') write_text(trgf, tuple(trg_sents), trg_voc_size) + trg_sents = None if n_src_sents != n_trg_sents: print('ERROR: number of sentences differ in input files (%d vs %d)' % ( @@ -292,8 +290,9 @@ def main(): trgf.close() # split and, if requested, lowercase tokens - src_sents = preprocess(src_sents, args.lowercase) - trg_sents = preprocess(trg_sents, args.lowercase) + with open(args.source_filename, 'r', encoding='utf-8') as fsrc, open(args.target_filename, 'r', encoding='utf-8') as ftgt : + src_sents = preprocess(fsrc.readlines(), args.lowercase) + trg_sents = preprocess(ftgt.readlines(), args.lowercase) # extract token --> index hash table voc_s = make_voc(src_sents) From dd53d4fa2b8f63b5c74a071db41699df1f1cbfdb Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 23:09:43 +0200 Subject: [PATCH 11/29] make things faster by only iterating on non-zero counts --- mkmodel.py | 101 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 7 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 70092d6..eef7ef2 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -3,12 +3,67 @@ from eflomal import read_text, write_text, align from ibm1 import IBM1 -import sys, argparse, os, gzip, pickle +import sys, argparse, os, gzip, pickle, logging from tempfile import NamedTemporaryFile +from progressbar import ProgressBar, Percentage, Bar from scipy.sparse import lil_matrix from numpy import zeros +def log_levels_mapping(verbose): + if verbose==0: return logging.WARNING + if verbose==1: return logging.INFO + if verbose>=2: return logging.DEBUG + + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) + + +def error(msg, code=1): + """Log an error message and exit with given code (default: 1).""" + logger.error(msg) + exit(code) + + +class bcolors: + HEADER = '\033[95m' + BLUE = '\033[94m' + GREEN = 
'\033[92m' + YELLOW = '\033[93m' + ORANGE = '\033[38;5;214m' + RED = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + +def header(text): + return bcolors.HEADER+text+bcolors.ENDC + +def blue(text): + return bcolors.BLUE+text+bcolors.ENDC + +def green(text): + return bcolors.GREEN+text+bcolors.ENDC + +def yellow(text): + return bcolors.YELLOW+text+bcolors.ENDC + +def orange(text): + return bcolors.ORANGE+text+bcolors.ENDC + +def red(text): + return bcolors.RED+text+bcolors.ENDC + +def bold(text): + return bcolors.BOLD+text+bcolors.ENDC + +def underline(text): + return bcolors.UNDERLINE+text+bcolors.ENDC + + class XFile(): def __init__(self, f, encoding="utf8"): self.encoding = encoding @@ -62,6 +117,7 @@ def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l s_counts = zeros(len(voc_s.items())) with xopen(alignment_filename , "r") as afile: + pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(src_sents)).start() i = 0 s = src_sents[i] t = trg_sents[i] @@ -77,10 +133,13 @@ def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l s_counts[token_s_id] += 1 i += 1 + pbar.update(i) if i < len(src_sents): s = src_sents[i] t = trg_sents[i] aline = afile.readline() + + pbar.finish() return counts, s_counts @@ -90,6 +149,7 @@ def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l t_counts = zeros(len(voc_t.items())) with xopen(alignment_filename, "r") as afile: + pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(src_sents)).start() i = 0 s = src_sents[i] t = trg_sents[i] @@ -106,38 +166,55 @@ def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename, l t_counts[token_t_id] += 1 i += 1 + pbar.update(i) if i 0: - for t_id in range(len(voc_t.items())): - p[s_id, t_id] = counts[s_id, t_id] / word_counts[s_id] + for s_id, t_id in nonzeros: + p[s_id, t_id] = counts[s_id, t_id] / word_counts[s_id] + i += 1 + pbar.update(i) + + 
pbar.finish() return p def preprocess(sentences, lowercase): + pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(sentences)).start() processed = [] for sent in sentences: if lowercase: sent = sent.lower() processed.append(sent.split()) + pbar.update(len(processed)) + pbar.finish() return processed def make_voc(sentences): + pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(sentences)).start() voc = {} index = 0 - for sent in sentences: + for i, sent in enumerate(sentences): for token in sent: if token not in voc: voc[token] = index index += 1 + pbar.update(i + 1) + pbar.finish() return voc def save_p(p, voc_t, voc_s, fname): @@ -150,7 +227,7 @@ def main(): description='mkmodel.py: compute IBM-1 translation probabilties using eflomal, the efficient low-memory aligner') parser.add_argument( '-v', '--verbose', dest='verbose', - action='store_true', help='Enable verbose output') + action="count", default=0, help='Enable verbose output') parser.add_argument( '--debug', dest='debug', action='store_true', help='Enable gdb debugging of eflomal binary') @@ -211,6 +288,8 @@ def main(): args = parser.parse_args() + logger.setLevel(log_levels_mapping(args.verbose)) + if args.p_filename_fwd is None and args.p_filename_rev is None: print('ERROR: no file to save probabilities (-f/-r), will do nothing.', file=sys.stderr, flush=True) @@ -290,22 +369,30 @@ def main(): trgf.close() # split and, if requested, lowercase tokens + logger.info("Preprocessing sentences for probability estimation...") with open(args.source_filename, 'r', encoding='utf-8') as fsrc, open(args.target_filename, 'r', encoding='utf-8') as ftgt : src_sents = preprocess(fsrc.readlines(), args.lowercase) trg_sents = preprocess(ftgt.readlines(), args.lowercase) # extract token --> index hash table + logger.info("Extracting vocabulary...") voc_s = make_voc(src_sents) voc_t = make_voc(trg_sents) if args.p_filename_fwd is not None: + logger.info("Estimating forward counts...") counts, s_counts = 
compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, fwd_alignment_file.name, args.lowercase) + logger.info("Estimating forward probabilities...") p = compute_p(voc_s, voc_t, counts, s_counts) + logger.info("Saving forward probabilities...") save_p(p, voc_s, voc_t, args.p_filename_fwd) if args.p_filename_rev is not None: + logger.info("Estimating reverse counts...") counts, t_counts = compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, rev_alignment_file.name, args.lowercase) + logger.info("Estimating reverse probabilities...") p = compute_p(voc_t, voc_s, counts, t_counts) + logger.info("Saving reverse probabilities...") save_p(p, voc_t, voc_s, args.p_filename_rev) fwd_alignment_file.close() From 0570d2258bf834b1f63972aefa3ff81066ad5482 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 23:18:38 +0200 Subject: [PATCH 12/29] don't force conversion of iterator zip to list --- mkmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkmodel.py b/mkmodel.py index eef7ef2..dc65f28 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -180,7 +180,7 @@ def compute_p(voc_s, voc_t, counts, word_counts): p = lil_matrix((len(voc_s.items()), len(voc_t.items()))) nonzero_X, nonzero_Y = counts.nonzero() - nonzeros = list(zip(nonzero_X, nonzero_Y)) + nonzeros = zip(nonzero_X, nonzero_Y) pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(nonzeros)).start() i = 0 From 379f10da93a0b37ecd0509a85532fc83750b5e2b Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 23:21:43 +0200 Subject: [PATCH 13/29] add human readable dump method --- ibm1.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ibm1.py b/ibm1.py index db36203..791a8b5 100644 --- a/ibm1.py +++ b/ibm1.py @@ -14,3 +14,8 @@ def get(self, word_s, word_t): if s_index < 0 or t_index < 0: return 0.0 return self.p[s_index, t_index] + + def dump(self, file): + X,Y = self.p.nonzero() + for s,t in zip(X,Y): + file.write("{} ||| {} ||| {}\n".format(self.voc_s[s], 
self.voc_t[t], self.p[s,t])) From 3fa02794253c34ee73d57af16bc40903d0d30b00 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 23:27:54 +0200 Subject: [PATCH 14/29] DO force conversion of iterator zip to list since we'll be using len() --- mkmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkmodel.py b/mkmodel.py index dc65f28..eef7ef2 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -180,7 +180,7 @@ def compute_p(voc_s, voc_t, counts, word_counts): p = lil_matrix((len(voc_s.items()), len(voc_t.items()))) nonzero_X, nonzero_Y = counts.nonzero() - nonzeros = zip(nonzero_X, nonzero_Y) + nonzeros = list(zip(nonzero_X, nonzero_Y)) pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(nonzeros)).start() i = 0 From 2aeedb1bc29861e154b2e904512771484612a2eb Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 23:48:34 +0200 Subject: [PATCH 15/29] fix human readable dump --- mkmodel.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index eef7ef2..6becc09 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -217,8 +217,7 @@ def make_voc(sentences): pbar.finish() return voc -def save_p(p, voc_t, voc_s, fname): - model = IBM1(p, voc_t, voc_s) +def save_p(model, fname): with xopen(fname, "wb") as f: pickle.dump(model, f) @@ -280,11 +279,19 @@ def main(): parser.add_argument( '-f', '--forward-probabilities', dest='p_filename_fwd', type=str, metavar='filename', - help='Filename to write forward direction probabilities to') + help='Filename to write forward direction probabilities to, as pickle dump') parser.add_argument( '-r', '--reverse-probabilities', dest='p_filename_rev', type=str, metavar='filename', - help='Filename to write reverse direction probabilities to') + help='Filename to write reverse direction probabilities to, as pickle dump') + parser.add_argument( + '-F', '--forward-probabilities-human', dest='p_filename_fwd_h', type=str, + 
metavar='filename', + help='Filename to write forward direction probabilities to, as human readable dump') + parser.add_argument( + '-R', '--reverse-probabilities-human', dest='p_filename_rev_h', type=str, + metavar='filename', + help='Filename to write reverse direction probabilities to, as human readable dump') args = parser.parse_args() @@ -385,7 +392,11 @@ def main(): logger.info("Estimating forward probabilities...") p = compute_p(voc_s, voc_t, counts, s_counts) logger.info("Saving forward probabilities...") - save_p(p, voc_s, voc_t, args.p_filename_fwd) + model = IBM1(p, voc_s, voc_t) + save_p(model, args.p_filename_fwd) + if args.p_filename_fwd_h is not None: + with xopen(args.p_filename_fwd_h, "w") as f: + model.dump(f) if args.p_filename_rev is not None: logger.info("Estimating reverse counts...") @@ -393,7 +404,11 @@ def main(): logger.info("Estimating reverse probabilities...") p = compute_p(voc_t, voc_s, counts, t_counts) logger.info("Saving reverse probabilities...") - save_p(p, voc_t, voc_s, args.p_filename_rev) + model = IBM1(p, voc_t, voc_s) + save_p(model, args.p_filename_rev) + if args.p_filename_rev_h is not None: + with xopen(args.p_filename_rev_h, "w") as f: + model.dump(f) fwd_alignment_file.close() rev_alignment_file.close() From b2e1125824784adf3a07bebfaac2c8743e999e9b Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 8 Jun 2018 23:49:02 +0200 Subject: [PATCH 16/29] fix human readable dump --- ibm1.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ibm1.py b/ibm1.py index 791a8b5..7104419 100644 --- a/ibm1.py +++ b/ibm1.py @@ -16,6 +16,12 @@ def get(self, word_s, word_t): return self.p[s_index, t_index] def dump(self, file): + voc_s_rev = {} + for w, i in self.voc_s.items(): + voc_s_rev[i] = w + voc_t_rev = {} + for w, i in self.voc_t.items(): + voc_t_rev[i] = w X,Y = self.p.nonzero() for s,t in zip(X,Y): - file.write("{} ||| {} ||| {}\n".format(self.voc_s[s], self.voc_t[t], self.p[s,t])) + 
file.write("{}\t{}\t{}\n".format(voc_s_rev[s], voc_t_rev[t], self.p[s,t])) From 1dee3a39ace9b61a1c6d4f996657e21fa2fe791b Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Tue, 12 Jun 2018 14:17:18 +0200 Subject: [PATCH 17/29] allow to read txt data from gz file --- mkmodel.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 6becc09..099520b 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -69,6 +69,16 @@ def __init__(self, f, encoding="utf8"): self.encoding = encoding self.file = f + def __iter__(self): + return self + + def __next__(self): + line = self.readline() + if line == "": + raise StopIteration + else: + return line + def __enter__(self): return self @@ -87,22 +97,26 @@ def write(self, line): def read(self, size=-1): line = self.file.read(size) try: - return line.decode() if type(line)==bytes else line + return line.decode(encoding = self.encoding) if type(line) == bytes else line except: return line def readline(self, size=-1): line = self.file.readline(size) try: - return line.decode(self.encoding) if type(line)==bytes else line + return line.decode(self.encoding) if type(line) == bytes else line except: return line def readlines(self, hint=-1): + lines = self.file.readlines(hint) if isinstance(self.file, gzip.GzipFile): - return [l.decode(self.encoding) for l in self.file.readlines(hint)] + try: + return [l.decode(self.encoding) for l in lines] + except: + return lines else: - return self.file.readlines(hint) + return lines def xopen(fname, mode="r", encoding="utf8"): @@ -319,7 +333,7 @@ def main(): if args.verbose: print('Reading source text from %s...'
% args.source_filename, file=sys.stderr, flush=True) - with open(args.source_filename, 'r', encoding='utf-8') as f: + with xopen(args.source_filename, 'r', encoding='utf-8') as f: src_sents, src_index = read_text( f, args.lowercase, args.source_prefix_len, args.source_suffix_len) n_src_sents = len(src_sents) @@ -332,7 +346,7 @@ def main(): if args.verbose: print('Reading target text from %s...' % args.target_filename, file=sys.stderr, flush=True) - with open(args.target_filename, 'r', encoding='utf-8') as f: + with xopen(args.target_filename, 'r', encoding='utf-8') as f: trg_sents, trg_index = read_text( f, args.lowercase, args.target_prefix_len, args.target_suffix_len) trg_voc_size = len(trg_index) @@ -377,7 +391,7 @@ def main(): # split and, if requested, lowercase tokens logger.info("Preprocessing sentences for probability estimation...") - with open(args.source_filename, 'r', encoding='utf-8') as fsrc, open(args.target_filename, 'r', encoding='utf-8') as ftgt : + with xopen(args.source_filename, 'r', encoding='utf-8') as fsrc, xopen(args.target_filename, 'r', encoding='utf-8') as ftgt : src_sents = preprocess(fsrc.readlines(), args.lowercase) trg_sents = preprocess(ftgt.readlines(), args.lowercase) From 38df9a9d444bc9a06bf3de337a40b7bba2677902 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Tue, 12 Jun 2018 14:56:39 +0200 Subject: [PATCH 18/29] add method for computing IBM-1 sentence translation probability --- ibm1.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ibm1.py b/ibm1.py index 7104419..6c0dc13 100644 --- a/ibm1.py +++ b/ibm1.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from math import pow class IBM1(): def __init__(self, p, voc_s, voc_t): @@ -25,3 +26,21 @@ def dump(self, file): X,Y = self.p.nonzero() for s,t in zip(X,Y): file.write("{}\t{}\t{}\n".format(voc_s_rev[s], voc_t_rev[t], self.p[s,t])) + + def estimate(self, S, T): + p = 1.0 + for t in T: + partial = 0.0 + for s in S: + partial += self.get(s, t) + p = p * 
partial + if len(S) == 0: + return 0.0 + return p / len(S) + + def estimate_normalized(self, S, T): + p = self.estimate(S, T) + if len(T) == 0: + return p + else: + return pow(p, 1/len(T)) From 9064129e5114be0aaa1c33ef448f3ea8abdf2c7c Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Tue, 12 Jun 2018 15:22:09 +0200 Subject: [PATCH 19/29] cosmetic --- ibm1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ibm1.py b/ibm1.py index 6c0dc13..0ef41a6 100644 --- a/ibm1.py +++ b/ibm1.py @@ -28,14 +28,14 @@ def dump(self, file): file.write("{}\t{}\t{}\n".format(voc_s_rev[s], voc_t_rev[t], self.p[s,t])) def estimate(self, S, T): + if len(S) == 0: + return 0.0 p = 1.0 for t in T: partial = 0.0 for s in S: partial += self.get(s, t) p = p * partial - if len(S) == 0: - return 0.0 return p / len(S) def estimate_normalized(self, S, T): From 7b0cd016fe0d968509c52630568f49cdc00e6f52 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Tue, 12 Jun 2018 15:22:43 +0200 Subject: [PATCH 20/29] fix xopen wrapper handling of byte mode --- mkmodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index 099520b..c126690 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -97,20 +97,20 @@ def write(self, line): def read(self, size=-1): line = self.file.read(size) try: - return line.decode(encoding = self.encoding) if type(line) == bytes else line + return line.decode(encoding = self.encoding) if type(line) == bytes and not self.mode.endswith("b") else line except: return line def readline(self, size=-1): line = self.file.readline(size) try: - return line.decode(self.encoding) if type(line) == bytes else line + return line.decode(self.encoding) if type(line) == bytes and not self.mode.endswith("b") else line except: return line def readlines(self, hint=-1): lines = self.file.readlines(hint) - if isinstance(self.file, gzip.GzipFile): + if isinstance(self.file, gzip.GzipFile) and not self.mode.endswith("b"): try: return 
[l.decode(self.encoding) for l in lines] except: From 67bec78324a1cb395ba35a81605e52ec3d523804 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Tue, 12 Jun 2018 17:30:11 +0200 Subject: [PATCH 21/29] add convenience functions and package __init__ --- __init__.py | 0 convenience.py | 127 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 __init__.py create mode 100644 convenience.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/convenience.py b/convenience.py new file mode 100644 index 0000000..3360973 --- /dev/null +++ b/convenience.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +import logging +import gzip + + +def log_levels_mapping(verbose): + if verbose==0: return logging.WARNING + if verbose==1: return logging.INFO + if verbose>=2: return logging.DEBUG + + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) + + +def error(msg, code=1): + """Log an error message and exit with given code (default: 1).""" + logger.error(msg) + exit(code) + + +class bcolors: + HEADER = '\033[95m' + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + ORANGE = '\033[38;5;214m' + RED = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + +def header(text): + return bcolors.HEADER+text+bcolors.ENDC + + +def blue(text): + return bcolors.BLUE+text+bcolors.ENDC + + +def green(text): + return bcolors.GREEN+text+bcolors.ENDC + + +def yellow(text): + return bcolors.YELLOW+text+bcolors.ENDC + + +def orange(text): + return bcolors.ORANGE+text+bcolors.ENDC + + +def red(text): + return bcolors.RED+text+bcolors.ENDC + + +def bold(text): + return bcolors.BOLD+text+bcolors.ENDC + + +def underline(text): + return bcolors.UNDERLINE+text+bcolors.ENDC + + +class XFile(): + def __init__(self, f, encoding="utf8"): + self.encoding = encoding + self.file = f + + def __iter__(self): + return self + + def __next__(self):
+ line = self.readline() + if line=="": + raise StopIteration + else: + return line + + def __enter__(self): + return self + + def __exit__(self, arg1, arg2, arg3): + return self.file.__exit__(arg1, arg2, arg3) + + def close(self): + self.file.close() + + def write(self, line): + if isinstance(self.file, gzip.GzipFile) and hasattr(line, "encode"): + return self.file.write(line.encode(self.encoding)) + else: + return self.file.write(line) + + def read(self, size=-1): + line = self.file.read(size) + try: + return line.decode(encoding=self.encoding) if type(line)==bytes and not self.mode.endswith("b") else line + except: + return line + + def readline(self, size=-1): + line = self.file.readline(size) + try: + return line.decode(self.encoding) if type(line)==bytes and not self.mode.endswith("b") else line + except: + return line + + def readlines(self, hint=-1): + lines = self.file.readlines(hint) + if isinstance(self.file, gzip.GzipFile) and not self.mode.endswith("b"): + try: + return [l.decode(self.encoding) for l in lines] + except: + return lines + else: + return lines + + +def xopen(fname, mode="r", encoding="utf8"): + if fname.endswith(".gz") or mode.endswith("b"): + return XFile(gzip.open(fname, mode=mode), encoding) + else: + return XFile(open(fname, mode, encoding=encoding), encoding) From 962625311972c971ad9daf19ef6db34fb7f8bbb6 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Tue, 12 Jun 2018 23:50:42 +0200 Subject: [PATCH 22/29] remove local implementation of convenience, import it as submodule --- .gitmodules | 3 ++ convenience | 1 + convenience.py | 127 ------------------------------------------------- 3 files changed, 4 insertions(+), 127 deletions(-) create mode 100644 .gitmodules create mode 160000 convenience delete mode 100644 convenience.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4ddf2cf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "convenience"] + path = convenience + url = 
git@gitlab.com:sylvainraybaud/convenience.git diff --git a/convenience b/convenience new file mode 160000 index 0000000..e24ec07 --- /dev/null +++ b/convenience @@ -0,0 +1 @@ +Subproject commit e24ec07c54d9188607b7005b6679db59edc77fd4 diff --git a/convenience.py b/convenience.py deleted file mode 100644 index 3360973..0000000 --- a/convenience.py +++ /dev/null @@ -1,127 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -import gzip - - -def log_levels_mapping(verbose): - if verbose==0: return logging.WARNING - if verbose==1: return logging.INFO - if verbose>=2: return logging.DEBUG - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) - - -def error(msg, code=1): - """Log an error message and exit with given code (default: 1).""" - logger.error(msg) - exit(code) - - -class bcolors: - HEADER = '\033[95m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - ORANGE = '\033[38;5;214m' - RED = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - - -def header(text): - return bcolors.HEADER+text+bcolors.ENDC - - -def blue(text): - return bcolors.BLUE+text+bcolors.ENDC - - -def green(text): - return bcolors.GREEN+text+bcolors.ENDC - - -def yellow(text): - return bcolors.YELLOW+text+bcolors.ENDC - - -def orange(text): - return bcolors.ORANGE+text+bcolors.ENDC - - -def red(text): - return bcolors.RED+text+bcolors.ENDC - - -def bold(text): - return bcolors.BOLD+text+bcolors.ENDC - - -def underline(text): - return bcolors.UNDERLINE+text+bcolors.ENDC - - -class XFile(): - def __init__(self, f, encoding="utf8"): - self.encoding = encoding - self.file = f - - def __iter__(self): - return self - - def __next__(self): - line = self.readline() - if line=="": - raise StopIteration - else: - return line - - def __enter__(self): - return self - - def __exit__(self, arg1, arg2, arg3): - return self.file.__exit__(arg1, arg2, arg3) - - def close(self): - self.file.close() - - def write(self, line): - if 
isinstance(self.file, gzip.GzipFile) and hasattr(line, "encode"): - return self.file.write(line.encode(self.encoding)) - else: - return self.file.write(line) - - def read(self, size=-1): - line = self.file.read(size) - try: - return line.decode(encoding=self.encoding) if type(line)==bytes and not self.mode.endswith("b") else line - except: - return line - - def readline(self, size=-1): - line = self.file.readline(size) - try: - return line.decode(self.encoding) if type(line)==bytes and not self.mode.endswith("b") else line - except: - return line - - def readlines(self, hint=-1): - lines = self.file.readlines(hint) - if isinstance(self.file, gzip.GzipFile) and not self.mode.endswith("b"): - try: - return [l.decode(self.encoding) for l in lines] - except: - return lines - else: - return lines - - -def xopen(fname, mode="r", encoding="utf8"): - if fname.endswith(".gz") or mode.endswith("b"): - return XFile(gzip.open(fname, mode=mode), encoding) - else: - return XFile(open(fname, mode, encoding=encoding), encoding) From e7daa102b2145d5fc424087a78586be72210e928 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 15 Jun 2018 14:11:02 +0200 Subject: [PATCH 23/29] put convenience functions into convenience submodule --- mkmodel.py | 117 +---------------------------------------------------- 1 file changed, 2 insertions(+), 115 deletions(-) diff --git a/mkmodel.py b/mkmodel.py index c126690..2c0e683 100755 --- a/mkmodel.py +++ b/mkmodel.py @@ -6,124 +6,11 @@ import sys, argparse, os, gzip, pickle, logging from tempfile import NamedTemporaryFile from progressbar import ProgressBar, Percentage, Bar +from convenience import xopen, Logger from scipy.sparse import lil_matrix from numpy import zeros -def log_levels_mapping(verbose): - if verbose==0: return logging.WARNING - if verbose==1: return logging.INFO - if verbose>=2: return logging.DEBUG - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) - - -def error(msg, 
code=1): - """Log an error message and exit with given code (default: 1).""" - logger.error(msg) - exit(code) - - -class bcolors: - HEADER = '\033[95m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - ORANGE = '\033[38;5;214m' - RED = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - - -def header(text): - return bcolors.HEADER+text+bcolors.ENDC - -def blue(text): - return bcolors.BLUE+text+bcolors.ENDC - -def green(text): - return bcolors.GREEN+text+bcolors.ENDC - -def yellow(text): - return bcolors.YELLOW+text+bcolors.ENDC - -def orange(text): - return bcolors.ORANGE+text+bcolors.ENDC - -def red(text): - return bcolors.RED+text+bcolors.ENDC - -def bold(text): - return bcolors.BOLD+text+bcolors.ENDC - -def underline(text): - return bcolors.UNDERLINE+text+bcolors.ENDC - - -class XFile(): - def __init__(self, f, encoding="utf8"): - self.encoding = encoding - self.file = f - - def __iter__(self): - return self - - def __next__(self): - line = self.readline() - if line == "": - raise StopIteration - else: - return line - - def __enter__(self): - return self - - def __exit__(self, arg1, arg2, arg3): - return self.file.__exit__(arg1, arg2, arg3) - - def close(self): - self.file.close() - - def write(self, line): - if isinstance(self.file, gzip.GzipFile) and hasattr(line, "encode"): - return self.file.write(line.encode(self.encoding)) - else: - return self.file.write(line) - - def read(self, size=-1): - line = self.file.read(size) - try: - return line.decode(encoding = self.encoding) if type(line) == bytes and not self.mode.endswith("b") else line - except: - return line - - def readline(self, size=-1): - line = self.file.readline(size) - try: - return line.decode(self.encoding) if type(line) == bytes and not self.mode.endswith("b") else line - except: - return line - - def readlines(self, hint=-1): - lines = self.file.readlines(hint) - if isinstance(self.file, gzip.GzipFile) and not self.mode.endswith("b"): - try: - return 
[l.decode(self.encoding) for l in lines] - except: - return lines - else: - return lines - - -def xopen(fname, mode="r", encoding="utf8"): - if fname.endswith(".gz") or mode.endswith("b"): - return XFile(gzip.open(fname, mode=mode), encoding) - else: - return XFile(open(fname, mode, encoding=encoding), encoding) def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename, lowercase): @@ -309,7 +196,7 @@ def main(): args = parser.parse_args() - logger.setLevel(log_levels_mapping(args.verbose)) + logger = Logger(args.verbose) if args.p_filename_fwd is None and args.p_filename_rev is None: print('ERROR: no file to save probabilities (-f/-r), will do nothing.', From e9b239b1ebdfc525993df96bbe2c139fb40d51a1 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 15 Jun 2018 14:11:13 +0200 Subject: [PATCH 24/29] update convenience submodule --- convenience | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convenience b/convenience index e24ec07..9330d31 160000 --- a/convenience +++ b/convenience @@ -1 +1 @@ -Subproject commit e24ec07c54d9188607b7005b6679db59edc77fd4 +Subproject commit 9330d3179dde5321482bb3c8bbffdc4508e1bcdc From 7a0884c9bf4c064583ba8272893444cc24deb126 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Fri, 15 Jun 2018 16:47:20 +0200 Subject: [PATCH 25/29] populate __init__.py --- __init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/__init__.py b/__init__.py index e69de29..4156998 100644 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1 @@ +from .ibm1 import IBM1 From 8c999211b8c7313982540d3a4244064b1dc3e427 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Sat, 16 Jun 2018 13:44:51 +0200 Subject: [PATCH 26/29] simple script for getting model stats --- ibm1stats.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100755 ibm1stats.py diff --git a/ibm1stats.py b/ibm1stats.py new file mode 100755 index 0000000..80616af --- /dev/null +++ b/ibm1stats.py @@ -0,0 
+1,47 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import logging +import json +import gzip +import pickle + +from convenience import xopen +from convenience import Logger +from convenience import header, blue, green, yellow, orange, red, bold, underline + +if __name__=="__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("input", help="Input model file. Pickle or text, gzipped or not. Automatically processed extensions are .pickle, .pickle.gz, .gz. Required.") + parser.add_argument("--delimiter", "-d", type=str, dest="delimiter", default="\t", + help="Delimiter used in model file, if in text format. Use plain string between quotes. Default=.") + parser.add_argument("-v", "--verbosity", action="count", default=0, help="increase verbosity") + args = parser.parse_args() + logger = Logger(args.verbosity) + + logger.info("Loading model") + + if args.input.endswith(".pickle.gz") or args.input.endswith(".pickle"): + logger.debug("Pickle detected") + with xopen(args.input, "rb") as f: + model = pickle.load(f) + print(blue("Source vocabulary size:\t"+bold(str(len(model.voc_s))))) + print(blue("Target vocabulary size:\t"+bold(str(len(model.voc_t))))) + print(green("Number of entries:\t"+bold(str(model.p.count_nonzero())))) + + else: + logger.debug("Text format detected") + with xopen(args.input, "r") as f: + n_entries = 0 + voc_s = set() + voc_t = set() + for line in f.readlines(): + entry = line.split(args.delimiter, maxsplit=2) + voc_s.add(entry[0]) + voc_t.add(entry[1]) + n_entries += 1 + + print(blue("Source vocabulary size:\t"+bold(str(len(voc_s))))) + print(blue("Target vocabulary size:\t"+bold(str(len(voc_t))))) + print(green("Number of entries:\t"+bold(str(n_entries)))) From f39b53c6a7491f322305ece86669d015519776d7 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Mon, 18 Jun 2018 21:30:40 +0200 Subject: [PATCH 27/29] Removed convience submodule, now installed via pip and 
listed as a requirement --- .gitmodules | 3 --- convenience | 1 - requirements.txt | 1 + 3 files changed, 1 insertion(+), 4 deletions(-) delete mode 160000 convenience create mode 100644 requirements.txt diff --git a/.gitmodules b/.gitmodules index 4ddf2cf..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "convenience"] - path = convenience - url = git@gitlab.com:sylvainraybaud/convenience.git diff --git a/convenience b/convenience deleted file mode 160000 index 9330d31..0000000 --- a/convenience +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9330d3179dde5321482bb3c8bbffdc4508e1bcdc diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..adcd978 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +convenience From ddfd90a8c39ff9e3e319cc0fbae0942c314d6233 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Mon, 18 Jun 2018 22:58:38 +0200 Subject: [PATCH 28/29] add docstrings --- ibm1.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/ibm1.py b/ibm1.py index 0ef41a6..d5b244f 100644 --- a/ibm1.py +++ b/ibm1.py @@ -1,14 +1,40 @@ # -*- coding: utf-8 -*- - +"""Implement IBM1 translation model with translation table and estimator.""" from math import pow class IBM1(): + """Implement IBM1 translation model with translation table and estimator. 
+ + Class members: + voc_s (dict{str -> int}): source vocabulary index + voc_t (dict{str -> int}): target vocabulary index + p (scipy.sparse.lil_matrix): source to target translation probabilities + + Class methods: + __init__(self, p, voc_s, voc_t): instantiate an IBM1 object from a matrix and a source and a target vocabulary hash tables + get: translation probability look up + estimate: compute phrase translation probability + estimate_normalized: compute phrase translation probability, normalized by target phrase length + dump: write out human readable serialization of the translation table + """ def __init__(self, p, voc_s, voc_t): + """Instantiate an IBM1 object. + + :param p (scipy.sparse.lil_matrix): translation table stored as sparse matrix + :param voc_s (dict{str -> int}): source vocabulary index + :param voc_t (dict{str -> int}): target vocabulary index + """ self.p = p self.voc_s = voc_s self.voc_t = voc_t def get(self, word_s, word_t): + """Look up translation probability. Parameters can be strings or indexes. + + :param word_s (str or int): source word + :param word_t (str or int): target word + :return: translation probability P(word_t|word_s) + """ s_index = word_s if type(word_s) == int else self.voc_s.get(word_s, -1) t_index = word_t if type(word_t) == int else self.voc_t.get(word_t, -1) @@ -17,6 +43,10 @@ def get(self, word_s, word_t): return self.p[s_index, t_index] def dump(self, file): + """Write out human readable serialization of the translation table as TSV file. + + :param file: File object opened for writing (works with convenience.XFiles) + """ voc_s_rev = {} for w, i in self.voc_s.items(): voc_s_rev[i] = w @@ -28,6 +58,12 @@ def dump(self, file): file.write("{}\t{}\t{}\n".format(voc_s_rev[s], voc_t_rev[t], self.p[s,t])) def estimate(self, S, T): + """Compute phrase translation probability according to IBM1 model. 
P(T|S) = \prod_t \sum_s P(t|s) + + :param S: list of source words (str or int) + :param T: list of target words (str or int) + :return (float): P(T|S) + """ if len(S) == 0: return 0.0 p = 1.0 @@ -39,6 +75,13 @@ def estimate(self, S, T): return p / len(S) def estimate_normalized(self, S, T): + """Compute phrase translation probability according to IBM1 model, normalized in order not to penalize longer sentences. + + Pnorm(T|S) = P(T|S)^(1/len(T)) + :param S: list of source words (str or int) + :param T: list of target words (str or int) + :return (float): P(T|S)^(1/len(T)) + """ p = self.estimate(S, T) if len(T) == 0: return p From ecc664cedd1561535def93a2ef5097d6ffd4e262 Mon Sep 17 00:00:00 2001 From: Sylvain Raybaud Date: Mon, 18 Jun 2018 22:58:56 +0200 Subject: [PATCH 29/29] print out simple stats about the phrase table --- ibm1stats.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ibm1stats.py b/ibm1stats.py index 80616af..3634a27 100755 --- a/ibm1stats.py +++ b/ibm1stats.py @@ -2,9 +2,6 @@ # -*- coding: utf-8 -*- import argparse -import logging -import json -import gzip import pickle from convenience import xopen