From 2409161d4954bb399f96d95da4e852ef447568d9 Mon Sep 17 00:00:00 2001
From: q
Date: Tue, 19 May 2020 20:11:01 +0300
Subject: [PATCH 1/3] Python with buffer + memoryview

---
Reviewer note: src/freq02.py was edited during review (chunk-boundary check
in bwords(), docstrings), so the blob ids on the "index" lines of this
series no longer correspond to the file contents.

 bench.py      |  1 +
 src/freq02.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 src/freq02.py

diff --git a/bench.py b/bench.py
index 0841030..cad0cac 100755
--- a/bench.py
+++ b/bench.py
@@ -18,6 +18,7 @@ def run1(args, src_name, num_runs):
 runs = [
     [['java', '-jar', './bin/freq01scala.jar'], 'freq01.scala', 3],
     [['python', './src/freq01.py'], 'freq01.py', 3],
+    [['python', './src/freq02.py'], 'freq02.py', 3],
     [['./bin/freq03cpp' + EXE], 'freq03.cpp'],
     [['./bin/freq02cpp' + EXE], 'freq02.cpp'],
     [['./bin/freq01cpp' + EXE], 'freq01.cpp'],
diff --git a/src/freq02.py b/src/freq02.py
new file mode 100644
index 0000000..d06a60a
--- /dev/null
+++ b/src/freq02.py
@@ -0,0 +1,81 @@
+import string
+import sys
+from collections import Counter, defaultdict
+from itertools import islice
+
+
+def make_translation_table():
+    """Return a 256-byte table for ``bytes.translate``.
+
+    ASCII letters map to their lowercase form; every other byte maps to a
+    space, so a translated buffer holds only lowercase words and spaces.
+    """
+    all_bytes = bytes(range(0, 256))
+    letters = string.ascii_letters.encode('ascii')
+    space = b' '[0]
+    outtab = bytearray(b if b in letters else space for b in all_bytes).lower()
+    return bytearray.maketrans(all_bytes, outtab)
+
+
+def bwords(stream, chunk_size, max_bword_len):
+    """Yield the lowercased ASCII words of a binary stream as ``bytes``.
+
+    The stream is read ``chunk_size`` bytes at a time into the tail of one
+    reusable buffer.  A word cut off by the end of a chunk is parked at the
+    end of the ``max_bword_len``-byte head of that buffer, so it joins up
+    with its continuation when the next chunk is read.  Words are truncated
+    to ``max_bword_len`` bytes.
+    """
+    table = make_translation_table()
+    space_byte = b' '
+    space = space_byte[0]
+    buffer = bytearray(space for _ in range(max_bword_len + chunk_size))
+    view = memoryview(buffer)
+    head_view = view[:max_bword_len]
+    tail_view = view[max_bword_len:]
+    space_head = b''.rjust(max_bword_len)
+
+    while True:
+        n = stream.readinto(tail_view)
+        if n == 0:
+            break
+        if n < chunk_size:
+            # Short read: blank out what is left over from the previous chunk.
+            tail_view[n:] = b''.rjust(len(tail_view) - n)
+
+        words_and_spaces_only = buffer.translate(table)
+        words = words_and_spaces_only.split()
+
+        all_words = (bytes(w[:max_bword_len]) for w in words)
+
+        if len(words) > 1:
+            yield from islice(all_words, len(words) - 1)
+
+        head_view[:] = space_head
+        # Carry the last word over only when the chunk really ends inside a
+        # word; otherwise it would be glued to the start of the next chunk.
+        if words_and_spaces_only[-1] != space:
+            for w in all_words:
+                head_view[-len(w) :] = w
+        else:
+            yield from all_words
+
+    yield from bytes(head_view).split()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    with open(input_file, 'rb') as f:
+        counter = Counter(bwords(f, chunk_size=100, max_bword_len=256))
+    with open(output_file, 'w') as f:
+        results = defaultdict(list)
+        for word, count in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])):
+            f.write(str(count))
+            f.write(' ')
+            f.write(word.decode())
+            f.write('\n')

From bd693250fa9486626d8edd188816215b8f22bf0c Mon Sep 17 00:00:00 2001
From: bofm
Date: Tue, 19 May 2020 23:06:06 +0300
Subject: [PATCH 2/3] increase buffer size in freq02.py

---
 src/freq02.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/freq02.py b/src/freq02.py
index d06a60a..a027bab 100644
--- a/src/freq02.py
+++ b/src/freq02.py
@@ -71,7 +71,7 @@ def bwords(stream, chunk_size, max_bword_len):
     output_file = sys.argv[2]
 
     with open(input_file, 'rb') as f:
-        counter = Counter(bwords(f, chunk_size=100, max_bword_len=256))
+        counter = Counter(bwords(f, chunk_size=1024*1024, max_bword_len=256))
     with open(output_file, 'w') as f:
         results = defaultdict(list)
         for word, count in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])):

From f87a1c665236214b64fc6b13b7d8e319547f9d96 Mon Sep 17 00:00:00 2001
From: bofm
Date: Tue, 19 May 2020 23:20:12 +0300
Subject: [PATCH 3/3] rm unused code

---
 src/freq02.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/freq02.py b/src/freq02.py
index a027bab..de234ca 100644
--- a/src/freq02.py
+++ b/src/freq02.py
@@ -1,6 +1,6 @@
 import string
 import sys
-from collections import Counter, defaultdict
+from collections import Counter
 from itertools import islice
 
 
@@ -73,7 +73,6 @@ def bwords(stream, chunk_size, max_bword_len):
     with open(input_file, 'rb') as f:
         counter = Counter(bwords(f, chunk_size=1024*1024, max_bword_len=256))
     with open(output_file, 'w') as f:
-        results = defaultdict(list)
         for word, count in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])):
             f.write(str(count))
             f.write(' ')