From 2409161d4954bb399f96d95da4e852ef447568d9 Mon Sep 17 00:00:00 2001
From: q <q>
Date: Tue, 19 May 2020 20:11:01 +0300
Subject: [PATCH 1/3] Python with buffer + memoryview

---
 bench.py      |  1 +
 src/freq02.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 src/freq02.py
diff --git a/bench.py b/bench.py
index 0841030..cad0cac 100755
--- a/bench.py
+++ b/bench.py
@@ -18,6 +18,7 @@ def run1(args, src_name, num_runs):
 runs = [
 	[['java', '-jar', './bin/freq01scala.jar'], 'freq01.scala', 3],
 	[['python', './src/freq01.py'], 'freq01.py', 3],
+	[['python', './src/freq02.py'], 'freq02.py', 3],
 	[['./bin/freq03cpp' + EXE], 'freq03.cpp'],
 	[['./bin/freq02cpp' + EXE], 'freq02.cpp'],
 	[['./bin/freq01cpp' + EXE], 'freq01.cpp'],
diff --git a/src/freq02.py b/src/freq02.py
new file mode 100644
index 0000000..d06a60a
--- /dev/null
+++ b/src/freq02.py
@@ -0,0 +1,67 @@
+import string
+import sys
+from collections import Counter, defaultdict
+from itertools import islice
+
+
+def make_translation_table():  # result is meant for bytes/bytearray.translate()
+    """Return a 256-byte table: ASCII letters -> lowercase, any other byte -> space."""
+    letters = string.ascii_letters.encode('ascii')
+    space = b' '[0]
+    # Byte values 0..255 in order: the identity "from" side for maketrans.
+    all_bytes = bytes(range(256))
+    lowered = bytearray(b if b in letters else space for b in all_bytes).lower()
+    return bytearray.maketrans(all_bytes, lowered)
+
+
+def bwords(stream, chunk_size, max_bword_len):
+    """Yield lowercase ASCII words as bytes (at most max_bword_len each) from stream."""
+    table = make_translation_table()
+    space = b' '[0]
+    # Layout: [carry area: max_bword_len bytes][read area: chunk_size bytes].
+    buffer = bytearray(b' ' * (max_bword_len + chunk_size))
+    view = memoryview(buffer)
+    head_view = view[:max_bword_len]
+    tail_view = view[max_bword_len:]
+    space_head = b' ' * max_bword_len
+    while True:
+        n = stream.readinto(tail_view)
+        if n == 0:
+            break
+        if n < chunk_size:
+            # Short read: blank the stale bytes left over from the previous chunk.
+            tail_view[n:] = b' ' * (chunk_size - n)
+
+        words_and_spaces_only = buffer.translate(table)
+        words = words_and_spaces_only.split()
+        all_words = (bytes(w[:max_bword_len]) for w in words)
+        if len(words) > 1:
+            yield from islice(all_words, len(words) - 1)
+
+        head_view[:] = space_head
+        # Carry the last word only when the bytes just read end mid-word.
+        if words_and_spaces_only[max_bword_len + n - 1] != space:
+            for w in all_words:
+                head_view[-len(w) :] = w
+        else:
+            yield from all_words
+
+    yield from bytes(head_view).split()
+
+
+if __name__ == '__main__':  # usage: python freq02.py INPUT_FILE OUTPUT_FILE
+    if len(sys.argv) != 3:
+        sys.exit(1)  # sys.exit, not the site-injected exit(), which may be absent
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    with open(input_file, 'rb') as f:
+        counter = Counter(bwords(f, chunk_size=100, max_bword_len=256))
+    with open(output_file, 'w') as f:
+        results = defaultdict(list)
+        for word, count in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])):
+            f.write(str(count))
+            f.write(' ')
+            f.write(word.decode())
+            f.write('\n')

From bd693250fa9486626d8edd188816215b8f22bf0c Mon Sep 17 00:00:00 2001
From: bofm <bofm.gh@gmail.com>
Date: Tue, 19 May 2020 23:06:06 +0300
Subject: [PATCH 2/3] increase buffer size in freq02.py

---
 src/freq02.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/freq02.py b/src/freq02.py
index d06a60a..a027bab 100644
--- a/src/freq02.py
+++ b/src/freq02.py
@@ -57,7 +57,7 @@ def bwords(stream, chunk_size, max_bword_len):
     output_file = sys.argv[2]
 
     with open(input_file, 'rb') as f:
-        counter = Counter(bwords(f, chunk_size=100, max_bword_len=256))
+        counter = Counter(bwords(f, chunk_size=1024*1024, max_bword_len=256))
     with open(output_file, 'w') as f:
         results = defaultdict(list)
         for word, count in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])):

From f87a1c665236214b64fc6b13b7d8e319547f9d96 Mon Sep 17 00:00:00 2001
From: bofm <bofm.gh@gmail.com>
Date: Tue, 19 May 2020 23:20:12 +0300
Subject: [PATCH 3/3] rm unused code

---
 src/freq02.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/freq02.py b/src/freq02.py
index a027bab..de234ca 100644
--- a/src/freq02.py
+++ b/src/freq02.py
@@ -1,6 +1,6 @@
 import string
 import sys
-from collections import Counter, defaultdict
+from collections import Counter
 from itertools import islice
 
 
@@ -59,7 +59,6 @@ def bwords(stream, chunk_size, max_bword_len):
     with open(input_file, 'rb') as f:
         counter = Counter(bwords(f, chunk_size=1024*1024, max_bword_len=256))
     with open(output_file, 'w') as f:
-        results = defaultdict(list)
         for word, count in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])):
             f.write(str(count))
             f.write(' ')