From b750590ce1618d140e2de59ab5ff55fb5f0c656f Mon Sep 17 00:00:00 2001 From: MblSH Date: Tue, 19 May 2020 02:36:39 +0800 Subject: [PATCH 1/5] Added C# freq01/02 --- build/vs19/freq01.csproj | 55 ++++++++++++++++++++ build/vs19/freq02.csproj | 55 ++++++++++++++++++++ src/freq01.cs | 83 ++++++++++++++++++++++++++++++ src/freq02.cs | 107 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 300 insertions(+) create mode 100644 build/vs19/freq01.csproj create mode 100644 build/vs19/freq02.csproj create mode 100644 src/freq01.cs create mode 100644 src/freq02.cs diff --git a/build/vs19/freq01.csproj b/build/vs19/freq01.csproj new file mode 100644 index 0000000..0a65790 --- /dev/null +++ b/build/vs19/freq01.csproj @@ -0,0 +1,55 @@ + + + + + Debug + AnyCPU + {4590C92A-CCD9-4BDB-9D9A-8F5E70EA6CA9} + Exe + freq01 + freq01 + v4.7.2 + 512 + true + true + $(SolutionDir)\..\..\bin\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\ + + + AnyCPU + true + full + false + ..\..\bin\ + TRACE;DEBUG + prompt + 4 + false + false + + + AnyCPU + pdbonly + true + ..\..\bin\ + + + prompt + 4 + false + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/build/vs19/freq02.csproj b/build/vs19/freq02.csproj new file mode 100644 index 0000000..dd2773e --- /dev/null +++ b/build/vs19/freq02.csproj @@ -0,0 +1,55 @@ + + + + + Debug + AnyCPU + {4590C92A-CCD9-4BDB-9D9A-8F5E70EA6CA9} + Exe + freq01 + freq01 + v4.7.2 + 512 + true + true + $(SolutionDir)\..\..\bin\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\ + + + AnyCPU + true + full + false + ..\..\bin\ + TRACE;DEBUG + prompt + 4 + false + false + + + AnyCPU + pdbonly + true + ..\..\bin\ + + + prompt + 4 + false + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/freq01.cs b/src/freq01.cs new file mode 100644 index 0000000..52ed7b2 --- /dev/null +++ b/src/freq01.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Text; + +namespace freq01 +{ + class freq01 + { + private static readonly Dictionary dict = new Dictionary(500000); + + private static void Count(string filename) + { + var text = File.ReadAllText(filename); + + var word = new StringBuilder(16); + var strWord = ""; + + foreach (var ch in text) + { + if (ch >= 'a' && ch <= 'z') + { + word.Append(ch); + continue; + } + + if (ch >= 'A' && ch <= 'Z') + { + word.Append((char)(ch + 32)); + continue; + } + + if (word.Length == 0) + continue; + + strWord = word.ToString(); + + if (dict.ContainsKey(strWord)) + dict[strWord]++; + else + dict[strWord] = 1; + + word = new StringBuilder(16); + } + + if (word.Length > 0) + { + strWord = word.ToString(); + + if (dict.ContainsKey(strWord)) + dict[strWord]++; + else + dict[strWord] = 1; + } + } + + private static void SortAndDump(string filename) + { + var sorted = dict.OrderByDescending(kvp => kvp.Value).ThenBy(kvp => kvp.Key); + File.WriteAllLines(filename, sorted.Select(kvp => $"{kvp.Value} {kvp.Key}")); + } + + static void Main(string[] args) + { + if (args.Length != 2) + { + Console.WriteLine("Usage: freq "); + return; + } + + var stopwatch = new Stopwatch(); + stopwatch.Start(); + + Count(args[0]); + SortAndDump(args[1]); + + stopwatch.Stop(); + Console.WriteLine($"Elapsed time: {stopwatch.ElapsedMilliseconds/1000}s"); + } + } +} diff --git a/src/freq02.cs b/src/freq02.cs new file mode 100644 index 0000000..46d44e6 --- /dev/null +++ b/src/freq02.cs @@ -0,0 +1,107 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Text; + +namespace freq02 +{ + class trie + { + public node Root = new node() { Word = "" }; + + public IEnumerable Enumerate(node root) + { + for (int i = 0; i < 26; i++) + { + var child = root.Children[i]; + if (child != null) + { + if (child.Count > 0) + yield return root.Children[i]; + + foreach (var child_node in Enumerate(child)) + { + yield return child_node; + } + } + } + } + + public class node + { + public node[] Children = new node[26]; + public int Count; + public string Word; + + public node WeNeedToGoDeeper(int index) + { + return Children[index] ?? (Children[index] = new node() {Word = Word + (char) (index + 97)}); + } + } + } + + class freq02 + { + private static readonly trie _trie = new trie(); + private static void Count(string filename) + { + var text = File.ReadAllText(filename); + + trie.node node = _trie.Root; + foreach (var ch in text) + { + if (ch >= 'a' && ch <= 'z') + { + var index = ch - 97; + node = node.WeNeedToGoDeeper(index); + continue; + } + + if (ch >= 'A' && ch <= 'Z') + { + var index = ch - 65; + node = node.WeNeedToGoDeeper(index); + continue; + } + + if (node == _trie.Root) + continue; + + node.Count++; + node = _trie.Root; + } + + if (node != _trie.Root) + { + node.Count++; + } + } + + private static void SortAndDump(string filename) + { + var enumerable = _trie.Enumerate(_trie.Root); + var sorted = enumerable.OrderByDescending(n => n.Count); //.ThenBy(n => n.Word); words are already sorted during trie traversal + File.WriteAllLines(filename, sorted.Select(n => $"{n.Count} {n.Word}")); + } + + static void Main(string[] args) + { + if (args.Length != 2) + { + Console.WriteLine("Usage: freq "); + return; + } + + var stopwatch = new Stopwatch(); + stopwatch.Start(); + + Count(args[0]); + SortAndDump(args[1]); + + stopwatch.Stop(); + Console.WriteLine($"Elapsed time: {(decimal) stopwatch.ElapsedMilliseconds / 1000}s"); + } + } +} From d09e33d35a4f572b6b4527d52ecd5396b386418d Mon Sep 17 00:00:00 2001 From: MblSH Date: Tue, 19 May 2020 03:14:07 +0800 Subject: [PATCH 2/5] Speeded up file read --- src/freq02.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/freq02.cs b/src/freq02.cs index 46d44e6..262af1b 100644 --- a/src/freq02.cs +++ b/src/freq02.cs @@ -47,7 +47,7 @@ class freq02 private static readonly trie _trie = new trie(); private static void Count(string filename) { - var text = File.ReadAllText(filename); + var text = File.ReadAllBytes(filename); trie.node node = _trie.Root; foreach (var ch in text) From f1e9537f723ed11bd51b12736e8418501617f1b1 Mon Sep 17 00:00:00 2001 From: MblSH Date: Wed, 20 May 2020 19:38:04 +0800 Subject: [PATCH 3/5] Final --- build/vs19/freq01.csproj | 37 +++------- build/vs19/freq02.csproj | 37 +++------- build/vs19/freq05.csproj | 38 ++++++++++ src/freq05.cs | 155 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 54 deletions(-) create mode 100644 build/vs19/freq05.csproj create mode 100644 src/freq05.cs diff --git a/build/vs19/freq01.csproj b/build/vs19/freq01.csproj index 0a65790..cfa34b7 100644 --- a/build/vs19/freq01.csproj +++ b/build/vs19/freq01.csproj @@ -1,34 +1,8 @@  - - Debug - AnyCPU - {4590C92A-CCD9-4BDB-9D9A-8F5E70EA6CA9} - Exe - freq01 - freq01 - v4.7.2 - 512 - true - true - $(SolutionDir)\..\..\bin\ - $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\ - - - AnyCPU - true - full - false - ..\..\bin\ - TRACE;DEBUG - prompt - 4 - false - false - - AnyCPU + x64 pdbonly true ..\..\bin\ @@ -36,7 +10,16 @@ prompt 4 + false false + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\bin\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\obj\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\tmp\ + + + {B61A5E2F-D44A-4ED0-B058-208149DCFC79} + v4.7.2 + diff --git a/build/vs19/freq02.csproj b/build/vs19/freq02.csproj index dd2773e..6bb347f 100644 --- a/build/vs19/freq02.csproj +++ b/build/vs19/freq02.csproj @@ -1,34 +1,8 @@  - - Debug - AnyCPU - {4590C92A-CCD9-4BDB-9D9A-8F5E70EA6CA9} - Exe - freq01 - freq01 - v4.7.2 - 512 - true - true - $(SolutionDir)\..\..\bin\ - $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\ - - - AnyCPU - true - full - false - ..\..\bin\ - TRACE;DEBUG - prompt - 4 - false - false - - AnyCPU + x64 pdbonly true ..\..\bin\ @@ -36,7 +10,16 @@ prompt 4 + false false + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\bin\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\obj\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\tmp\ + + + {B61A5E2F-D44A-4ED0-B058-208149DCFC79} + v4.7.2 + diff --git a/build/vs19/freq05.csproj b/build/vs19/freq05.csproj new file mode 100644 index 0000000..6f5a7ef --- /dev/null +++ b/build/vs19/freq05.csproj @@ -0,0 +1,38 @@ + + + + + x64 + pdbonly + true + ..\..\bin\ + + + prompt + 4 + false + false + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\bin\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\obj\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\tmp\ + + + {B61A5E2F-D44A-4ED0-B058-208149DCFC79} + v4.7.2 + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/freq05.cs b/src/freq05.cs new file mode 100644 index 0000000..4539d99 --- /dev/null +++ b/src/freq05.cs @@ -0,0 +1,155 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; + +namespace freq05 +{ + public class Accountant + { + private const int CAPACITY = 15000000; + + public class node + { + public int PersonalndexStart; + public int Count; + public int Depth; + public string Word; + } + + private node _root = new node(); + private readonly char[] _enumerationWordBuf = new char[256]; + private readonly char[] _charmap = { 'a', 'b', 'c', 'd', 'e', 'f', 'j', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; + + private node[] _sparseNodes = new node[CAPACITY]; + public int _nextFreeChunkIndex = 26; + + public IEnumerable Enumerate() + { + return Enumerate(_root); + } + + public IEnumerable Enumerate(node root) + { + var realIndex = root.PersonalndexStart; + for (int i = 0; i < 26; i++) + { + var child = _sparseNodes[realIndex]; + _enumerationWordBuf[root.Depth] = _charmap[i]; + + if (child != null) + { + if (child.Count > 0) + { + child.Word = new string(_enumerationWordBuf, 0, root.Depth + 1); + yield return child; + } + + foreach (var child_node in Enumerate(child)) + { + yield return child_node; + } + } + + realIndex++; + } + } + + public void Count(byte[] text) + { + var indexLookup = new[] + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + var node = _root; + int index; + + for (int i = 0; i < text.Length; i++) + { + index = indexLookup[text[i]]; + + if (index != -1) + { + var newnode = _sparseNodes[node.PersonalndexStart + index]; + if (newnode == null) + { + newnode = new node(); + _sparseNodes[node.PersonalndexStart + index] = newnode; + newnode.PersonalndexStart = _nextFreeChunkIndex; + newnode.Depth = node.Depth + 1; + _nextFreeChunkIndex += 26; + } + + node = newnode; + continue; + } + + if (node == _root) + continue; + + node.Count++; + node = _root; + } + + if (node != _root) + { + node.Count++; + } + } + } + + class freq05 + { + private static void SortAndDump(Accountant accountant, string filename) + { + var enumerable = accountant.Enumerate(); + var sorted = enumerable.OrderByDescending(n => n.Count); // words are already sorted by traversal + File.WriteAllLines(filename, sorted.Select(n => $"{n.Count} {n.Word}")); + } + + static void Main(string[] args) + { + if (args.Length != 2) + { + Console.WriteLine("Usage: freq "); + return; + } + + var stopwatch = new Stopwatch(); + stopwatch.Start(); + + var text = File.ReadAllBytes(args[0]); + var acc = new Accountant(); + var readMs = stopwatch.ElapsedMilliseconds; + + GC.TryStartNoGCRegion(244 * 1000 * 1000, true); + + acc.Count(text); + var countMs = stopwatch.ElapsedMilliseconds; + + SortAndDump(acc, args[1]); + + stopwatch.Stop(); + + Console.WriteLine($"Read/prealloc time: {(decimal)readMs / 1000}s"); + Console.WriteLine($"Count time: {(decimal)(countMs - readMs) / 1000}s"); + Console.WriteLine($"Sort and dump time: {(decimal)(stopwatch.ElapsedMilliseconds - countMs - readMs) / 1000}s"); + Console.WriteLine($"Total time: {(decimal)stopwatch.ElapsedMilliseconds / 1000}s"); + + + Console.WriteLine($"\n{acc._nextFreeChunkIndex} used nodes"); + //GC.EndNoGCRegion(); + } + } +} From cbf36b6794724934ab7f9f371164ca429a19429c Mon Sep 17 00:00:00 2001 From: MblSH Date: Wed, 20 May 2020 20:26:50 +0800 Subject: [PATCH 4/5] Changed freq01.cs to 'canonical' implementation, removed debugging printouts from freq05.cs --- src/freq01.cs | 86 +++++++++++++++++++-------------------------------- src/freq05.cs | 18 ----------- 2 files changed, 31 insertions(+), 73 deletions(-) diff --git a/src/freq01.cs b/src/freq01.cs index 52ed7b2..b102082 100644 --- a/src/freq01.cs +++ b/src/freq01.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.IO; using System.Linq; using System.Text; @@ -9,57 +8,16 @@ namespace freq01 { class freq01 { - private static readonly Dictionary dict = new Dictionary(500000); + private static readonly Dictionary dict = new Dictionary(); - private static void Count(string filename) + private static void AddWord(StringBuilder word) { - var text = File.ReadAllText(filename); + var strWord = word.ToString().ToLowerInvariant(); - var word = new StringBuilder(16); - var strWord = ""; - - foreach (var ch in text) - { - if (ch >= 'a' && ch <= 'z') - { - word.Append(ch); - continue; - } - - if (ch >= 'A' && ch <= 'Z') - { - word.Append((char)(ch + 32)); - continue; - } - - if (word.Length == 0) - continue; - - strWord = word.ToString(); - - if (dict.ContainsKey(strWord)) - dict[strWord]++; - else - dict[strWord] = 1; - - word = new StringBuilder(16); - } - - if (word.Length > 0) - { - strWord = word.ToString(); - - if (dict.ContainsKey(strWord)) - dict[strWord]++; - else - dict[strWord] = 1; - } - } - - private static void SortAndDump(string filename) - { - var sorted = dict.OrderByDescending(kvp => kvp.Value).ThenBy(kvp => kvp.Key); - File.WriteAllLines(filename, sorted.Select(kvp => $"{kvp.Value} {kvp.Key}")); + if (dict.ContainsKey(strWord)) + dict[strWord]++; + else + dict[strWord] = 1; } static void Main(string[] args) @@ -70,14 +28,32 @@ static void Main(string[] args) return; } - var stopwatch = new Stopwatch(); - stopwatch.Start(); + using (StreamReader sr = new StreamReader(args[0])) + { + var word = new StringBuilder(); + while (sr.Peek() >= 0) + { + var ch = (char)sr.Read(); + if (Char.IsLetter(ch)) + { + word.Append(ch); + continue; + } + + if (word.Length == 0) + continue; + + AddWord(word); + word = new StringBuilder(); + } - Count(args[0]); - SortAndDump(args[1]); + if (word.Length > 0) + AddWord(word); + } - stopwatch.Stop(); - Console.WriteLine($"Elapsed time: {stopwatch.ElapsedMilliseconds/1000}s"); + File.WriteAllLines(args[1], dict.OrderByDescending(kvp => kvp.Value) + .ThenBy(kvp => kvp.Key) + .Select(kvp => $"{kvp.Value} {kvp.Key}")); } } } diff --git a/src/freq05.cs b/src/freq05.cs index 4539d99..bfd0465 100644 --- a/src/freq05.cs +++ b/src/freq05.cs @@ -126,30 +126,12 @@ static void Main(string[] args) return; } - var stopwatch = new Stopwatch(); - stopwatch.Start(); - var text = File.ReadAllBytes(args[0]); var acc = new Accountant(); - var readMs = stopwatch.ElapsedMilliseconds; GC.TryStartNoGCRegion(244 * 1000 * 1000, true); - acc.Count(text); - var countMs = stopwatch.ElapsedMilliseconds; - SortAndDump(acc, args[1]); - - stopwatch.Stop(); - - Console.WriteLine($"Read/prealloc time: {(decimal)readMs / 1000}s"); - Console.WriteLine($"Count time: {(decimal)(countMs - readMs) / 1000}s"); - Console.WriteLine($"Sort and dump time: {(decimal)(stopwatch.ElapsedMilliseconds - countMs - readMs) / 1000}s"); - Console.WriteLine($"Total time: {(decimal)stopwatch.ElapsedMilliseconds / 1000}s"); - - - Console.WriteLine($"\n{acc._nextFreeChunkIndex} used nodes"); - //GC.EndNoGCRegion(); } } } From 02ee0e411c9a745dbad5bc4c0da18ed4fe2d54a7 Mon Sep 17 00:00:00 2001 From: MblSH Date: Thu, 21 May 2020 01:32:50 +0800 Subject: [PATCH 5/5] Multi-threaded freq06 + typo fix --- build/vs19/freq06.csproj | 38 ++++++++ src/freq05.cs | 2 +- src/freq06.cs | 186 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 build/vs19/freq06.csproj create mode 100644 src/freq06.cs diff --git a/build/vs19/freq06.csproj b/build/vs19/freq06.csproj new file mode 100644 index 0000000..e3a1a35 --- /dev/null +++ b/build/vs19/freq06.csproj @@ -0,0 +1,38 @@ + + + + + x64 + pdbonly + true + ..\..\bin\ + + + prompt + 4 + false + false + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\bin\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\obj\ + $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\tmp\ + + + {B61A5E2F-D44A-4ED0-B058-208149DCFC79} + v4.7.2 + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/freq05.cs b/src/freq05.cs index bfd0465..2c3d790 100644 --- a/src/freq05.cs +++ b/src/freq05.cs @@ -20,7 +20,7 @@ public class node private node _root = new node(); private readonly char[] _enumerationWordBuf = new char[256]; - private readonly char[] _charmap = { 'a', 'b', 'c', 'd', 'e', 'f', 'j', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; + private readonly char[] _charmap = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; private node[] _sparseNodes = new node[CAPACITY]; public int _nextFreeChunkIndex = 26; diff --git a/src/freq06.cs b/src/freq06.cs new file mode 100644 index 0000000..00d7f83 --- /dev/null +++ b/src/freq06.cs @@ -0,0 +1,186 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Security.Policy; +using System.Threading; + +namespace freq06 +{ + public class Accountant + { + private const int CAPACITY = 15000000; + + public class node + { + public int PersonalndexStart; + public int Count; + public int Depth; + public string Word; + } + + public class ThreadStartInfo + { + public int Start; + public int End; + public byte[] Data; + } + + private node _root = new node(); + private readonly char[] _enumerationWordBuf = new char[256]; + private readonly char[] _charmap = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; + private readonly int[] _charIndexLookup = new[] + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + private volatile node[] _sparseNodes = new node[CAPACITY]; + public volatile int _nextFreeChunkIndex = 26; + + public IEnumerable Enumerate() + { + return Enumerate(_root); + } + + public IEnumerable Enumerate(node root) + { + var realIndex = root.PersonalndexStart; + for (int i = 0; i < 26; i++) + { + var child = _sparseNodes[realIndex]; + _enumerationWordBuf[root.Depth] = _charmap[i]; + + if (child != null) + { + if (child.Count > 0) + { + child.Word = new string(_enumerationWordBuf, 0, root.Depth + 1); + yield return child; + } + + foreach (var child_node in Enumerate(child)) + { + yield return child_node; + } + } + + realIndex++; + } + } + + public void Count(byte[] text, int chunks) + { + var len = text.Length; + var start = 0; + var threads = new List(); + + for (int i = 1; i <= chunks; i++) + { + var end = (len / chunks) * i + 5; + if (end > len) + end = len; + + while (end < len && _charIndexLookup[text[end]] != -1) + { + end++; + } + + var thread = new Thread(Count); + thread.Start(new ThreadStartInfo { Data = text, Start = start, End = end }); + threads.Add(thread); + + start = end + 1; + } + + foreach (var thread in threads) + thread.Join(); + } + + private void Count(object state) + { + var tsi = state as ThreadStartInfo; + + var node = _root; + int index; + + for (int i = tsi.Start; i < tsi.End; i++) + { + index = _charIndexLookup[tsi.Data[i]]; + + if (index != -1) + { + var newnode = _sparseNodes[node.PersonalndexStart + index]; + if (newnode == null) + { + lock (_sparseNodes) + { + newnode = _sparseNodes[node.PersonalndexStart + index]; + if (newnode == null) + { + newnode = new node(); + newnode.PersonalndexStart = _nextFreeChunkIndex; + newnode.Depth = node.Depth + 1; + _nextFreeChunkIndex += 26; + _sparseNodes[node.PersonalndexStart + index] = newnode; + } + } + } + + node = newnode; + continue; + } + + if (node == _root) + continue; + + Interlocked.Increment(ref node.Count); + node = _root; + } + + if (node != _root) + { + Interlocked.Increment(ref node.Count); + } + } + } + + class freq06 + { + private static void SortAndDump(Accountant accountant, string filename) + { + var enumerable = accountant.Enumerate(); + var sorted = enumerable.OrderByDescending(n => n.Count); // words are already sorted by traversal + File.WriteAllLines(filename, sorted.Select(n => $"{n.Count} {n.Word}")); + } + + static void Main(string[] args) + { + if (args.Length != 2 && args.Length != 3) + { + Console.WriteLine("Usage: freq [chunks (default=4)]"); + return; + } + + int chunks = 4; + if (args.Length == 3) + chunks = Int32.Parse(args[2]); + + var text = File.ReadAllBytes(args[0]); + var acc = new Accountant(); + + GC.TryStartNoGCRegion(244 * 1000 * 1000, true); + acc.Count(text, chunks); + SortAndDump(acc, args[1]); + } + } +}