// -----------------------------------------------------------------------------
// Word-frequency counters freq01, freq02, freq05 and freq06.
//
// This text was recovered from a flattened patch that added:
//   build/vs19/freq01.csproj, freq02.csproj, freq05.csproj, freq06.csproj
//   src/freq01.cs, src/freq02.cs, src/freq05.cs, src/freq06.cs
// Each namespace below corresponds to one of the src/*.cs files. Every program
// has its own Main, so build them as separate projects (or select the entry
// point explicitly) rather than as one executable.
//
// Project settings that survived extraction (the XML element names did not, so
// they are listed as raw values; all four project files carry the same ones):
//   x64 | pdbonly | true | ..\..\bin\ | prompt | 4 | false | false
//   $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\bin\
//   $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\obj\
//   $(SolutionDir)\junk\vs19\$(Configuration)\$(ProjectName)\tmp\
//   {B61A5E2F-D44A-4ED0-B058-208149DCFC79} | v4.7.2
// NOTE(review): the same GUID appears in all four project files; if it is the
// project GUID, each project should get its own - confirm against the originals.
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Security.Policy;
using System.Text;
using System.Threading;

// ============================== src/freq01.cs ================================
namespace freq01
{
    /// <summary>
    /// Dictionary-based word counter: reads a text file character by character,
    /// treats every maximal run of letters as a word (lower-cased with the
    /// invariant culture) and writes "count word" lines to the output file.
    /// </summary>
    class freq01
    {
        private static readonly Dictionary<string, int> dict = new Dictionary<string, int>();

        /// <summary>Adds one occurrence of the word held in <paramref name="word"/>.</summary>
        /// <param name="word">Buffer holding the letters of a single word.</param>
        private static void AddWord(StringBuilder word)
        {
            var strWord = word.ToString().ToLowerInvariant();

            // TryGetValue leaves 'seen' at 0 for a new word, so one read and one
            // write replace the ContainsKey + indexer-get + indexer-set sequence.
            int seen;
            dict.TryGetValue(strWord, out seen);
            dict[strWord] = seen + 1;
        }

        /// <summary>Entry point. args[0] is the input file, args[1] the output file.</summary>
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("Usage: freq <input file> <output file>");
                return;
            }

            using (var sr = new StreamReader(args[0]))
            {
                var word = new StringBuilder();
                int next;

                // Read() returns -1 at end of stream.
                while ((next = sr.Read()) >= 0)
                {
                    var ch = (char)next;
                    if (Char.IsLetter(ch))
                    {
                        word.Append(ch);
                        continue;
                    }

                    if (word.Length == 0)
                    {
                        continue;
                    }

                    AddWord(word);
                    word.Clear(); // reuse the buffer instead of allocating one per word
                }

                // The file may end in the middle of a word.
                if (word.Length > 0)
                {
                    AddWord(word);
                }
            }

            // Highest count first; ties are broken by ordinal word order so the
            // output does not depend on the culture of the machine running it.
            File.WriteAllLines(args[1], dict.OrderByDescending(kvp => kvp.Value)
                                            .ThenBy(kvp => kvp.Key, StringComparer.Ordinal)
                                            .Select(kvp => $"{kvp.Value} {kvp.Key}"));
        }
    }
}

// ============================== src/freq02.cs ================================
namespace freq02
{
    /// <summary>
    /// 26-way trie over the letters a-z. Each node stores the word spelled by
    /// the path from the root and how many times that word ended there.
    /// </summary>
    class trie
    {
        public node Root = new node() { Word = "" };

        /// <summary>
        /// Lazily yields every node below <paramref name="root"/> whose Count is
        /// positive. Children are visited in index order 0..25 and a node is
        /// yielded before its descendants.
        /// </summary>
        public IEnumerable<node> Enumerate(node root)
        {
            for (int i = 0; i < 26; i++)
            {
                var child = root.Children[i];
                if (child == null)
                {
                    continue;
                }

                if (child.Count > 0)
                {
                    yield return child;
                }

                foreach (var descendant in Enumerate(child))
                {
                    yield return descendant;
                }
            }
        }

        public class node
        {
            public node[] Children = new node[26];
            public int Count;
            public string Word;

            /// <summary>
            /// Returns the child at <paramref name="index"/> (0 = 'a' .. 25 = 'z'),
            /// creating it on first use with this node's Word plus that letter.
            /// </summary>
            public node WeNeedToGoDeeper(int index)
            {
                return Children[index] ?? (Children[index] = new node() { Word = Word + (char)('a' + index) });
            }
        }
    }

    /// <summary>Trie-based word counter working on the raw bytes of the input file.</summary>
    class freq02
    {
        private static readonly trie _trie = new trie();

        /// <summary>
        /// Walks the bytes of <paramref name="filename"/>. Bytes in 'a'..'z' and
        /// 'A'..'Z' descend one trie level (both cases map to the same child);
        /// any other byte ends the current word, if one is in progress.
        /// </summary>
        private static void Count(string filename)
        {
            var text = File.ReadAllBytes(filename);

            trie.node node = _trie.Root;
            foreach (var ch in text)
            {
                if (ch >= 'a' && ch <= 'z')
                {
                    node = node.WeNeedToGoDeeper(ch - 'a');
                    continue;
                }

                if (ch >= 'A' && ch <= 'Z')
                {
                    node = node.WeNeedToGoDeeper(ch - 'A');
                    continue;
                }

                if (node == _trie.Root)
                {
                    continue; // separator with no word in progress
                }

                node.Count++;
                node = _trie.Root;
            }

            // The file may end in the middle of a word.
            if (node != _trie.Root)
            {
                node.Count++;
            }
        }

        /// <summary>Writes "count word" lines to <paramref name="filename"/>, highest count first.</summary>
        private static void SortAndDump(string filename)
        {
            // OrderByDescending is a stable sort, so words with equal counts keep
            // the order in which the trie traversal produced them.
            var sorted = _trie.Enumerate(_trie.Root).OrderByDescending(n => n.Count);
            File.WriteAllLines(filename, sorted.Select(n => $"{n.Count} {n.Word}"));
        }

        /// <summary>Entry point. args[0] is the input file, args[1] the output file.</summary>
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("Usage: freq <input file> <output file>");
                return;
            }

            var stopwatch = new Stopwatch();
            stopwatch.Start();

            Count(args[0]);
            SortAndDump(args[1]);

            stopwatch.Stop();
            Console.WriteLine($"Elapsed time: {(decimal) stopwatch.ElapsedMilliseconds / 1000}s");
        }
    }
}

// ============================== src/freq05.cs ================================
namespace freq05
{
    /// <summary>
    /// Trie-based word counter whose child links live in one flat array: every
    /// node owns 26 consecutive slots starting at its PersonalndexStart, one
    /// per letter a-z.
    /// </summary>
    public class Accountant
    {
        private const int CAPACITY = 15000000;
        private const int AlphabetSize = 26;

        public class node
        {
            // Index of this node's first child slot. The spelling of the field
            // name is kept as-is because it is part of the public surface.
            public int PersonalndexStart;
            public int Count;
            public int Depth;
            public string Word;
        }

        // Maps a byte to 0..25 for 'A'..'Z' and 'a'..'z', and to -1 otherwise.
        private static readonly int[] LetterIndex = BuildLetterIndex();

        private node _root = new node();
        private char[] _enumerationWordBuf = new char[256];
        private readonly char[] _charmap = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

        private node[] _sparseNodes = new node[CAPACITY];
        public int _nextFreeChunkIndex = 26;

        /// <summary>Builds the byte-to-letter-index table used by Count.</summary>
        private static int[] BuildLetterIndex()
        {
            var table = new int[256];
            for (int i = 0; i < table.Length; i++)
            {
                table[i] = -1;
            }

            for (int letter = 0; letter < AlphabetSize; letter++)
            {
                table['A' + letter] = letter;
                table['a' + letter] = letter;
            }

            return table;
        }

        /// <summary>Enumerates every counted word, starting from the root.</summary>
        public IEnumerable<node> Enumerate()
        {
            return Enumerate(_root);
        }

        /// <summary>
        /// Lazily yields every node below <paramref name="root"/> whose Count is
        /// positive, in letter order, assigning its Word just before yielding it.
        /// The word is assembled in a buffer shared by all recursion levels:
        /// each level writes the letter at position root.Depth.
        /// </summary>
        public IEnumerable<node> Enumerate(node root)
        {
            // Words longer than the buffer used to index past its end; grow it.
            while (root.Depth >= _enumerationWordBuf.Length)
            {
                Array.Resize(ref _enumerationWordBuf, _enumerationWordBuf.Length * 2);
            }

            var realIndex = root.PersonalndexStart;
            for (int i = 0; i < AlphabetSize; i++, realIndex++)
            {
                var child = _sparseNodes[realIndex];
                _enumerationWordBuf[root.Depth] = _charmap[i];

                if (child == null)
                {
                    continue;
                }

                if (child.Count > 0)
                {
                    child.Word = new string(_enumerationWordBuf, 0, root.Depth + 1);
                    yield return child;
                }

                foreach (var descendant in Enumerate(child))
                {
                    yield return descendant;
                }
            }
        }

        /// <summary>
        /// Counts the words in <paramref name="text"/>. Bytes 'A'..'Z' and
        /// 'a'..'z' extend the current word (case-insensitively); every other
        /// byte ends it.
        /// </summary>
        public void Count(byte[] text)
        {
            var node = _root;

            for (int i = 0; i < text.Length; i++)
            {
                int index = LetterIndex[text[i]];

                if (index != -1)
                {
                    int slot = node.PersonalndexStart + index;
                    var next = _sparseNodes[slot];
                    if (next == null)
                    {
                        EnsureRoomForNextChunk();
                        next = new node
                        {
                            PersonalndexStart = _nextFreeChunkIndex,
                            Depth = node.Depth + 1
                        };
                        _sparseNodes[slot] = next;
                        _nextFreeChunkIndex += AlphabetSize;
                    }

                    node = next;
                    continue;
                }

                if (node == _root)
                {
                    continue; // separator with no word in progress
                }

                node.Count++;
                node = _root;
            }

            // The input may end in the middle of a word.
            if (node != _root)
            {
                node.Count++;
            }
        }

        /// <summary>
        /// Makes sure the slot array can hold the 26 child slots of the node
        /// that is about to be created, growing it (at least doubling) if not.
        /// </summary>
        private void EnsureRoomForNextChunk()
        {
            long required = (long)_nextFreeChunkIndex + AlphabetSize;
            if (required <= _sparseNodes.Length)
            {
                return;
            }

            if (required > int.MaxValue)
            {
                throw new InvalidOperationException("The trie is too large to be indexed with 32-bit slot offsets.");
            }

            long grown = Math.Max(required, 2L * _sparseNodes.Length);
            Array.Resize(ref _sparseNodes, (int)Math.Min(int.MaxValue, grown));
        }
    }

    /// <summary>Command-line front end for the single-threaded Accountant.</summary>
    class freq05
    {
        /// <summary>Writes "count word" lines to <paramref name="filename"/>, highest count first.</summary>
        private static void SortAndDump(Accountant accountant, string filename)
        {
            // Stable sort: equal counts keep the letter order of the traversal.
            var sorted = accountant.Enumerate().OrderByDescending(n => n.Count);
            File.WriteAllLines(filename, sorted.Select(n => $"{n.Count} {n.Word}"));
        }

        /// <summary>Entry point. args[0] is the input file, args[1] the output file.</summary>
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("Usage: freq <input file> <output file>");
                return;
            }

            var text = File.ReadAllBytes(args[0]);
            var acc = new Accountant();

            // Best-effort request to suppress collections while counting. The
            // call throws when the requested budget is larger than the runtime
            // allows; in that case simply run with the normal GC.
            try
            {
                GC.TryStartNoGCRegion(244 * 1000 * 1000, true);
            }
            catch (ArgumentOutOfRangeException)
            {
            }

            acc.Count(text);
            SortAndDump(acc, args[1]);
        }
    }
}

// ============================== src/freq06.cs ================================
namespace freq06
{
    /// <summary>
    /// Multi-threaded variant of the flat-array trie counter: the input is cut
    /// into ranges at word boundaries and every range is counted on its own
    /// thread against one shared trie.
    /// </summary>
    public class Accountant
    {
        private const int CAPACITY = 15000000;
        private const int AlphabetSize = 26;

        public class node
        {
            // Index of this node's first child slot. The spelling of the field
            // name is kept as-is because it is part of the public surface.
            public int PersonalndexStart;
            public int Count;
            public int Depth;
            public string Word;
        }

        /// <summary>Half-open byte range [Start, End) of Data handled by one worker thread.</summary>
        public class ThreadStartInfo
        {
            public int Start;
            public int End;
            public byte[] Data;
        }

        private node _root = new node();
        private char[] _enumerationWordBuf = new char[256];
        private readonly char[] _charmap = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

        // Maps a byte to 0..25 for 'A'..'Z' and 'a'..'z', and to -1 otherwise.
        private readonly int[] _charIndexLookup = BuildLetterIndex();

        // Guards node creation and growth of the slot array.
        private readonly object _gate = new object();

        private volatile node[] _sparseNodes = new node[CAPACITY];
        public volatile int _nextFreeChunkIndex = 26;

        /// <summary>Builds the byte-to-letter-index table.</summary>
        private static int[] BuildLetterIndex()
        {
            var table = new int[256];
            for (int i = 0; i < table.Length; i++)
            {
                table[i] = -1;
            }

            for (int letter = 0; letter < AlphabetSize; letter++)
            {
                table['A' + letter] = letter;
                table['a' + letter] = letter;
            }

            return table;
        }

        /// <summary>Enumerates every counted word, starting from the root.</summary>
        public IEnumerable<node> Enumerate()
        {
            return Enumerate(_root);
        }

        /// <summary>
        /// Lazily yields every node below <paramref name="root"/> whose Count is
        /// positive, in letter order, assigning its Word just before yielding it.
        /// The word is assembled in a buffer shared by all recursion levels, so
        /// this must not run while counting threads are still active.
        /// </summary>
        public IEnumerable<node> Enumerate(node root)
        {
            // Words longer than the buffer used to index past its end; grow it.
            while (root.Depth >= _enumerationWordBuf.Length)
            {
                Array.Resize(ref _enumerationWordBuf, _enumerationWordBuf.Length * 2);
            }

            var realIndex = root.PersonalndexStart;
            for (int i = 0; i < AlphabetSize; i++, realIndex++)
            {
                var child = _sparseNodes[realIndex];
                _enumerationWordBuf[root.Depth] = _charmap[i];

                if (child == null)
                {
                    continue;
                }

                if (child.Count > 0)
                {
                    child.Word = new string(_enumerationWordBuf, 0, root.Depth + 1);
                    yield return child;
                }

                foreach (var descendant in Enumerate(child))
                {
                    yield return descendant;
                }
            }
        }

        /// <summary>
        /// Counts the words in <paramref name="text"/> using up to
        /// <paramref name="chunks"/> threads and returns once all have finished.
        /// </summary>
        /// <param name="text">Raw input bytes.</param>
        /// <param name="chunks">Number of ranges to cut the input into; at least 1.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunks"/> is less than 1.</exception>
        public void Count(byte[] text, int chunks)
        {
            if (text == null)
            {
                throw new ArgumentNullException(nameof(text));
            }

            if (chunks < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(chunks), "At least one chunk is required.");
            }

            var len = text.Length;
            var chunkSize = len / chunks;
            var start = 0;
            var threads = new List<Thread>();

            for (int i = 1; i <= chunks && start < len; i++)
            {
                // The last range must reach the end of the input. The nominal
                // boundary chunkSize * chunks + 5 falls short of len whenever
                // len % chunks > 5, which used to drop the trailing words.
                int end = i == chunks
                    ? len
                    : (int)Math.Min(len, (long)chunkSize * i + 5);

                // Never split a word: move the boundary to the next non-letter.
                while (end < len && _charIndexLookup[text[end]] != -1)
                {
                    end++;
                }

                // A long word can push a boundary past the following nominal
                // one, leaving nothing to do for this range.
                if (start < end)
                {
                    var thread = new Thread(new ParameterizedThreadStart(Count));
                    thread.Start(new ThreadStartInfo { Data = text, Start = start, End = end });
                    threads.Add(thread);
                }

                // text[end] is a separator (or end == len), so it can be skipped.
                start = Math.Max(start, end + 1);
            }

            foreach (var thread in threads)
            {
                thread.Join();
            }
        }

        /// <summary>
        /// Thread body: counts the words in the range described by
        /// <paramref name="state"/>, which must be a <see cref="ThreadStartInfo"/>.
        /// </summary>
        private void Count(object state)
        {
            var tsi = (ThreadStartInfo)state;
            var data = tsi.Data;
            var node = _root;

            for (int i = tsi.Start; i < tsi.End; i++)
            {
                int index = _charIndexLookup[data[i]];

                if (index != -1)
                {
                    node = GetOrAddChild(node, index);
                    continue;
                }

                if (node == _root)
                {
                    continue; // separator with no word in progress
                }

                Interlocked.Increment(ref node.Count);
                node = _root;
            }

            // The range may end in the middle of a word.
            if (node != _root)
            {
                Interlocked.Increment(ref node.Count);
            }
        }

        /// <summary>
        /// Returns the child of <paramref name="parent"/> for letter
        /// <paramref name="index"/>, creating it if no thread has done so yet.
        /// </summary>
        private node GetOrAddChild(node parent, int index)
        {
            int slot = parent.PersonalndexStart + index;

            // Fast path without the lock. 'volatile' on the array field does not
            // extend to its elements, so the slot is read with Volatile.Read to
            // pair with the Volatile.Write that publishes a new node.
            var nodes = _sparseNodes;
            var child = Volatile.Read(ref nodes[slot]);
            if (child != null)
            {
                return child;
            }

            lock (_gate)
            {
                // Re-read under the lock: another thread may have created the
                // child, or replaced the array, in the meantime.
                nodes = _sparseNodes;
                child = nodes[slot];
                if (child == null)
                {
                    nodes = EnsureRoomForNextChunk(nodes);
                    child = new node
                    {
                        PersonalndexStart = _nextFreeChunkIndex,
                        Depth = parent.Depth + 1
                    };
                    _nextFreeChunkIndex += AlphabetSize;

                    // Publish only after the node's fields are set.
                    Volatile.Write(ref nodes[slot], child);
                }
            }

            return child;
        }

        /// <summary>
        /// Must be called with the gate held. Returns an array that can hold the
        /// 26 child slots of the node about to be created, replacing the shared
        /// array with a larger copy when the current one is full.
        /// </summary>
        private node[] EnsureRoomForNextChunk(node[] nodes)
        {
            long required = (long)_nextFreeChunkIndex + AlphabetSize;
            if (required <= nodes.Length)
            {
                return nodes;
            }

            if (required > int.MaxValue)
            {
                throw new InvalidOperationException("The trie is too large to be indexed with 32-bit slot offsets.");
            }

            long grownLength = Math.Max(required, 2L * nodes.Length);
            var grown = new node[(int)Math.Min(int.MaxValue, grownLength)];

            // Slots are only written under the gate, so the copy is complete.
            // A reader still holding the old array sees either an existing node
            // or null, and null sends it through the locked path above.
            Array.Copy(nodes, grown, nodes.Length);
            _sparseNodes = grown;
            return grown;
        }
    }

    /// <summary>Command-line front end for the multi-threaded Accountant.</summary>
    class freq06
    {
        /// <summary>Writes "count word" lines to <paramref name="filename"/>, highest count first.</summary>
        private static void SortAndDump(Accountant accountant, string filename)
        {
            // Stable sort: equal counts keep the letter order of the traversal.
            var sorted = accountant.Enumerate().OrderByDescending(n => n.Count);
            File.WriteAllLines(filename, sorted.Select(n => $"{n.Count} {n.Word}"));
        }

        /// <summary>
        /// Entry point. args[0] is the input file, args[1] the output file and
        /// the optional args[2] the number of chunks (4 when omitted).
        /// </summary>
        static void Main(string[] args)
        {
            const string usage = "Usage: freq <input file> <output file> [chunks (default=4)]";

            if (args.Length != 2 && args.Length != 3)
            {
                Console.WriteLine(usage);
                return;
            }

            int chunks = 4;
            if (args.Length == 3 && (!int.TryParse(args[2], out chunks) || chunks < 1))
            {
                // Not a number, or not a usable chunk count.
                Console.WriteLine(usage);
                return;
            }

            var text = File.ReadAllBytes(args[0]);
            var acc = new Accountant();

            // Best-effort request to suppress collections while counting. The
            // call throws when the requested budget is larger than the runtime
            // allows; in that case simply run with the normal GC.
            try
            {
                GC.TryStartNoGCRegion(244 * 1000 * 1000, true);
            }
            catch (ArgumentOutOfRangeException)
            {
            }

            acc.Count(text, chunks);
            SortAndDump(acc, args[1]);
        }
    }
}