-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordhas2.bas
More file actions
305 lines (268 loc) · 6.95 KB
/
Wordhas2.bas
File metadata and controls
305 lines (268 loc) · 6.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
'===========================================================================
' Subject: Word Hashing Date: 03-16-03 ( : )
' Author: Antoni Gual et.al. Code: QB, PDS
' Origin: agual@eic.ictnet.es Packet: TEXT.ABC
'===========================================================================
DECLARE SUB fastsort ()
DECLARE FUNCTION getaword2$ (a$, ENDED%)
DECLARE FUNCTION updatefreq% (a$)
DECLARE FUNCTION funFirstPrime% (threshold%)
'
'----------------------------------------------------------------------------
'-----------------------------------------------------------------------------
DEFINT A-Z
CONST TRUE = -1, FALSE = 0
CONST empty$ = " "
TYPE stacktype
low AS INTEGER
hi AS INTEGER
END TYPE
DIM SHARED aa$
DIM SHARED tablesize
DIM SHARED new.words
DIM n AS LONG
Main:
CLS
LOCATE 1, 1
PRINT "WORDHASH.BAS By Rich Geldreich 1992"
PRINT " Tweaked by Quinn Tyler Jackson 1993"
PRINT " and by Antoni Gual 2003"
filename$ = RTRIM$(COMMAND$)
IF LEN(filename$) = 0 THEN
PRINT
PRINT "USE: WORDHASH textfilename"
PRINT "Results outputted to result.txt"
END
END IF
OPEN filename$ FOR INPUT AS #1 LEN = 16384
'Dont set directly the table size, for a fast hash it must be 30% bigger
' than nr of entries and be a prime number. That's what the function finds.
tablesize = funFirstPrime(10001)
REDIM SHARED wordtable(tablesize) AS STRING * 10
REDIM SHARED counts(tablesize) AS LONG
T! = TIMER
PRINT
PRINT "Processing "; filename$; " with a "; tablesize; " elements table"
aa$ = SPACE$(10)
bb$ = aa$
FOR I = 0 TO tablesize: LSET wordtable(I) = SPACE$(10): NEXT
DO UNTIL EOF(1)
LINE INPUT #1, a$
a$ = LTRIM$(RTRIM$(a$))
IF LEN(a$) THEN
'PRINT A$
a$ = UCASE$(a$)
DO
b$ = getaword2$(a$, ENDED)
IF LEN(b$) THEN
'PRINT B$; "<"
LSET bb$ = b$
wRD& = wRD& + 1
IF updatefreq(bb$) THEN GOTO exitit
END IF
LOOP UNTIL ENDED
END IF
n = n + 1
LOCATE 7, 1: PRINT USING "######## words ###### lines : ###### new words"; wRD&; n; new.words;
LOOP
exitit:
PRINT " in "; TIMER - T!; " sec."
CLOSE
PRINT "Sorting results.."
fastsort
GOSUB printtable
ERASE wordtable, counts
PRINT
PRINT "Done. You can check the results in RESULT.TXT"
a$ = INPUT$(1)
END
printtable:
OPEN "RESULT.TXT" FOR OUTPUT AS #1
PRINT #1, "Wordcount of the file:"; COMMAND$
PRINT #1, USING "######## words ###### lines : ###### new words";
Print #1
j = 0
FOR I = 0 TO tablesize
IF ASC(wordtable(I)) <> 32 THEN
PRINT #1, USING "###### , \ \ "; counts(I); wordtable(I)
'use this for CSV output to view results with MS Excel
'WRITE #1, counts(I), wordtable(I)
j = j + 1
END IF
NEXT
print #1;"----End---"
CLOSE
RETURN
SUB fastsort
'QuickSort iterative (rather than recursive) by Cornel Huth
DIM Lstack(1 TO 128) AS stacktype 'our stack
DIM Sp AS INTEGER 'out stack pointer
Sp = 1
Lstack(Sp).low = 0
Lstack(Sp).hi = tablesize
Sp = Sp + 1
DO
Sp = Sp - 1
low = Lstack(Sp).low
hi = Lstack(Sp).hi
DO
I = low
j = hi
mid = (low + hi) \ 2
GOSUB sortcounts
IF j - low < hi - I THEN
IF I < hi THEN
Lstack(Sp).low = I
Lstack(Sp).hi = hi
Sp = Sp + 1
END IF
hi = j
ELSE
IF low < j THEN
Lstack(Sp).low = low
Lstack(Sp).hi = j
Sp = Sp + 1
END IF
low = I
END IF
LOOP WHILE low < hi
LOOP WHILE Sp <> 1
EXIT SUB
sortcounts:
compare& = counts(mid)
DO
WHILE counts(I) > compare&: I = I + 1: WEND
WHILE counts(j) < compare&: j = j - 1: WEND
IF I <= j THEN
SWAP wordtable(I), wordtable(j)
SWAP counts(I), counts(j)
I = I + 1
j = j - 1
END IF
LOOP WHILE I <= j
RETURN
sortwords:
compare$ = wordtable(mid)
DO
WHILE wordtable(I) > compare$: I = I + 1: WEND
WHILE wordtable(j) < compare$: j = j - 1: WEND
IF I <= j THEN
SWAP wordtable(I), wordtable(j)
SWAP counts(I), counts(j)
I = I + 1
j = j - 1
END IF
LOOP WHILE I <= j
RETURN
END SUB
' This FUNCTION returns a prime number that is at least 30% greater than
' threshold. It will TRY to return a prime number that also fits into the
' form 4K+3, where k is any integer, but if the prime number is twice the
' size of the threshold, it will ignore this criterion.
'
' Written by Charles Graham, Tweaked by Quinn Tyler Jackson
'
FUNCTION funFirstPrime (threshold)
CONST TRUE = -1
CONST FALSE = NOT TRUE
tp30 = INT((threshold * 1.3))
IF tp30 / 2 = tp30 \ 2 THEN
tp30 = tp30 + 1
END IF
c = tp30 - 2
IF c < 1 THEN
c = 1
END IF
t2 = threshold * 2
DO
c = c + 2
FOR z = 3 TO SQR(c)
ind = TRUE
IF c / z = c \ z THEN
ind = FALSE
EXIT FOR
END IF
NEXT z
IF ind THEN
IF (c - 3) / 4 = INT((c - 3) / 4) OR c > t2 THEN
funFirstPrime = c
EXIT DO
END IF
END IF
LOOP
END FUNCTION
FUNCTION getaword2$ (a$, ENDED)
'Uses only a single string assign at the end so it's fast!
'Needs the line passed to be uppercase!
'Very buggy! Should be reworked!
'Takes all chars>128 as valid in a word!
'
STATIC ptr
IF LEN(a$) = 0 THEN ENDED = -1: EXIT FUNCTION
ENDED = 0
DEF SEG
wptr& = CLNG(SADD(a$)) + ptr - 1
DO
ptr = ptr + 1
wptr& = wptr& + 1
c = PEEK(wptr&)
LOOP WHILE (c < 65 OR (c > 90 AND c < 129)) AND (ptr < LEN(a$))
IF ptr = LEN(a$) THEN GETWORD2$ = "": ENDED = -1: ptr = 0: EXIT FUNCTION
ptr1 = ptr
DO
ptr = ptr + 1
wptr& = wptr& + 1
c = PEEK(wptr&)
LOOP UNTIL c < 65 OR (c > 90 AND c < 129) OR ptr > LEN(a$)
IF ptr > LEN(a$) THEN
getaword2$ = MID$(a$, ptr1, ptr - ptr1)
ENDED = -1: ptr = 0
ELSE
getaword2$ = MID$(a$, ptr1, ptr - ptr1)
END IF
END FUNCTION
FUNCTION updatefreq (a$)
STATIC collisions AS LONG
n = 0
FOR I = 1 TO 10 STEP 2
n = n XOR CVI(MID$(a$, I, 2))
NEXT
keyindex = n AND &H7FFF
'adjust the keyindex so its within the table
keyindex = keyindex MOD tablesize
'calculate an offset for retries
IF keyindex = 0 THEN
Offset = 1
ELSE
Offset = tablesize - keyindex
END IF
'main loop of hashing
DO
'is this entry empty?
IF wordtable(keyindex) = empty$ THEN
'add this entry to the hash table
LSET wordtable(keyindex) = a$
counts(keyindex) = 1
new.words = new.words + 1
IF new.words = tablesize THEN
updatefreq = 1
EXIT FUNCTION
END IF
EXIT FUNCTION
'is this what we're looking for?
ELSEIF wordtable(keyindex) = a$ THEN
'increment the frequency of the entry
counts(keyindex) = counts(keyindex) + 1
EXIT FUNCTION
'this entry contains a string other than what we're looking for:
'adjust the KeyIndex and try again
ELSE
collisions = collisions + 1
keyindex = keyindex - Offset
'wrap back the keyindex if it's <0
IF keyindex < 0 THEN
keyindex = keyindex + tablesize
END IF
END IF
LOOP
END FUNCTION