forked from sd16spring/TextMining
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_mining.py
More file actions
106 lines (88 loc) · 3.88 KB
/
text_mining.py
File metadata and controls
106 lines (88 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
This compares word frequency, tone, average word length, and the number of unique words.
@author: Lauren Pudvan
"""
import string
from pattern.web import *
"""
TheTaleOfPeterRabbitURL = URL('http://www.gutenberg.org/cache/epub/14838/pg14838.txt').download()
TheTaleOfPeterRabbit = plaintext(TheTaleOfPeterRabbitURL)
f_TheTaleOfPeterRabbit = open('TheTaleOfPeterRabbitDownload.txt', 'w')
f_TheTaleOfPeterRabbit.write(TheTaleOfPeterRabbit.encode('UTF-8'))
f_TheTaleOfPeterRabbit.close
TheTaleOfPeterRabbitClean = open('TheTaleOfPeterRabbitDownload.txt', 'r').read()
exclude = set(string.punctuation)
TheTaleOfPeterRabbitClean = ''.join(ch for ch in TheTaleOfPeterRabbitClean if ch not in exclude)
TheTaleOfPeterRabbitClean = TheTaleOfPeterRabbitClean.lower()
FinalTheTaleOfPeterRabbit = open('TheTaleOfPeterRabbitClean.txt', 'w')
FinalTheTaleOfPeterRabbit.write(TheTaleOfPeterRabbitClean)
FinalTheTaleOfPeterRabbit.close
"""
# That was an example of how I downloaded one of the books.
# Because Gutenberg was down, I got the other two by copying and pasting them into a plain text file.
from pattern.en import *
import operator
def word_frequency(book):
    """Count how often each whitespace-separated word occurs in a text file.

    The file at path ``book`` is read in full and split on whitespace. Words
    are compared exactly as they appear in the file: this function does no
    case folding and no punctuation stripping.

    Returns a list of ``(word, count)`` tuples sorted by ascending count, so
    the words that occur most often are at the end of the list. Words that
    share the same count keep the iteration order of the underlying
    dictionary, which is not guaranteed on older Python versions; the
    example below shows one possible ordering.

    Any error raised by ``open`` (for example, a missing file) propagates to
    the caller.

    >>> word_frequency('DocTesting.txt')
    [('used', 1), ('for', 1), ('This', 1), ('doc', 1), ('is', 1), ('testing', 1)]
    """
    wordcount = {}  # maps each word to the number of times it was seen
    # The ``with`` block closes the file even if reading fails part-way.
    with open(book, 'r') as f:
        for word in f.read().split():
            wordcount[word] = wordcount.get(word, 0) + 1
    # Sort the (word, count) pairs on the count, lowest first.
    return sorted(wordcount.items(), key=operator.itemgetter(1))
def amount_of_independent_words(book):
    """Return the number of distinct words in the file at path ``book``.

    ``word_frequency`` yields one ``(word, count)`` entry per distinct word,
    so the number of entries is the number of unique words, ignoring
    repetition.

    >>> amount_of_independent_words('DocTesting.txt')
    6
    """
    return len(word_frequency(book))
def average_word_length(book):
    """Return the mean length of the whitespace-separated words in a file.

    The file at path ``book`` is read in full and split on whitespace; the
    length of every word (repetitions included) contributes to the average.

    Returns a float. A file that contains no words yields ``0.0`` instead
    of raising ``ZeroDivisionError``.

    Any error raised by ``open`` (for example, a missing file) propagates to
    the caller.

    >>> average_word_length('DocTesting.txt')
    3.8333333333333335
    """
    # The ``with`` block closes the file even if reading fails part-way.
    with open(book, 'r') as f:
        word_lengths = [len(word) for word in f.read().split()]
    if not word_lengths:
        # No words at all: avoid dividing by zero.
        return 0.0
    # float() forces true division on Python 2 as well.
    return sum(word_lengths) / float(len(word_lengths))
def tone(book):
    """Return the sentiment polarity of the text in the file at path ``book``.

    The whole file is read and passed to ``sentiment`` (brought in by the
    file's ``from pattern.en import *``). The first element of its result is
    returned, which the pattern.en documentation describes as the polarity
    score (the second element being subjectivity).

    No doctest is given because the exact score depends on pattern's
    sentiment lexicon.

    Any error raised by ``open`` (for example, a missing file) propagates to
    the caller.
    """
    # The ``with`` block closes the file even if reading fails.
    with open(book, 'r') as b:
        text = b.read()
    # Analyse the text that was actually read; the earlier version discarded
    # it and passed the builtin ``file`` type to sentiment() instead.
    sent = sentiment(text)
    return sent[0]
# if __name__ == "__main__":
# import doctest
# doctest.testmod()
# Run every analysis on each book, in the same order as before:
# word frequency, unique-word count, average word length, then tone.
for book_path in ('TheVeryHungryCatipillar.txt',
                  'TheGivingTree.txt',
                  'TheTaleOfPeterRabbitClean.txt'):
    for analysis in (word_frequency,
                     amount_of_independent_words,
                     average_word_length,
                     tone):
        # A single parenthesised argument prints identically on Python 2.
        print(analysis(book_path))