TextMining/analyzeTexts.py at master · JonathanBJacobs/TextMining · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

""" Analyses the texts downloaded from Project Gutenberg

    @author: Jonathan Jacobs
"""


import pickle
import string
import re
from pattern.en import *

# Load the pickled books; analysis() below treats the result as a dict
# mapping a book title to that book's full text.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# project produced itself.
# NOTE(review): the file is opened in text mode, as before; that is only valid
# for a protocol-0 pickle on Python 2. Confirm against the script that writes
# 'gutenburgTexts' before switching to 'rb'.
with open('gutenburgTexts', 'r') as input_file:
    gutenTexts = pickle.load(input_file)


# Per-title results, filled in by analysis().
wordFreq = {}
sentimentOfBooks = {}


# runs the two things I want to see about the books
def analysis(books):
    """Clean the books, then compute and save their sentiment and word counts.

    ``books`` is a dict mapping a title to the book's text. It is modified in
    place by cleanUpBooks(). For every title the module-level dictionaries
    ``sentimentOfBooks`` and ``wordFreq`` are filled in, then:

    * ``<title>--Word_Frequency.txt`` receives the ``str()`` of a list of
      ``(count, word)`` tuples sorted in descending order;
    * ``results.txt`` receives the ``str()`` of ``sentimentOfBooks``.

    Returns None.
    """
    cleanUpBooks(books)

    # Debug output of one cleaned book; guarded so that a data set without
    # this particular title does not abort the run with a KeyError.
    if 'toHaveAndHold' in books:
        print(books['toHaveAndHold'])

    for title in books:
        sentimentOfBooks[title] = analyzeSentiment(books[title])
        wordFreq[title] = findWordFrequency(books[title])

    # writes all of the word frequency lists with appropriate titles
    for title in wordFreq:
        # (count, word) rather than (word, count) so that sorting the tuples
        # orders by frequency first.
        ranked = sorted(
            ((count, word) for word, count in wordFreq[title].items()),
            reverse=True)
        with open(title + '--Word_Frequency.txt', 'w') as freq_file:
            freq_file.write(str(ranked))

    with open('results.txt', 'w') as results_file:
        results_file.write(str(sentimentOfBooks))


# removes the Project Gutenberg header and license footer from every book
def cleanUpBooks(novels):
    """Strip the Gutenberg header and license footer from each text, in place.

    ``novels`` is a dict mapping a title to the book's text. For every entry:

    * everything from ``'*** START: FULL LICENSE ***'`` onwards is dropped;
    * everything up to and including the first ``' ***'`` that occurs before
      that footer is dropped (this is treated as the end of the header line).

    Both cuts are applied to the same text. A marker that is not present
    leaves that end of the text untouched. Returns None.
    """
    header_end_marker = ' ***'
    footer_marker = '*** START: FULL LICENSE ***'

    for title in novels:
        words = novels[title]

        # Locate the footer first so the header search cannot match the
        # ' ***' that the footer marker itself ends with.
        end = words.find(footer_marker)
        if end == -1:
            end = len(words)

        # The search starts at offset 1, as the original scan did, so a
        # marker sitting at the very first character is not taken as the
        # end of the header.
        marker_at = words.find(header_end_marker, 1, end)
        if marker_at == -1:
            start = 0
        else:
            start = marker_at + len(header_end_marker)

        novels[title] = words[start:end]


# uses pattern to analyze the sentiments of the texts
def analyzeSentiment(book):
    """Return the average ``(polarity, subjectivity)`` of the text ``book``.

    The text is split on ``?``, ``!`` and ``.``; every non-empty fragment is
    scored with ``sentiment()`` from pattern.en, whose first two result
    items are read as polarity and subjectivity. The two scores are averaged
    over the scored fragments.

    Returns ``(0.0, 0.0)`` when the text contains no non-empty fragment,
    instead of dividing by zero.
    """
    polarity = 0.0
    subjectivity = 0.0
    count = 0

    for fragment in re.split('[?!.]', book):
        if fragment == '':
            continue
        scores = sentiment(fragment)
        polarity += float(scores[0])
        subjectivity += float(scores[1])
        count += 1

    if count == 0:
        # Nothing to average (empty text or punctuation only).
        return (0.0, 0.0)
    return (polarity / count, subjectivity / count)


# finds the frequency of words throughout the entire book
def findWordFrequency(book):
    """Count how often each word occurs in the text ``book``.

    The text is split on whitespace; each token has leading and trailing
    punctuation removed and is lower-cased before counting. Tokens that
    consist only of punctuation (for example ``--``) are skipped rather than
    being counted under the empty string.

    The counting is done in memory; no scratch file is written.

    Returns a dict mapping each word to its number of occurrences.
    """
    counts = {}
    for token in book.split():
        word = token.strip(string.punctuation).lower()
        if not word:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts


# Run the analysis on the loaded books only when this file is executed as a
# script.
if __name__ == '__main__':
    analysis(gutenTexts)