-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetFeatures.py
More file actions
98 lines (79 loc) · 3.29 KB
/
GetFeatures.py
File metadata and controls
98 lines (79 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from datetime import datetime
from nltk.classify import apply_features
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.text import Text
from nltk.tokenize import word_tokenize
from random import shuffle
import gzip
import nltk
import nltk.classify.util
import pickle
import string
# Module-level Porter stemmer, used by get_most_common_words.
porter = nltk.PorterStemmer()
# Module-level WordNet lemmatizer, used by get_most_common_words.
wnl = nltk.WordNetLemmatizer()
# Same as reviewdata class but takes one review
def get_most_common_words(rvw, amount):
    """Return the most frequent normalised tokens of one review's summary.

    The text in ``rvw["summary"]`` is tokenised and lower-cased. English stop
    words and tokens that are a single punctuation character are dropped.
    Each remaining token is Porter-stemmed and then WordNet-lemmatised
    before the frequencies are counted.

    Args:
        rvw: Mapping with a ``"summary"`` entry holding the text to analyse.
        amount: Maximum number of entries to return.

    Returns:
        The result of ``FreqDist.most_common(amount)`` over the normalised
        tokens, i.e. (token, count) pairs.
    """
    lowered = [token.lower() for token in word_tokenize(rvw["summary"])]

    english_stop_words = set(stopwords.words("english"))
    punctuation_marks = list(string.punctuation)
    kept = [
        token
        for token in lowered
        if token not in english_stop_words and token not in punctuation_marks
    ]

    # Stem first, then lemmatise the stem.
    normalised = [wnl.lemmatize(porter.stem(token)) for token in kept]
    return nltk.FreqDist(normalised).most_common(amount)
def get_most_common_taggs(reviews, amount=20, tags=("X",)):
    """Return the most frequent tokens whose POS tag is one of ``tags``.

    Every review's ``"reviewText"`` is tokenised and tagged with
    ``nltk.pos_tag``; tokens carrying one of the requested tags are counted
    across all reviews.

    Args:
        reviews: Iterable of mappings, each with a ``"reviewText"`` entry.
        amount: Maximum number of entries to return. Defaults to 20, the
            value that was previously hard-coded.
        tags: A tag or an iterable of tags to keep. Defaults to ``("X",)``,
            the value that was previously hard-coded.
            NOTE(review): the original line carried a commented-out
            alternative selecting "JJ", "JJR" and "JJS"; "X" looks like a
            placeholder that may match nothing under the tagger's default
            tagset -- confirm the intended default.

    Returns:
        The result of ``FreqDist.most_common(amount)`` over the matching
        tokens, i.e. (token, count) pairs.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(tags, str):
        tags = (tags,)
    wanted_tags = frozenset(tags)

    matching_tokens = []
    for rvw in reviews:
        tagged_text = nltk.pos_tag(word_tokenize(rvw["reviewText"]))
        matching_tokens.extend(
            token for token, tag in tagged_text if tag in wanted_tags
        )

    return nltk.FreqDist(matching_tokens).most_common(amount)
def word_feats(words):
    """Build a presence-feature dict mapping every item of ``words`` to True.

    Duplicate items collapse into a single key.
    """
    return {token: True for token in words}
def label_review(review):
    """Pair a review with a sentiment label derived from its rating.

    ``review["overall"]`` below 3.0 yields "neg", exactly 3.0 yields
    "neutral", and anything else yields "pos".

    Returns:
        A ``(review, label)`` tuple; ``review`` is the object passed in.
    """
    rating = review["overall"]
    if rating < 3.0:
        label = "neg"
    elif rating == 3.0:
        label = "neutral"
    else:
        label = "pos"
    return (review, label)
# get_features takes a review and generates features for it. It does not
# label the review itself; the "pos" / "neutral" / "neg" label is produced by
# label_review. The function returns review_features, which is a dictionary.
def get_features(review):
    """Build the feature dictionary for a single review.

    Features produced:
      * ``"helpful"``: condensed from ``review["helpful"]``, where index 0
        counts "yes" votes and index 1 counts "no" votes:
        "y" if there are more yes than no votes, "u" if they are equal,
        "n" if there are fewer yes than no votes.
      * one ``word: True`` entry for each of the (up to) 30 most common
        normalised summary words returned by get_most_common_words.

    Returns:
        The feature dictionary.
    """
    votes = review["helpful"]
    yes_votes = votes[0]
    no_votes = votes[1]
    if yes_votes > no_votes:
        helpfulness = "y"
    elif yes_votes == no_votes:
        helpfulness = "u"
    else:
        helpfulness = "n"
    features = {"helpful": helpfulness}

    # Ideas left by the original author, currently disabled:
    #   - use every word of review["nltkText"] as a feature (needs better
    #     filtering than just adding every word),
    #   - expose review["overall"] as a "score" feature,
    #   - build a preliminary classifier that turns a review's words into an
    #     aggregate score,
    #   - use get_most_common_taggs instead of get_most_common_words,
    #   - use the 10 most common bigrams of review["nltkText"] as features.
    frequent_words = [
        entry[0] for entry in get_most_common_words(review, 30)
    ]
    features.update(word_feats(frequent_words))
    return features