multi.py
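multi.py preprocesses a labelled corpus in parallel for a multi-class text classifier: it cleans the documents under input/train/<category>/ into per-category JSONs, merges those into train-multi.json, builds a sparse document-term matrix, and writes the vocabulary, category priors, and row/column index maps to a "trained" JSON plus the matrix to dtm.mtx.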
import os, string, json, time, enchant, sys
from nltk.corpus import stopwords
from multiprocessing import Pool
import numpy as np
import scipy.io
from scipy.sparse import coo_matrix

# English dictionary for filtering non-words, plus NLTK stopwords and
# punctuation as the stop list.
words = enchant.Dict("en_US")
stop = stopwords.words('english') + list(string.punctuation)
# Read one category's JSON back in and return the set of unique words in it.
def vocap_category(arg):
    folder_name = arg[0].split("/")
    doc = set()
    folder_name = folder_name[len(folder_name) - 2]
    with open(os.getcwd() + r'/input/jsons/' + str(folder_name) + r'.json', 'r') as file1:
        dp = file1.read()
    dp = json.loads(dp)
    for i in dp:
        for sub in dp[i]:
            doc = doc.union(set(dp[i][sub].split()))
    return doc
# Clean every file in one category folder: keep alphanumeric tokens, drop stop
# words and non-dictionary words, lower-case the rest, and dump the cleaned
# documents to input/jsons/<category>.json.
def process_folder(arg):
    files = []
    path = arg[0]
    threads = int(arg[1])  # accepted with the work item; not used below
    folder_name = path.split('/')
    folder_name = folder_name[len(folder_name) - 2]
    for i in os.listdir(path):
        files.append([i, path + i])
    a = {}
    d = {}
    for i in files:
        with open(i[1], 'r') as file1:
            doc = file1.read()
        doc = " ".join([k for k in doc.split() if k.isalnum()])
        doc = " ".join([word for word in doc.split() if word not in stop and words.check(word)])
        doc = " ".join([k.lower() for k in doc.split()])
        d[str(i[0])] = doc
    a[folder_name] = d
    #print folder_name, len(a[folder_name]), "files Indexed"
    with open(os.getcwd() + r'/input/jsons/' + str(folder_name) + r'.json', 'w') as file1:
        dp = json.dumps(a, sort_keys=True, indent=4, separators=(',', ': '))
        file1.write(dp)
    return 1
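# Expected input layout, inferred from the paths above: one sub-folder per
# category under input/train/, e.g. input/train/sports/doc1.txt (names here
# are hypothetical); cleaned output lands in input/jsons/<category>.json.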
# Merge all per-category JSONs into a single train-multi.json.
def join_JSON():
    bag = {}
    json_files = [pos_json for pos_json in os.listdir(os.getcwd() + r'/input/jsons')
                  if pos_json.endswith('.json') and pos_json != 'train-multi.json']  # skip the combined file from an earlier run
    for i in json_files:
        with open(os.getcwd() + r'/input/jsons/' + str(i), 'r') as file1:
            dp = file1.read()
        dp2 = json.loads(dp)
        for cat in dp2:
            bag[str(cat)] = dp2[cat]
    with open(os.getcwd() + r'/input/jsons/train-multi.json', 'w') as file1:
        bag = json.dumps(bag, sort_keys=True, indent=4, separators=(',', ': '))
        file1.write(bag)
if __name__ == '__main__':
    start = time.time()
    #processes_limit = int(sys.argv[1])
    #threads = int(sys.argv[2])
    processes_limit = 3
    threads = 30
    print "Program started at: ", start
    print "Number of Processes used: ", processes_limit
    print "Number of Threads used: ", threads
    cwd = os.getcwd()
    train = cwd + r"/input/train/"
    categories = os.listdir(train)
    itter = []
    for i in range(0, len(categories)):
        itter.append([train + categories[i] + r'/', threads])
    # Clean every category folder in parallel.
    p = Pool(processes=processes_limit)
    results = p.map(process_folder, itter)
    # Combine all the per-category JSONs.
    join_JSON()
    # Build the per-category vocabularies in parallel.
    p = Pool(processes=processes_limit)
    results = p.map(vocap_category, itter)
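    # Pool.map blocks until every worker has returned, so by this point each
    # element of results is one category's set of unique words.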
    voc = set()  # set of all unique words in the dataset
    for i in results:
        voc = voc.union(i)
    bag = {}
    try:
        with open(os.getcwd() + r'/input/jsons/train-multi.json', 'r') as file1:
            dp = file1.read()
        bag = json.loads(dp)
    except IOError as err:
        print err
    # One entry per document, keyed "<category>_<filename>", value = token list.
    indd = {}
    for cat in bag:
        for sub in bag[cat]:
            indd[str(cat) + "_" + str(sub)] = str(bag[cat][sub]).split()
    # Count the non-zero cells so the sparse arrays can be preallocated.
    n_nonzero = 0
    vocab = set()
    for docterms in indd.values():
        unique_terms = set(docterms)
        vocab |= unique_terms
        n_nonzero += len(unique_terms)
    docnames = list(indd.keys())
    docnames = np.array(docnames)
    vocab = np.array(list(vocab))
    vocab_sorter = np.argsort(vocab)  # indices that would sort the vocabulary
    ndocs = len(docnames)
    nvocab = len(vocab)
    data = np.empty(n_nonzero, dtype=np.intc)
    rows = np.empty(n_nonzero, dtype=np.intc)
    cols = np.empty(n_nonzero, dtype=np.intc)
    ind = 0
    for docname, terms in indd.items():
        # Map this document's terms to vocabulary column indices.
        term_indices = vocab_sorter[np.searchsorted(vocab, terms, sorter=vocab_sorter)]
        uniq_indices, counts = np.unique(term_indices, return_counts=True)
        n_vals = len(uniq_indices)
        ind_end = ind + n_vals
        data[ind:ind_end] = counts
        cols[ind:ind_end] = uniq_indices
        doc_idx = np.where(docnames == docname)
        rows[ind:ind_end] = np.repeat(doc_idx, n_vals)
        ind = ind_end
    dtm = coo_matrix((data, (rows, cols)), shape=(ndocs, nvocab), dtype=np.intc)
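    # The loop above fills the sparse triplets without an inner Python loop:
    # searchsorted finds each term's position in the sorted vocabulary,
    # vocab_sorter maps that position back to the term's column index, and
    # np.unique(..., return_counts=True) collapses repeats into
    # (column, count) pairs. E.g. a hypothetical document ["cat", "dog", "cat"]
    # yields a count of 2 for the "cat" column and 1 for the "dog" column.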
    cat_list = []
    cat_prob = {}
    cat_list_value = {}  # for each category, the row indices of its documents
    vocab_value = {}  # for each vocabulary word, its column index in the matrix
    # Categories in the same order as the matrix rows (one row per document).
    for i in docnames:
        n = i.split('_')
        cat_list.append(n[0])
        cat_prob[n[0]] = 0.0  # prior probability of each category
    for i in xrange(len(cat_list)):
        cat_prob[cat_list[i]] += 1.0
        cat_list_value[cat_list[i]] = []
    # Priors: the fraction of documents that fall in each category.
    total_val = sum(cat_prob.values())
    for i in cat_prob:
        cat_prob[i] = cat_prob[i] / total_val
    # Record each vocabulary word's column index.
    for i in xrange(len(vocab)):
        vocab_value[vocab[i]] = i
    # Record the row indices of the documents in each category.
    for i in xrange(len(cat_list)):
        a = cat_list_value[cat_list[i]]
        a.append(i)
        cat_list_value[cat_list[i]] = a
    train_js = {}
    train_js['vocab'] = list(voc)
    train_js['vocab_value'] = vocab_value
    train_js['cat_list'] = cat_list
    train_js['cat_prob'] = cat_prob
    train_js['cat_list_value'] = cat_list_value
    with open(os.getcwd() + r'/input/jsons/' + "trained", 'w') as file1:
        dp = json.dumps(train_js, sort_keys=True, indent=4, separators=(',', ': '))
        file1.write(dp)
    scipy.io.mmwrite(os.getcwd() + r'/input/jsons/' + "dtm", dtm)
    end = time.time()
    print "Program ended at: ", end
    print "Total Time to process: ", end - start
    sys.exit(end - start)
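# A minimal sketch (assumed downstream usage, not part of this script) of how
# the outputs written above could be loaded back; mmwrite saves the matrix as
# dtm.mtx, and 'some_category' is a hypothetical category name:
#
#   import json, scipy.io
#   with open('input/jsons/trained') as f:
#       model = json.load(f)                         # vocab, priors, index maps
#   dtm = scipy.io.mmread('input/jsons/dtm.mtx').tocsr()
#   rows = model['cat_list_value']['some_category']  # row indices for the class
#   class_counts = dtm[rows, :].sum(axis=0)          # per-term counts for the class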