diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..086977d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.pyc +.idea/ +*.orig +virtual/ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..06aee8f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +nltk==3.2.4 +numpy==1.13.1 +scikit-learn==0.18.2 +six==1.10.0 +sklearn==0.0 \ No newline at end of file diff --git a/server/dataGen.py b/server/dataGen.py index 978fc5e..b82bcb4 100644 --- a/server/dataGen.py +++ b/server/dataGen.py @@ -1,37 +1,23 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import print_function -# coding: utf-8 - -# In[16]: - - - -# In[60]: - +import re import sys -sys.path.append('/anaconda/envs/py27/lib/python2.7/site-packages/') -#sys.path.append('/usr/local/lib/python2.7/site-packages') - -#for smartdata server -#sys.path.append('/usr/local/lib/python2.7/dist-packages') - import json -import mailbox -from datetime import datetime, timedelta -import re import email +import mailbox +from datetime import datetime, timedelta -# In[17]: - -# In[61]: -class emailobj: +class Email: origin = None target = None cc = None bcc = None subject = "" - date = None + date = None content = "" def __init__(self, origin, target, cc, bcc, subject, date, content): @@ -44,48 +30,44 @@ def __init__(self, origin, target, cc, bcc, subject, date, content): self.content = content -def retrievedata(mymail): - emailArchive = [] - emailmsg = None - +def retrieve_data(mymail): + email_archive = [] + for message in mymail: - emailmsg = emailobj(normalizeContacts(message['from']), - normalizeContacts(message['to']), - normalizeContacts(message['cc']), - normalizeContacts(message['bcc']), - normalizeSubject(message['subject']), - normalizeDate(message['date']), - normalizeBody(message)) - emailArchive.append(emailmsg) - return emailArchive - -def retrieveOriginaldata(mymail): - emailArchive = [] - emailmsg 
= None - + email_message = Email( + normalizeContacts(message['from']), + normalizeContacts(message['to']), + normalizeContacts(message['cc']), + normalizeContacts(message['bcc']), + normalizeSubject(message['subject']), + normalizeDate(message['date']), + normalizeBody(message)) + email_archive.append(email_message) + return email_archive + + +def retrieve_original_data(mymail): + email_archive = [] + for message in mymail: - emailmsg = emailobj(normalizeContacts(message['from']), - normalizeContacts(message['to']), - normalizeContacts(message['cc']), - normalizeContacts(message['bcc']), - normalizeSubject(message['subject']), - normalizeDate(message['date']), - message) - emailArchive.append(emailmsg) - return emailArchive - - -# In[19]: - -# In[63]: - -def genEmailArch(emailsData): - - #emailArch [[origin],[target],[cc/bcc],"subject",date, "content","urls","emails"] - emailArch = [] - for emailDir in emailsData: + email_message = Email( + normalizeContacts(message['from']), + normalizeContacts(message['to']), + normalizeContacts(message['cc']), + normalizeContacts(message['bcc']), + normalizeSubject(message['subject']), + normalizeDate(message['date']), + message) + email_archive.append(email_message) + return email_archive + + +def generate_email_archive(emails_data): + # emailArch [[origin],[target],[cc/bcc],"subject",date, "content","urls","emails"] + email_archive = [] + for emailDir in emails_data: for emailmsg in emailDir: - emailArch.append([ + email_archive.append([ list(set(emailmsg.origin)), list(set(emailmsg.target)), list(set(emailmsg.cc + emailmsg.bcc)), @@ -94,35 +76,32 @@ def genEmailArch(emailsData): emailmsg.content, infoInBody(emailmsg.content)[0], infoInBody(emailmsg.content)[1] - ]) - return emailArch + ]) + return email_archive -# In[20]: - -# In[64]: - def genNodes(emailArch): nodes = {} idCount = 0 for emailRow in emailArch: allnodes = list(set(emailRow[0] + emailRow[1] + emailRow[2])) for n in allnodes: - if(n not in nodes): + if (n not 
in nodes): nodes[n] = idCount idCount += 1 return nodes -def genEdges(emailArch,nodes): + +def genEdges(emailArch, nodes): edges = {} edgesInfo = [] - + idCount = 0 for emailRow in emailArch: for origin in emailRow[0]: targetNodes = list(set(emailRow[1] + emailRow[2])) for target in targetNodes: - tuplaKey = (nodes[origin],nodes[target]) + tuplaKey = (nodes[origin], nodes[target]) if (tuplaKey not in edges): edges[tuplaKey] = idCount idCount += 1 @@ -137,74 +116,86 @@ def genEdges(emailArch,nodes): emailRow[6], emailRow[7]]) return edgesInfo - + + def normalizeContacts(contacts): - if contacts == None : + if contacts == None: return [] else: - tl = re.findall(r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+',contacts) + tl = re.findall( + r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+', contacts) return list(set(tl)) + def normalizeSubject(mysubject): if mysubject == None: return None else: ee = email.Header.decode_header(mysubject) - return re.sub('Re: |RE: ', '', ee[0][0]) + return re.sub('Re: |RE: ', '', ee[0][0]) + def normalizeDate(mydatetime): try: filterdate = re.sub(r' \+.*$| -.*$', "", mydatetime) - #print filterdate + # print filterdate myTime = datetime.strptime(filterdate, '%a, %d %b %Y %H:%M:%S') if (myTime.year < 20): - #add 2000 years + # add 2000 years years = 2000 days_per_year = 365.24 - newtime = myTime + timedelta(days=(years*days_per_year)) + newtime = myTime + timedelta(days=(years * days_per_year)) myTime = newtime return myTime.strftime('%Y/%m/%d %H:%M:%S') except (ValueError, TypeError, NameError): return "" + def infoInBody(msgbody): - urls = re.findall(ur'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',msgbody) + urls = re.findall( + r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', msgbody) myurls = [] for url in urls: myurls.append(url[1]) - emails = re.findall(r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+',msgbody); - #msgbody = 
re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', msgbody) - + emails = re.findall( + r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+', msgbody) + # msgbody = re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', msgbody) + urlString = "" for url in myurls: urlString = urlString + url + "" - + emailString = "" for email in emails: emailString = emailString + email + "" - - return (urlString,emailString) - + + return (urlString, emailString) + + def normalizeBody(message): msgbody = getbody(message) - msgbody = removeReplies(msgbody,message) - #remove too many white spaces + msgbody = removeReplies(msgbody, message) + # remove too many white spaces msgbody = re.sub('[ ]{2,}', ' ', msgbody) - #msgbody = re.sub("(\n|<|>|}|{|\")",' ',msgbody) - msgbody = re.sub("[^a-zA-Z0-9]+",' ', msgbody) - #if(message['subject'] == 'Re: Bike sharing project'): + # msgbody = re.sub("(\n|<|>|}|{|\")",' ',msgbody) + msgbody = re.sub("[^a-zA-Z0-9]+", ' ', msgbody) + # if(message['subject'] == 'Re: Bike sharing project'): # print msgbody return msgbody -def removeReplies(msgbody,message): + +def removeReplies(msgbody, message): noReplies = msgbody - #if message['In-Reply-To'] != None: - #noReplies = ''.join(noReplies.partition('From:')[0:2]) - noReplies = re.sub(r' From:.*$| wrote:.*$| Original Message .*$| ha scritto:.*$| Da:.*$|-----Original Message-----.*$', "", noReplies) + # if message['In-Reply-To'] != None: + # noReplies = ''.join(noReplies.partition('From:')[0:2]) + noReplies = re.sub( + r' From:.*$| wrote:.*$| Original Message .*$| ha scritto:.*$| Da:.*$|-----Original Message-----.*$', "", + noReplies) noReplies = re.sub('[_]{2,}', ' ', noReplies) return noReplies -def getbody(message): #getting plain text 'email body' + +def getbody(message): # getting plain text 'email body' body = "None" if message.is_multipart(): for part in message.walk(): @@ -218,36 +209,36 @@ def 
getbody(message): #getting plain text 'email body' body = message.get_payload(decode=True) return body -def genEdgesNodesJson(edges,nodes,myContacts): - + +def genEdgesNodesJson(edges, nodes, myContacts): nodesDic = {} for k in nodes.keys(): nodesDic[nodes[k]] = k - + myContactIds = [] for myContact in myContacts: if myContact in nodes: myContactIds.append(nodes[myContact]) - + undEdgesId = 0 dicUndEdgesId = {} for k in edges: - lista = [k[1],k[2]] + lista = [k[1], k[2]] lista.sort() tupla = tuple(lista) if tupla not in dicUndEdgesId: dicUndEdgesId[tupla] = undEdgesId undEdgesId = undEdgesId + 1 - - arrEdgesDirected = [] - arrEdgesUndirected = [] - objDicDirectedNodes = {} + + arr_edges_directed = [] + arr_edges_undirected = [] + obj_dic_directed_nodes = {} objDicUndirectedNodes = {} - arrCollSubj = collaborativeSubj(edges) + arrCollSubj = collaborative_subj(edges) for k in edges: objDic = {} - - #Directed case + + # Directed case objDic['id'] = k[0] objDic['origin'] = k[1] objDic['target'] = k[2] @@ -258,16 +249,16 @@ def genEdgesNodesJson(edges,nodes,myContacts): objDic['content'] = k[5] objDic['urls'] = k[6] objDic['emails'] = k[7] - - arrEdgesDirected.append(objDic) - - objDicDirectedNodes[objDic['origin']] = objDic['originLbl'] - objDicDirectedNodes[objDic['target']] = objDic['targetLbl'] - - #Undirected case - if ((not(k[1] in myContactIds)) and (not(k[2] in myContactIds))): - if k[3] in arrCollSubj : - lista = [k[1],k[2]] + + arr_edges_directed.append(objDic) + + obj_dic_directed_nodes[objDic['origin']] = objDic['originLbl'] + obj_dic_directed_nodes[objDic['target']] = objDic['targetLbl'] + + # Undirected case + if ((not (k[1] in myContactIds)) and (not (k[2] in myContactIds))): + if k[3] in arrCollSubj: + lista = [k[1], k[2]] lista.sort() tupla = tuple(lista) objDic = {} @@ -282,173 +273,166 @@ def genEdgesNodesJson(edges,nodes,myContacts): objDic['urls'] = k[6] objDic['emails'] = k[7] - arrEdgesUndirected.append(objDic) + 
arr_edges_undirected.append(objDic) objDicUndirectedNodes[objDic['origin']] = objDic['originLbl'] objDicUndirectedNodes[objDic['target']] = objDic['targetLbl'] - - + arrNodesDirected = [] - for k in objDicDirectedNodes.keys(): + for k in obj_dic_directed_nodes.keys(): objDic = {} objDic['id'] = k - objDic['label'] = objDicDirectedNodes[k] + objDic['label'] = obj_dic_directed_nodes[k] arrNodesDirected.append(objDic) - + arrNodesUndirected = [] for k in objDicUndirectedNodes.keys(): objDic = {} objDic['id'] = k objDic['label'] = objDicUndirectedNodes[k] arrNodesUndirected.append(objDic) - + globalDic = {} internalDic = {} - internalDic['directed'] = arrEdgesDirected - internalDic['undirected'] = arrEdgesUndirected + internalDic['directed'] = arr_edges_directed + internalDic['undirected'] = arr_edges_undirected globalDic['edges'] = internalDic - + internalDic = {} internalDic['directed'] = arrNodesDirected internalDic['undirected'] = arrNodesUndirected globalDic['nodes'] = internalDic - return globalDic + def getTest(): stringa = "ciao" return stringa -def getAllArchive(mboxUrl): + +def get_all_archive(mboxUrl): emailData = [] - emailData.append(retrievedata(mailbox.mbox(str(mboxUrl)))) - - emailArch = genEmailArch(emailData) - + emailData.append(retrieve_data(mailbox.mbox(str(mboxUrl)))) + + emailArch = generate_email_archive(emailData) + return emailArch - -#returns archive dictionary {(subject,time)} -> content -def getAllArchiveText(mboxUrls): - emailData = [] - for mboxUrl in mboxUrls: - emailData.append(retrievedata(mailbox.mbox(str(mboxUrl)))) - emailArch = genEmailArch(emailData) - - dicEmails = {} - for e in emailArch: - #(subject,time) the tupla key - tupla = (e[3],e[4]) - #content to the key - dicEmails[tupla] = e[5] - return dicEmails - -def getEmailsOfSub(subj,subjTime,arrMboxUrl): + + +# returns archive dictionary {(subject,time)} -> content +def get_all_archive_text(mboxUrls): + email_data = [] + for mbox_url in mboxUrls: + 
email_data.append(retrieve_data(mailbox.mbox(str(mbox_url)))) + email_arch = generate_email_archive(email_data) + + dic_emails = {} + for email in email_arch: + # (subject,time) the tupla key + tupla = (email[3], email[4]) + # content to the key + dic_emails[tupla] = email[5] + return dic_emails + + +def getEmailsOfSub(subj, subjTime, arrMboxUrl): arrEmails = [] for mboxUrl in arrMboxUrl: mymail = mailbox.mbox(mboxUrl) for message in mymail: - if(normalizeSubject(message['subject']) == subj): + if (normalizeSubject(message['subject']) == subj): mytime = normalizeDate(message['date']) - #return [str(mytime),subjTime] - if(str(mytime) == subjTime): - msgObj = {'date': str(mytime),'allmsg':str(message)} + # return [str(mytime),subjTime] + if (str(mytime) == subjTime): + msgObj = {'date': str(mytime), 'allmsg': str(message)} arrEmails.append(msgObj) return arrEmails -def getNodesContents(nodes,mboxUrl): - emailData = [] - emailData.append(retrievedata(mailbox.mbox(str(mboxUrl)))) - - emailArch = genEmailArch(emailData) - - - return emailArch -# In[69]: - -def convertToJSON(emailArch): - emailsJSON = [] - for e in emailArch: - emailObj = {} - emailObj['from'] = e[0] - emailObj['to'] = e[1] - emailObj['cc'] = e[2] - emailObj['subject'] = e[3] - emailObj['time'] = e[4] - emailObj['content'] = e[5] - emailsJSON.append(emailObj) - return emailsJSON - - -def collaborativeSubj(edges): - dicSubj = {} - +def get_nodes_contents(nodes, mboxUrl): + email_data = list() + email_data.append(retrieve_data(mailbox.mbox(str(mboxUrl)))) + email_arch = generate_email_archive(email_data) + return email_arch + + +def convert_to_json(email_archive): + emails_json = list() + for e in email_archive: + email_obj = dict() + email_obj['from'] = e[0] + email_obj['to'] = e[1] + email_obj['cc'] = e[2] + email_obj['subject'] = e[3] + email_obj['time'] = e[4] + email_obj['content'] = e[5] + emails_json.append(email_obj) + return emails_json + + +def collaborative_subj(edges): + dic_subj = dict() + for 
k in edges: - if k[3] not in dicSubj: - dicSubj[k[3]] = [] - dicSubj[k[3]].append(k[4]) + if k[3] not in dic_subj: + dic_subj[k[3]] = [] + dic_subj[k[3]].append(k[4]) else: - if k[4] not in dicSubj[k[3]]: - dicSubj[k[3]].append(k[4]) - - arrSubj = [] - for k in dicSubj.keys(): - if (int(len(dicSubj[k])) > 1): - arrSubj.append(k) - return arrSubj + if k[4] not in dic_subj[k[3]]: + dic_subj[k[3]].append(k[4]) + + list_subj = [] + for k in dic_subj.keys(): + if len(dic_subj[k]) > 1: + list_subj.append(k) + return list_subj cmdargs = str(sys.argv) functionname = str(sys.argv[1]) -if(functionname == "getNodesEdges"): - +if functionname == "getNodesEdges": mboxUrls = re.split('NEWURL', sys.argv[2]) mboxMyContacts = re.split('NEWCONTACT', sys.argv[3]) emailData = [] - + for mboxUrl in mboxUrls: - emailData.append(retrievedata(mailbox.mbox(mboxUrl))) + emailData.append(retrieve_data(mailbox.mbox(mboxUrl))) - - emailArch = genEmailArch(emailData) + emailArch = generate_email_archive(emailData) nodes = genNodes(emailArch) - edges = genEdges(emailArch,nodes) - allData = genEdgesNodesJson(edges,nodes,mboxMyContacts) - - resultDic = {} - resultDic['allEmails'] = convertToJSON(emailArch) - resultDic['nodesEdges'] = allData - - print json.dumps(resultDic, ensure_ascii=False) + edges = genEdges(emailArch, nodes) + allData = genEdgesNodesJson(edges, nodes, mboxMyContacts) + result_dict = dict() + result_dict['allEmails'] = convert_to_json(emailArch) + result_dict['nodesEdges'] = allData + print(json.dumps(result_dict, ensure_ascii=False)) -elif(functionname == 'getOriginalEmails'): - +elif functionname == 'getOriginalEmails': mboxUrls = re.split('', sys.argv[2]) datasetName = str(sys.argv[3]) - subjectTime = sys.argv[4]+" "+sys.argv[5] + subjectTime = sys.argv[4] + " " + sys.argv[5] subject = str(sys.argv[6]) - - for i in range(7,len(sys.argv)): + + for i in range(7, len(sys.argv)): subject = subject + " " + sys.argv[i] - - globalDic = {} - globalDic['name'] = subject - 
globalDic['msgs'] = getEmailsOfSub(subject,subjectTime,mboxUrls) - - print json.dumps(globalDic, ensure_ascii=False) -elif(functionname == 'getAllEmails'): - + global_dict = dict() + global_dict['name'] = subject + global_dict['msgs'] = getEmailsOfSub(subject, subjectTime, mboxUrls) + + print(json.dumps(global_dict, ensure_ascii=False)) + +elif functionname == 'getAllEmails': mboxUrls = re.split('NEWURL', sys.argv[2]) emailData = [] - + for mboxUrl in mboxUrls: - emailData.append(retrieveOriginaldata(mailbox.mbox(mboxUrl))) + emailData.append(retrieve_original_data(mailbox.mbox(mboxUrl))) + + emailArch = generate_email_archive(emailData) - emailArch = genEmailArch(emailData) - - resultDic = {} - resultDic['allEmails'] = convertToJSON(emailArch) - print json.dumps(resultDic, ensure_ascii=False) diff --git a/server/nlp-proc.py b/server/nlp-proc.py index c494503..a879f11 100644 --- a/server/nlp-proc.py +++ b/server/nlp-proc.py @@ -1,350 +1,337 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import print_function +import re import sys -sys.path.append('/anaconda/envs/py27/lib/python2.7/site-packages/') -#sys.path.append('/usr/local/lib/python2.7/site-packages') +import json +import collections -#for smartdata server -#sys.path.append('/usr/local/lib/python2.7/dist-packages') +from datetime import datetime -import json -import re -import numpy as np +from nltk.corpus import stopwords from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import CountVectorizer from sklearn.decomposition import TruncatedSVD from sklearn.metrics.pairwise import linear_kernel -import collections -import nltk -from nltk.corpus import stopwords +from dataGen import get_all_archive as getAllArchive +from dataGen import get_all_archive_text as getAllArchiveText -from dataGen import getAllArchive -from dataGen import 
getAllArchiveText -from datetime import datetime +def set_stop_words(dataset): -def setStopWords(dataset): - stop_words = None - otherSWords = [] - - + if dataset == "clinton": stop_words = set(stopwords.words('english')) - otherSWords = ['UNCLASSIFIED','U.S.','Department', 'State', 'Case', 'No.','No','US','Doc', 'Date','From','To','Subject','Clinton','clinton','sent','Sent','Send','Ok','ok','pm','am'] - stop_words.update(list(set(otherSWords))) - - #arrContent = getContacts() - #allNames = [] - #for row in arrContent : - # split1 = row[0].split("@") - # for n in split1: - # split2= n.split(".") - # for m in split2: - # allNames.append(m) - # #allNames.append(unicode(names, "utf-8",errors='ignore')) - #allNames = list(set(allNames)) - - stop_words.update(allNames) - + other_s_words = ['UNCLASSIFIED', 'U.S.', 'Department', 'State', 'Case', 'No.', 'No', 'US', 'Doc', 'Date', + 'From', 'To', 'Subject', 'Clinton', 'clinton', 'sent', 'Sent', 'Send', 'Ok', 'ok', 'pm', 'am'] + stop_words.update(list(set(other_s_words))) + + # XXX: what? 
this wouldn't work for clinton + # stop_words.update(allNames) + raise ValueError('AllNames var is not defined') elif dataset == "enron": stop_words = set(stopwords.words('english')) - otherSWords = ['enron','No.','No','US','Ok','ok','pm','am','http','link','www','com','html','travelocity'] - stop_words.update(list(set(otherSWords))) - - #stop_words.update(allNames) - + other_s_words = ['enron', 'No.', 'No', 'US', 'Ok', 'ok', 'pm', + 'am', 'http', 'link', 'www', 'com', 'html', 'travelocity'] + stop_words.update(list(set(other_s_words))) elif dataset == "uniboIvan": stop_words = set(stopwords.words('italian')) eng_stop_words = set(stopwords.words('english')) stop_words.update(list(eng_stop_words)) - - otherSWords = ['Grazie','grazie','Saluti','saluti','Salve','salve','Distinti','distinti','Cordiali','cordiali','http','pm','am','www'] - stop_words.update(list(set(otherSWords))) - - #additional stop words - htmlTags =["a","abbr","acronym","address","area","b","base","bdo","big","blockquote","body","br","button","caption","cite","code","col","colgroup","dd","del","dfn","div","dl","DOCTYPE","dt","em","fieldset","form","h1","h2","h3","h4","h5","h6","head","html","hr","i","img","input","ins","kbd","label","legend","li","link","map","meta","noscript","object","ol","optgroup","option","p","param","pre","q","samp","script","select","small","span","strong","style","sub","sup","table","tbody","td","textarea","tfoot","th","thead","title","tr","tt","ul","var"] - otherHtmlParams = ["font","width","height","href","gif","color","size","00","image","net","asp"] - irrNumbers = [] + + other_s_words = ['Grazie', 'grazie', 'Saluti', 'saluti', 'Salve', 'salve', + 'Distinti', 'distinti', 'Cordiali', 'cordiali', 'http', 'pm', 'am', 'www'] + stop_words.update(list(set(other_s_words))) + + # additional stop words + html_tags = ["a", "abbr", "acronym", "address", "area", "b", "base", "bdo", "big", "blockquote", "body", "br", + "button", "caption", "cite", "code", "col", "colgroup", "dd", 
"del", "dfn", "div", "dl", "DOCTYPE", + "dt", "em", "fieldset", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "html", "hr", "i", "img", + "input", "ins", "kbd", "label", "legend", "li", "link", "map", "meta", "noscript", "object", "ol", + "optgroup", "option", "p", "param", "pre", "q", "samp", "script", "select", "small", "span", "strong", + "style", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt", + "ul", "var"] + other_html_params = ["font", "width", "height", "href", "gif", "color", "size", "00", "image", "net", "asp"] + irr_numbers = [] for i in range(101): - irrNumbers.append(str(i)) - - stop_words.update(list(set(htmlTags))) - stop_words.update(list(set(otherHtmlParams))) - stop_words.update(list(set(irrNumbers))) - + irr_numbers.append(str(i)) + + stop_words.update(list(set(html_tags))) + stop_words.update(list(set(other_html_params))) + stop_words.update(list(set(irr_numbers))) + return stop_words -def inStopwords(word, stopwords): - for sw in stopwords: - if iequal(word,sw): + +def in_stop_words(word, stop_words): + for stop_word in stop_words: + if iequal(word, stop_word): return True return False -def bestwords(strtext,numWords,dataset): - stop_words = setStopWords(dataset) - text = strtext - +def best_words(string_text, num_words, dataset): + + stop_words = set_stop_words(dataset) + text = string_text + words = re.findall('\w+', text) - filterWords = [] + filter_words = [] for w in words: - if not inStopwords(w, stop_words): - filterWords.append(w) - - myColl = collections.Counter(filterWords) - best10 = myColl.most_common(numWords) - - dicbest = [] - for elem in best10: - elemJson = {} - elemJson['word'] = elem[0] - elemJson['value'] = elem[1] - dicbest.append(elemJson) - #dicbest[elem[0]] = elem[1] - - globalDic = {} - globalDic['words'] = dicbest - #return json.dumps(dicbest, ensure_ascii=False) - return globalDic - -def termsCountMatrix(dataset,datasetUrls,voc): - - stopset = setStopWords(dataset) 
- #returns a dictionary where keys are the pairs (subject,time) - dicEmails = getAllArchiveText(datasetUrls) - - #create array of all emails retrieved - arrEmails = [] - for k in dicEmails.keys(): - tupla = (k,dicEmails[k]) - arrEmails.append(tupla) - - #create corpus from all emails - myCorpus = [] - for textElem in arrEmails: - msgContent = re.sub('[^0-9\'.a-zA-Z]+', ' ', textElem[1]) - myCorpus.append(unicode(msgContent, "utf-8",errors='ignore')) - - vect = CountVectorizer(analyzer='word', vocabulary = list(set(voc)), ngram_range=(1,3), stop_words = stopset) - count_matrix = vect.fit_transform(myCorpus) + if not in_stop_words(w, stop_words): + filter_words.append(w) + + my_coll = collections.Counter(filter_words) + best10 = my_coll.most_common(num_words) + + dicbest = list() + for item in best10: + item_json = dict() + item_json['word'] = item[0] + item_json['value'] = item[1] + dicbest.append(item_json) + + global_dic = dict() + global_dic['words'] = dicbest + return global_dic + + +def terms_count_matrix(dataset, dataset_urls, voc): + + stopset = set_stop_words(dataset) + # returns a dictionary where keys are the pairs (subject,time) + dic_emails = getAllArchiveText(dataset_urls) + + # create array of all emails retrieved + arr_emails = [] + for k in dic_emails.keys(): + tupla = (k, dic_emails[k]) + arr_emails.append(tupla) + + # create corpus from all emails + my_corpus = [] + for textElem in arr_emails: + msg_content = re.sub('[^0-9\'.a-zA-Z]+', ' ', textElem[1]) + my_corpus.append(u'{}'.format(msg_content)) + + vect = CountVectorizer(analyzer='word', vocabulary=list( + set(voc)), ngram_range=(1, 3), stop_words=stopset) + count_matrix = vect.fit_transform(my_corpus) feature_names = vect.get_feature_names() dense = count_matrix.todense() - - i=0 - dicTerms = {} - for timeItem in dense : - - #array of all terms freq counts for each email + + i = 0 + dic_terms = dict() + for timeItem in dense: + + # array of all terms freq counts for each email email_scores = 
timeItem.tolist()[0] num_terms = len(email_scores) - - #phrases_scores: creates an array of pairs (term_array_index, term_score) for every email - phrase_scores = [pair for pair in zip(range(0, num_terms), email_scores) if pair[1] > 0] - - #sort the phrases_scores - sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) - + + # phrases_scores: creates an array of pairs (term_array_index, term_score) for every email + phrase_scores = [pair for pair in zip( + range(0, num_terms), email_scores) if pair[1] > 0] + + # sort the phrases_scores + # XXX: sorted_phrase_scores is not used + # sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) + for phrase, score in [(feature_names[word_id], score) for (word_id, score) in phrase_scores]: - data = {'score':score,'subject':arrEmails[i][0][0],'time':arrEmails[i][0][1]} - - if dicTerms.has_key(phrase): - dicTerms[phrase].append(data) + data = {'score': score, + 'subject': arr_emails[i][0][0], 'time': arr_emails[i][0][1]} + + if phrase in dic_terms: + dic_terms[phrase].append(data) else: - dicTerms[phrase] = [data] - - i= i + 1 - - arrResTerms = [] - for termKey in dicTerms: - #arrResTerms[dicTerms[termKey]['name']] = dicTerms[termKey] - arrResTerms.append(dicTerms[termKey]) - - #returns a JSON {phrase_word:[array-of-emails]} - return dicTerms - - -def wordsXtime(dataset,wordsNum,datasetUrls): - - stopset = setStopWords(dataset) - - dicEmails = getAllArchiveText(datasetUrls) - - #create array of all emails retrieved - arrEmails = [] - for k in dicEmails.keys(): - tupla = (k,dicEmails[k]) - arrEmails.append(tupla) - - #create dictionary according to the dates - dicTime = {} - for elem in arrEmails: - #time = elem[0][1] - time = datetime.strptime(elem[0][1],'%Y/%m/%d %H:%M:%S') - stepTime = time.strftime('%Y-%m') - if stepTime in dicTime : - dicTime[stepTime].append(elem[1]) + dic_terms[phrase] = [data] + i += 1 + + arr_res_terms = [] + for termKey in dic_terms: + # 
arr_res_terms[dicTerms[termKey]['name']] = dicTerms[termKey] + arr_res_terms.append(dic_terms[termKey]) + + # returns a JSON {phrase_word:[array-of-emails]} + return dic_terms + + +def words_per_time(dataset, words_num, dataset_urls): + + stopset = set_stop_words(dataset) + dic_emails = getAllArchiveText(dataset_urls) + + # create array of all emails retrieved + arr_emails = [] + for k in dic_emails.keys(): + tupla = (k, dic_emails[k]) + arr_emails.append(tupla) + + # create dictionary according to the dates + dic_time = {} + for elem in arr_emails: + time = datetime.strptime(elem[0][1], '%Y/%m/%d %H:%M:%S') + step_time = time.strftime('%Y-%m') + if step_time in dic_time: + dic_time[step_time].append(elem[1]) else: - dicTime[stepTime] = [elem[1]] - - - #convert to arr to keep indexes - arrTimeContent = [] - for k in dicTime.keys(): - tupla = (k,dicTime[k]) - arrTimeContent.append(tupla) - - #create corpus - myCorpus = [] - for elem in arrTimeContent: - allMsgsContent = "" - for textMsg in elem[1]: - msgContent = re.sub('[^0-9\'.a-zA-Z]+', ' ', textMsg) - allMsgsContent = allMsgsContent + " " + msgContent + " " - myCorpus.append(unicode(allMsgsContent, "utf-8",errors='ignore')) - - #tf-idf vars - vect = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = stopset) - tfidf_matrix = vect.fit_transform(myCorpus) + dic_time[step_time] = [elem[1]] + + # convert to arr to keep indexes + arr_time_content = [] + for k in dic_time.keys(): + tupla = (k, dic_time[k]) + arr_time_content.append(tupla) + + # create corpus + my_corpus = [] + for elem in arr_time_content: + all_msgs_content = "" + for text_msg in elem[1]: + msg_content = re.sub('[^0-9\'.a-zA-Z]+', ' ', text_msg) + all_msgs_content = all_msgs_content + " " + msg_content + " " + my_corpus.append(u'{}'.format(all_msgs_content)) + + # tf-idf vars + vect = TfidfVectorizer(analyzer='word', ngram_range=( + 1, 3), min_df=0, stop_words=stopset) + tfidf_matrix = vect.fit_transform(my_corpus) 
feature_names = vect.get_feature_names() dense = tfidf_matrix.todense() - - #build the final arr of Timeseries - globalArrTime = [] - i=0 - for timeItem in dense : + + # build the final arr of Timeseries + global_arr_time = [] + i = 0 + for timeItem in dense: email = timeItem.tolist()[0] - phrase_scores = [pair for pair in zip(range(0, len(email)), email) if pair[1] > 0] + phrase_scores = [pair for pair in zip( + range(0, len(email)), email) if pair[1] > 0] sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) - - dicTime = {} - dicTime['name'] = str(arrTimeContent[i][0]) - dicTime['words'] = [] - - #if dicTime['name'] == "2001-05": - # for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:100]: - # print phrase+" - "+str(score) - - for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:wordsNum * 10]: - data = {'name':phrase,'score':score} - dicTime['words'].append(data) - #print dicTime - - globalArrTime.append(dicTime) + + dic_time = dict() + dic_time['name'] = str(arr_time_content[i][0]) + dic_time['words'] = [] + + for phrase, score in [(feature_names[word_id], score) + for (word_id, score) in sorted_phrase_scores][:words_num * 10]: + data = {'name': phrase, 'score': score} + dic_time['words'].append(data) + # print dicTime + + global_arr_time.append(dic_time) i = i + 1 - - finalArr = globalArrTime - myVoc = [] - #get Best Words Num - finalArr = [] - for dt in globalArrTime: - - dicTime = {} - dicTime['name'] = dt['name'] - dicTime['words'] = [] - wordsXt = wordsNum + + # get Best Words Num + my_voc = [] + final_arr = [] + for dt in global_arr_time: + dic_time = dict() + dic_time['name'] = dt['name'] + dic_time['words'] = [] + words_xt = words_num for wScore in dt['words']: - if checkGramTime(wScore,dt['words']): - dicTime['words'].append(wScore) - myVoc.append(wScore['name']) - wordsXt = wordsXt - 1 - if wordsXt == 0: + if check_gram_time(wScore, 
dt['words']): + dic_time['words'].append(wScore) + my_voc.append(wScore['name']) + words_xt = words_xt - 1 + if words_xt == 0: break - finalArr.append(dicTime) - - globalDic = {} - globalDic['wordsXtime'] = finalArr - globalDic['termsCount'] = termsCountMatrix(dataset,datasetUrls,myVoc) - return globalDic - + final_arr.append(dic_time) + + global_dic = dict() + global_dic['wordsXtime'] = final_arr + global_dic['termsCount'] = terms_count_matrix(dataset, dataset_urls, my_voc) + return global_dic + -def wordClustering(dataset,conceptNum,wordsNum,datasetUrls): +def word_clustering(dataset, conceptNum, wordsNum, datasetUrls): - stopset = setStopWords(dataset) - - dicEmails = getAllArchiveText(datasetUrls) + stop_set = set_stop_words(dataset) - #create array of all emails retrieved + dic_emails = getAllArchiveText(datasetUrls) + + # create array of all emails retrieved arrEmails = [] - for k in dicEmails.keys(): - tupla = (k,dicEmails[k]) + for k in dic_emails.keys(): + tupla = (k, dic_emails[k]) arrEmails.append(tupla) - - myCorpus = [] + + my_corpus = [] for elem in arrEmails: - filteredText = re.sub('[^0-9\'.a-zA-Z]+', ' ', elem[1]) - myCorpus.append(unicode(filteredText, "utf-8",errors='ignore')) - - vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,3), min_df = 0, stop_words = stopset) - X = vectorizer.fit_transform(myCorpus) + filtered_text = re.sub('[^0-9\'.a-zA-Z]+', ' ', elem[1]) + my_corpus.append(u'{}'.format(filtered_text)) + + vectorizer = TfidfVectorizer(use_idf=True, ngram_range=( + 1, 3), min_df=0, stop_words=stop_set) + X = vectorizer.fit_transform(my_corpus) - # Truncate the matrix to a number of components(concepts) lsa = TruncatedSVD(n_components=conceptNum, n_iter=100) lsa.fit(X) - - allData = [] + + all_data = list() terms = vectorizer.get_feature_names() for i, comp in enumerate(lsa.components_): - #merge 2 lists together terms and comp - termsInComp = zip (terms,comp) - sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) 
[:wordsNum*3] - - cluster = {} - #cluster['name'] = "Concept %d" % i - cluster['name'] = str(sortedTerms[0][0]) + # merge 2 lists together terms and comp + terms_in_comp = zip(terms, comp) + sorted_terms = sorted(terms_in_comp, key=lambda x: x[1], reverse=True)[:wordsNum*3] + + cluster = dict() + cluster['name'] = str(sorted_terms[0][0]) cluster['id'] = str(i) - clusterData =[] - #print "Concept %d:" % i - - wordsXclus = wordsNum - for term in sortedTerms: - data = {} + cluster_data = [] + + words_xclus = wordsNum + for term in sorted_terms: + data = dict() data['name'] = term[0] data['score'] = term[1] - if checkGram(data,sortedTerms): - clusterData.append(data) - wordsXclus = wordsXclus - 1 - if wordsXclus == 0: + if check_gram(data, sorted_terms): + cluster_data.append(data) + words_xclus = words_xclus - 1 + if words_xclus == 0: break - #clusterData.append(data) - #print term - cluster['children'] = clusterData - allData.append(cluster) - - arrCosSim = [] - for i in range(0,len(arrEmails)): - cosine_similarities = linear_kernel(X[i:i+1], lsa.components_).flatten() - - objDic = {} - objDic['subject'] = arrEmails[i][0][0] - objDic['time'] = arrEmails[i][0][1] - objDic['cosSim'] = [] - for i in range(0,len(cosine_similarities)-1): - objCosScore = {} - objCosScore['cluster'] = i - objCosScore['score'] = cosine_similarities[i] - objDic['cosSim'].append(objCosScore) - - arrCosSim.append(objDic) - - returnedDic = {} - returnedDic['name'] = 'clusters' - returnedDic['children'] = allData - returnedDic['emailsClusters'] = arrCosSim - - return returnedDic - -def checkGram(data,sortedTerms): - for elem in sortedTerms: + cluster['children'] = cluster_data + all_data.append(cluster) + + arr_cos_sim = [] + for i in range(0, len(arrEmails)): + cosine_similarities = linear_kernel( + X[i:i+1], lsa.components_).flatten() + + obj_dic = {} + obj_dic['subject'] = arrEmails[i][0][0] + obj_dic['time'] = arrEmails[i][0][1] + obj_dic['cosSim'] = [] + for i in range(0, 
len(cosine_similarities)-1): + obj_cos_score = {} + obj_cos_score['cluster'] = i + obj_cos_score['score'] = cosine_similarities[i] + obj_dic['cosSim'].append(obj_cos_score) + + arr_cos_sim.append(obj_dic) + + returned_dic = dict() + returned_dic['name'] = 'clusters' + returned_dic['children'] = all_data + returned_dic['emailsClusters'] = arr_cos_sim + + return returned_dic + + +def check_gram(data, sorted_terms): + for elem in sorted_terms: if data['name'] in elem[0]: if len(elem[0]) > len(data['name']): if data['score'] == elem[1]: return False return True -def checkGramTime(data,sortedTerms): + +def check_gram_time(data, sortedTerms): for elem in sortedTerms: if data['name'] in elem['name']: if len(elem['name']) > len(data['name']): @@ -352,6 +339,7 @@ def checkGramTime(data,sortedTerms): return False return True + def iequal(a, b): try: return a.upper() == b.upper() @@ -359,67 +347,45 @@ def iequal(a, b): return a == b -#When this file is called +# When this file is called cmdargs = str(sys.argv) functionname = str(sys.argv[1]) -#datasetUrl = 'server/data/mbox/enron/enron-smith-m.mbox' +# datasetUrl = 'server/data/mbox/enron/enron-smith-m.mbox' -if(functionname == "bestwords"): +if functionname == "bestwords": mboxUrls = re.split('', sys.argv[2]) datasetName = str(sys.argv[3]) numWords = int(sys.argv[4]) allText = sys.argv[5] - - globalDic = bestwords(allText,numWords,datasetName) - - print json.dumps(globalDic, ensure_ascii=False) - -elif(functionname == "wordsXtime"): + global_dict = best_words(allText, numWords, datasetName) + print(json.dumps(global_dict, ensure_ascii=False)) +elif functionname == "wordsXtime": mboxUrls = re.split('NEWURL', sys.argv[2]) datasetName = str(sys.argv[3]) wordsNum = int(sys.argv[4]) - - globalDic = {} - globalDic['name'] = 'wordsXtime' - #globalDic['columns'] = wordsXtime(datasetName,wordsNum,mboxUrls) - res = wordsXtime(datasetName,wordsNum,mboxUrls) - - print json.dumps(res, ensure_ascii=False) - -elif(functionname == 
"clusterwords"): + global_dict = dict() + global_dict['name'] = 'wordsXtime' + res = words_per_time(datasetName, wordsNum, mboxUrls) + print(json.dumps(res, ensure_ascii=False)) +elif functionname == "clusterwords": mboxUrls = re.split('NEWURL', sys.argv[2]) datasetName = str(sys.argv[3]) conceptNum = int(sys.argv[4]) wordsNum = int(sys.argv[5]) + global_dict = word_clustering(datasetName, conceptNum, wordsNum, mboxUrls) + print(json.dumps(global_dict, ensure_ascii=False)) - globalDic = wordClustering(datasetName,conceptNum,wordsNum,mboxUrls) - - print json.dumps(globalDic, ensure_ascii=False) - -elif(functionname == "getStopWords"): - +elif functionname == "getStopWords": datasetName = str(sys.argv[2]) - - arrSW = [] - for sw in setStopWords(datasetName): - arrSW.append(sw) - - globalDic = {} - globalDic['stopWords'] = arrSW - - print json.dumps(globalDic, ensure_ascii=False) - - -''' -elif(functionname == "termsCountMatrix"): - mboxUrls = re.split('', sys.argv[2]) - datasetName = str(sys.argv[3]) + arr_sw = [] + for sw in set_stop_words(datasetName): + arr_sw.append(sw) + global_dict = dict() + global_dict['stopWords'] = arr_sw + + print(json.dumps(global_dict, ensure_ascii=False)) - globalDic = {} - globalDic['termsCountMatrix'] = termsCountMatrix(datasetName,mboxUrls) - print json.dumps(globalDic, ensure_ascii=False) -''' -#exampleCall: python2.7 server/nlp-proc.py clusterwords server/data/mbox/enron/enron-smith-m.mbox enron 10 10 -#returns : {name:'clusters',children:[{'name':clusterX,'children':childrenX}], -# 'emailsClusters':[{'subject':subjX,'time':timeX,'cosSim':[{}]}]} \ No newline at end of file +# exampleCall: python2.7 server/nlp-proc.py clusterwords server/data/mbox/enron/enron-smith-m.mbox enron 10 10 +# returns : {name:'clusters',children:[{'name':clusterX,'children':childrenX}], +# 'emailsClusters':[{'subject':subjX,'time':timeX,'cosSim':[{}]}]}