diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..086977d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.pyc +.idea/ +*.orig +virtual/ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..06aee8f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +nltk==3.2.4 +numpy==1.13.1 +scikit-learn==0.18.2 +six==1.10.0 +sklearn==0.0 \ No newline at end of file diff --git a/server/dataGen.py b/server/dataGen.py index 978fc5e..b82bcb4 100644 --- a/server/dataGen.py +++ b/server/dataGen.py @@ -1,37 +1,23 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import print_function -# coding: utf-8 - -# In[16]: - - - -# In[60]: - +import re import sys -sys.path.append('/anaconda/envs/py27/lib/python2.7/site-packages/') -#sys.path.append('/usr/local/lib/python2.7/site-packages') - -#for smartdata server -#sys.path.append('/usr/local/lib/python2.7/dist-packages') - import json -import mailbox -from datetime import datetime, timedelta -import re import email +import mailbox +from datetime import datetime, timedelta -# In[17]: - -# In[61]: -class emailobj: +class Email: origin = None target = None cc = None bcc = None subject = "" - date = None + date = None content = "" def __init__(self, origin, target, cc, bcc, subject, date, content): @@ -44,48 +30,44 @@ def __init__(self, origin, target, cc, bcc, subject, date, content): self.content = content -def retrievedata(mymail): - emailArchive = [] - emailmsg = None - +def retrieve_data(mymail): + email_archive = [] + for message in mymail: - emailmsg = emailobj(normalizeContacts(message['from']), - normalizeContacts(message['to']), - normalizeContacts(message['cc']), - normalizeContacts(message['bcc']), - normalizeSubject(message['subject']), - normalizeDate(message['date']), - normalizeBody(message)) - emailArchive.append(emailmsg) - return emailArchive - -def retrieveOriginaldata(mymail): - emailArchive = [] - emailmsg 
= None - + email_message = Email( + normalizeContacts(message['from']), + normalizeContacts(message['to']), + normalizeContacts(message['cc']), + normalizeContacts(message['bcc']), + normalizeSubject(message['subject']), + normalizeDate(message['date']), + normalizeBody(message)) + email_archive.append(email_message) + return email_archive + + +def retrieve_original_data(mymail): + email_archive = [] + for message in mymail: - emailmsg = emailobj(normalizeContacts(message['from']), - normalizeContacts(message['to']), - normalizeContacts(message['cc']), - normalizeContacts(message['bcc']), - normalizeSubject(message['subject']), - normalizeDate(message['date']), - message) - emailArchive.append(emailmsg) - return emailArchive - - -# In[19]: - -# In[63]: - -def genEmailArch(emailsData): - - #emailArch [[origin],[target],[cc/bcc],"subject",date, "content","urls","emails"] - emailArch = [] - for emailDir in emailsData: + email_message = Email( + normalizeContacts(message['from']), + normalizeContacts(message['to']), + normalizeContacts(message['cc']), + normalizeContacts(message['bcc']), + normalizeSubject(message['subject']), + normalizeDate(message['date']), + message) + email_archive.append(email_message) + return email_archive + + +def generate_email_archive(emails_data): + # emailArch [[origin],[target],[cc/bcc],"subject",date, "content","urls","emails"] + email_archive = [] + for emailDir in emails_data: for emailmsg in emailDir: - emailArch.append([ + email_archive.append([ list(set(emailmsg.origin)), list(set(emailmsg.target)), list(set(emailmsg.cc + emailmsg.bcc)), @@ -94,35 +76,32 @@ def genEmailArch(emailsData): emailmsg.content, infoInBody(emailmsg.content)[0], infoInBody(emailmsg.content)[1] - ]) - return emailArch + ]) + return email_archive -# In[20]: - -# In[64]: - def genNodes(emailArch): nodes = {} idCount = 0 for emailRow in emailArch: allnodes = list(set(emailRow[0] + emailRow[1] + emailRow[2])) for n in allnodes: - if(n not in nodes): + if (n not 
in nodes): nodes[n] = idCount idCount += 1 return nodes -def genEdges(emailArch,nodes): + +def genEdges(emailArch, nodes): edges = {} edgesInfo = [] - + idCount = 0 for emailRow in emailArch: for origin in emailRow[0]: targetNodes = list(set(emailRow[1] + emailRow[2])) for target in targetNodes: - tuplaKey = (nodes[origin],nodes[target]) + tuplaKey = (nodes[origin], nodes[target]) if (tuplaKey not in edges): edges[tuplaKey] = idCount idCount += 1 @@ -137,74 +116,86 @@ def genEdges(emailArch,nodes): emailRow[6], emailRow[7]]) return edgesInfo - + + def normalizeContacts(contacts): - if contacts == None : + if contacts == None: return [] else: - tl = re.findall(r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+',contacts) + tl = re.findall( + r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+', contacts) return list(set(tl)) + def normalizeSubject(mysubject): if mysubject == None: return None else: ee = email.Header.decode_header(mysubject) - return re.sub('Re: |RE: ', '', ee[0][0]) + return re.sub('Re: |RE: ', '', ee[0][0]) + def normalizeDate(mydatetime): try: filterdate = re.sub(r' \+.*$| -.*$', "", mydatetime) - #print filterdate + # print filterdate myTime = datetime.strptime(filterdate, '%a, %d %b %Y %H:%M:%S') if (myTime.year < 20): - #add 2000 years + # add 2000 years years = 2000 days_per_year = 365.24 - newtime = myTime + timedelta(days=(years*days_per_year)) + newtime = myTime + timedelta(days=(years * days_per_year)) myTime = newtime return myTime.strftime('%Y/%m/%d %H:%M:%S') except (ValueError, TypeError, NameError): return "" + def infoInBody(msgbody): - urls = re.findall(ur'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',msgbody) + urls = re.findall( + r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', msgbody) myurls = [] for url in urls: myurls.append(url[1]) - emails = re.findall(r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+',msgbody); - #msgbody = 
re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', msgbody) - + emails = re.findall( + r'[a-zA-Z0-9_-]+[.|\w]\w+@[a-zA-Z0-9_-]+[.]\w+[.|\w+]+', msgbody) + # msgbody = re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', msgbody) + urlString = "" for url in myurls: urlString = urlString + url + "" - + emailString = "" for email in emails: emailString = emailString + email + "" - - return (urlString,emailString) - + + return (urlString, emailString) + + def normalizeBody(message): msgbody = getbody(message) - msgbody = removeReplies(msgbody,message) - #remove too many white spaces + msgbody = removeReplies(msgbody, message) + # remove too many white spaces msgbody = re.sub('[ ]{2,}', ' ', msgbody) - #msgbody = re.sub("(\n|<|>|}|{|\")",' ',msgbody) - msgbody = re.sub("[^a-zA-Z0-9]+",' ', msgbody) - #if(message['subject'] == 'Re: Bike sharing project'): + # msgbody = re.sub("(\n|<|>|}|{|\")",' ',msgbody) + msgbody = re.sub("[^a-zA-Z0-9]+", ' ', msgbody) + # if(message['subject'] == 'Re: Bike sharing project'): # print msgbody return msgbody -def removeReplies(msgbody,message): + +def removeReplies(msgbody, message): noReplies = msgbody - #if message['In-Reply-To'] != None: - #noReplies = ''.join(noReplies.partition('From:')[0:2]) - noReplies = re.sub(r' From:.*$| wrote:.*$| Original Message .*$| ha scritto:.*$| Da:.*$|-----Original Message-----.*$', "", noReplies) + # if message['In-Reply-To'] != None: + # noReplies = ''.join(noReplies.partition('From:')[0:2]) + noReplies = re.sub( + r' From:.*$| wrote:.*$| Original Message .*$| ha scritto:.*$| Da:.*$|-----Original Message-----.*$', "", + noReplies) noReplies = re.sub('[_]{2,}', ' ', noReplies) return noReplies -def getbody(message): #getting plain text 'email body' + +def getbody(message): # getting plain text 'email body' body = "None" if message.is_multipart(): for part in message.walk(): @@ -218,36 +209,36 @@ def 
getbody(message): #getting plain text 'email body' body = message.get_payload(decode=True) return body -def genEdgesNodesJson(edges,nodes,myContacts): - + +def genEdgesNodesJson(edges, nodes, myContacts): nodesDic = {} for k in nodes.keys(): nodesDic[nodes[k]] = k - + myContactIds = [] for myContact in myContacts: if myContact in nodes: myContactIds.append(nodes[myContact]) - + undEdgesId = 0 dicUndEdgesId = {} for k in edges: - lista = [k[1],k[2]] + lista = [k[1], k[2]] lista.sort() tupla = tuple(lista) if tupla not in dicUndEdgesId: dicUndEdgesId[tupla] = undEdgesId undEdgesId = undEdgesId + 1 - - arrEdgesDirected = [] - arrEdgesUndirected = [] - objDicDirectedNodes = {} + + arr_edges_directed = [] + arr_edges_undirected = [] + obj_dic_directed_nodes = {} objDicUndirectedNodes = {} - arrCollSubj = collaborativeSubj(edges) + arrCollSubj = collaborative_subj(edges) for k in edges: objDic = {} - - #Directed case + + # Directed case objDic['id'] = k[0] objDic['origin'] = k[1] objDic['target'] = k[2] @@ -258,16 +249,16 @@ def genEdgesNodesJson(edges,nodes,myContacts): objDic['content'] = k[5] objDic['urls'] = k[6] objDic['emails'] = k[7] - - arrEdgesDirected.append(objDic) - - objDicDirectedNodes[objDic['origin']] = objDic['originLbl'] - objDicDirectedNodes[objDic['target']] = objDic['targetLbl'] - - #Undirected case - if ((not(k[1] in myContactIds)) and (not(k[2] in myContactIds))): - if k[3] in arrCollSubj : - lista = [k[1],k[2]] + + arr_edges_directed.append(objDic) + + obj_dic_directed_nodes[objDic['origin']] = objDic['originLbl'] + obj_dic_directed_nodes[objDic['target']] = objDic['targetLbl'] + + # Undirected case + if ((not (k[1] in myContactIds)) and (not (k[2] in myContactIds))): + if k[3] in arrCollSubj: + lista = [k[1], k[2]] lista.sort() tupla = tuple(lista) objDic = {} @@ -282,173 +273,166 @@ def genEdgesNodesJson(edges,nodes,myContacts): objDic['urls'] = k[6] objDic['emails'] = k[7] - arrEdgesUndirected.append(objDic) + 
arr_edges_undirected.append(objDic) objDicUndirectedNodes[objDic['origin']] = objDic['originLbl'] objDicUndirectedNodes[objDic['target']] = objDic['targetLbl'] - - + arrNodesDirected = [] - for k in objDicDirectedNodes.keys(): + for k in obj_dic_directed_nodes.keys(): objDic = {} objDic['id'] = k - objDic['label'] = objDicDirectedNodes[k] + objDic['label'] = obj_dic_directed_nodes[k] arrNodesDirected.append(objDic) - + arrNodesUndirected = [] for k in objDicUndirectedNodes.keys(): objDic = {} objDic['id'] = k objDic['label'] = objDicUndirectedNodes[k] arrNodesUndirected.append(objDic) - + globalDic = {} internalDic = {} - internalDic['directed'] = arrEdgesDirected - internalDic['undirected'] = arrEdgesUndirected + internalDic['directed'] = arr_edges_directed + internalDic['undirected'] = arr_edges_undirected globalDic['edges'] = internalDic - + internalDic = {} internalDic['directed'] = arrNodesDirected internalDic['undirected'] = arrNodesUndirected globalDic['nodes'] = internalDic - return globalDic + def getTest(): stringa = "ciao" return stringa -def getAllArchive(mboxUrl): + +def get_all_archive(mboxUrl): emailData = [] - emailData.append(retrievedata(mailbox.mbox(str(mboxUrl)))) - - emailArch = genEmailArch(emailData) - + emailData.append(retrieve_data(mailbox.mbox(str(mboxUrl)))) + + emailArch = generate_email_archive(emailData) + return emailArch - -#returns archive dictionary {(subject,time)} -> content -def getAllArchiveText(mboxUrls): - emailData = [] - for mboxUrl in mboxUrls: - emailData.append(retrievedata(mailbox.mbox(str(mboxUrl)))) - emailArch = genEmailArch(emailData) - - dicEmails = {} - for e in emailArch: - #(subject,time) the tupla key - tupla = (e[3],e[4]) - #content to the key - dicEmails[tupla] = e[5] - return dicEmails - -def getEmailsOfSub(subj,subjTime,arrMboxUrl): + + +# returns archive dictionary {(subject,time)} -> content +def get_all_archive_text(mboxUrls): + email_data = [] + for mbox_url in mboxUrls: + 
email_data.append(retrieve_data(mailbox.mbox(str(mbox_url)))) + email_arch = generate_email_archive(email_data) + + dic_emails = {} + for email in email_arch: + # (subject,time) the tupla key + tupla = (email[3], email[4]) + # content to the key + dic_emails[tupla] = email[5] + return dic_emails + + +def getEmailsOfSub(subj, subjTime, arrMboxUrl): arrEmails = [] for mboxUrl in arrMboxUrl: mymail = mailbox.mbox(mboxUrl) for message in mymail: - if(normalizeSubject(message['subject']) == subj): + if (normalizeSubject(message['subject']) == subj): mytime = normalizeDate(message['date']) - #return [str(mytime),subjTime] - if(str(mytime) == subjTime): - msgObj = {'date': str(mytime),'allmsg':str(message)} + # return [str(mytime),subjTime] + if (str(mytime) == subjTime): + msgObj = {'date': str(mytime), 'allmsg': str(message)} arrEmails.append(msgObj) return arrEmails -def getNodesContents(nodes,mboxUrl): - emailData = [] - emailData.append(retrievedata(mailbox.mbox(str(mboxUrl)))) - - emailArch = genEmailArch(emailData) - - - return emailArch -# In[69]: - -def convertToJSON(emailArch): - emailsJSON = [] - for e in emailArch: - emailObj = {} - emailObj['from'] = e[0] - emailObj['to'] = e[1] - emailObj['cc'] = e[2] - emailObj['subject'] = e[3] - emailObj['time'] = e[4] - emailObj['content'] = e[5] - emailsJSON.append(emailObj) - return emailsJSON - - -def collaborativeSubj(edges): - dicSubj = {} - +def get_nodes_contents(nodes, mboxUrl): + email_data = list() + email_data.append(retrieve_data(mailbox.mbox(str(mboxUrl)))) + email_arch = generate_email_archive(email_data) + return email_arch + + +def convert_to_json(email_archive): + emails_json = list() + for e in email_archive: + email_obj = dict() + email_obj['from'] = e[0] + email_obj['to'] = e[1] + email_obj['cc'] = e[2] + email_obj['subject'] = e[3] + email_obj['time'] = e[4] + email_obj['content'] = e[5] + emails_json.append(email_obj) + return emails_json + + +def collaborative_subj(edges): + dic_subj = dict() + for 
k in edges: - if k[3] not in dicSubj: - dicSubj[k[3]] = [] - dicSubj[k[3]].append(k[4]) + if k[3] not in dic_subj: + dic_subj[k[3]] = [] + dic_subj[k[3]].append(k[4]) else: - if k[4] not in dicSubj[k[3]]: - dicSubj[k[3]].append(k[4]) - - arrSubj = [] - for k in dicSubj.keys(): - if (int(len(dicSubj[k])) > 1): - arrSubj.append(k) - return arrSubj + if k[4] not in dic_subj[k[3]]: + dic_subj[k[3]].append(k[4]) + + list_subj = [] + for k in dic_subj.keys(): + if len(dic_subj[k]) > 1: + list_subj.append(k) + return list_subj cmdargs = str(sys.argv) functionname = str(sys.argv[1]) -if(functionname == "getNodesEdges"): - +if functionname == "getNodesEdges": mboxUrls = re.split('NEWURL', sys.argv[2]) mboxMyContacts = re.split('NEWCONTACT', sys.argv[3]) emailData = [] - + for mboxUrl in mboxUrls: - emailData.append(retrievedata(mailbox.mbox(mboxUrl))) + emailData.append(retrieve_data(mailbox.mbox(mboxUrl))) - - emailArch = genEmailArch(emailData) + emailArch = generate_email_archive(emailData) nodes = genNodes(emailArch) - edges = genEdges(emailArch,nodes) - allData = genEdgesNodesJson(edges,nodes,mboxMyContacts) - - resultDic = {} - resultDic['allEmails'] = convertToJSON(emailArch) - resultDic['nodesEdges'] = allData - - print json.dumps(resultDic, ensure_ascii=False) + edges = genEdges(emailArch, nodes) + allData = genEdgesNodesJson(edges, nodes, mboxMyContacts) + result_dict = dict() + result_dict['allEmails'] = convert_to_json(emailArch) + result_dict['nodesEdges'] = allData + print(json.dumps(result_dict, ensure_ascii=False)) -elif(functionname == 'getOriginalEmails'): - +elif functionname == 'getOriginalEmails': mboxUrls = re.split('', sys.argv[2]) datasetName = str(sys.argv[3]) - subjectTime = sys.argv[4]+" "+sys.argv[5] + subjectTime = sys.argv[4] + " " + sys.argv[5] subject = str(sys.argv[6]) - - for i in range(7,len(sys.argv)): + + for i in range(7, len(sys.argv)): subject = subject + " " + sys.argv[i] - - globalDic = {} - globalDic['name'] = subject - 
globalDic['msgs'] = getEmailsOfSub(subject,subjectTime,mboxUrls) - - print json.dumps(globalDic, ensure_ascii=False) -elif(functionname == 'getAllEmails'): - + global_dict = dict() + global_dict['name'] = subject + global_dict['msgs'] = getEmailsOfSub(subject, subjectTime, mboxUrls) + + print(json.dumps(global_dict, ensure_ascii=False)) + +elif functionname == 'getAllEmails': mboxUrls = re.split('NEWURL', sys.argv[2]) emailData = [] - + for mboxUrl in mboxUrls: - emailData.append(retrieveOriginaldata(mailbox.mbox(mboxUrl))) + emailData.append(retrieve_original_data(mailbox.mbox(mboxUrl))) + + emailArch = generate_email_archive(emailData) - emailArch = genEmailArch(emailData) - - resultDic = {} - resultDic['allEmails'] = convertToJSON(emailArch) - print json.dumps(resultDic, ensure_ascii=False) diff --git a/server/nlp-proc.py b/server/nlp-proc.py index c494503..a879f11 100644 --- a/server/nlp-proc.py +++ b/server/nlp-proc.py @@ -1,350 +1,337 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import print_function +import re import sys -sys.path.append('/anaconda/envs/py27/lib/python2.7/site-packages/') -#sys.path.append('/usr/local/lib/python2.7/site-packages') +import json +import collections -#for smartdata server -#sys.path.append('/usr/local/lib/python2.7/dist-packages') +from datetime import datetime -import json -import re -import numpy as np +from nltk.corpus import stopwords from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import CountVectorizer from sklearn.decomposition import TruncatedSVD from sklearn.metrics.pairwise import linear_kernel -import collections -import nltk -from nltk.corpus import stopwords +from dataGen import get_all_archive as getAllArchive +from dataGen import get_all_archive_text as getAllArchiveText -from dataGen import getAllArchive -from dataGen import 
getAllArchiveText -from datetime import datetime +def set_stop_words(dataset): -def setStopWords(dataset): - stop_words = None - otherSWords = [] - - + if dataset == "clinton": stop_words = set(stopwords.words('english')) - otherSWords = ['UNCLASSIFIED','U.S.','Department', 'State', 'Case', 'No.','No','US','Doc', 'Date','From','To','Subject','Clinton','clinton','sent','Sent','Send','Ok','ok','pm','am'] - stop_words.update(list(set(otherSWords))) - - #arrContent = getContacts() - #allNames = [] - #for row in arrContent : - # split1 = row[0].split("@") - # for n in split1: - # split2= n.split(".") - # for m in split2: - # allNames.append(m) - # #allNames.append(unicode(names, "utf-8",errors='ignore')) - #allNames = list(set(allNames)) - - stop_words.update(allNames) - + other_s_words = ['UNCLASSIFIED', 'U.S.', 'Department', 'State', 'Case', 'No.', 'No', 'US', 'Doc', 'Date', + 'From', 'To', 'Subject', 'Clinton', 'clinton', 'sent', 'Sent', 'Send', 'Ok', 'ok', 'pm', 'am'] + stop_words.update(list(set(other_s_words))) + + # XXX: what? 
this wouldn't work for clinton + # stop_words.update(allNames) + raise ValueError('AllNames var is not defined') elif dataset == "enron": stop_words = set(stopwords.words('english')) - otherSWords = ['enron','No.','No','US','Ok','ok','pm','am','http','link','www','com','html','travelocity'] - stop_words.update(list(set(otherSWords))) - - #stop_words.update(allNames) - + other_s_words = ['enron', 'No.', 'No', 'US', 'Ok', 'ok', 'pm', + 'am', 'http', 'link', 'www', 'com', 'html', 'travelocity'] + stop_words.update(list(set(other_s_words))) elif dataset == "uniboIvan": stop_words = set(stopwords.words('italian')) eng_stop_words = set(stopwords.words('english')) stop_words.update(list(eng_stop_words)) - - otherSWords = ['Grazie','grazie','Saluti','saluti','Salve','salve','Distinti','distinti','Cordiali','cordiali','http','pm','am','www'] - stop_words.update(list(set(otherSWords))) - - #additional stop words - htmlTags =["a","abbr","acronym","address","area","b","base","bdo","big","blockquote","body","br","button","caption","cite","code","col","colgroup","dd","del","dfn","div","dl","DOCTYPE","dt","em","fieldset","form","h1","h2","h3","h4","h5","h6","head","html","hr","i","img","input","ins","kbd","label","legend","li","link","map","meta","noscript","object","ol","optgroup","option","p","param","pre","q","samp","script","select","small","span","strong","style","sub","sup","table","tbody","td","textarea","tfoot","th","thead","title","tr","tt","ul","var"] - otherHtmlParams = ["font","width","height","href","gif","color","size","00","image","net","asp"] - irrNumbers = [] + + other_s_words = ['Grazie', 'grazie', 'Saluti', 'saluti', 'Salve', 'salve', + 'Distinti', 'distinti', 'Cordiali', 'cordiali', 'http', 'pm', 'am', 'www'] + stop_words.update(list(set(other_s_words))) + + # additional stop words + html_tags = ["a", "abbr", "acronym", "address", "area", "b", "base", "bdo", "big", "blockquote", "body", "br", + "button", "caption", "cite", "code", "col", "colgroup", "dd", 
"del", "dfn", "div", "dl", "DOCTYPE", + "dt", "em", "fieldset", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "html", "hr", "i", "img", + "input", "ins", "kbd", "label", "legend", "li", "link", "map", "meta", "noscript", "object", "ol", + "optgroup", "option", "p", "param", "pre", "q", "samp", "script", "select", "small", "span", "strong", + "style", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt", + "ul", "var"] + other_html_params = ["font", "width", "height", "href", "gif", "color", "size", "00", "image", "net", "asp"] + irr_numbers = [] for i in range(101): - irrNumbers.append(str(i)) - - stop_words.update(list(set(htmlTags))) - stop_words.update(list(set(otherHtmlParams))) - stop_words.update(list(set(irrNumbers))) - + irr_numbers.append(str(i)) + + stop_words.update(list(set(html_tags))) + stop_words.update(list(set(other_html_params))) + stop_words.update(list(set(irr_numbers))) + return stop_words -def inStopwords(word, stopwords): - for sw in stopwords: - if iequal(word,sw): + +def in_stop_words(word, stop_words): + for stop_word in stop_words: + if iequal(word, stop_word): return True return False -def bestwords(strtext,numWords,dataset): - stop_words = setStopWords(dataset) - text = strtext - +def best_words(string_text, num_words, dataset): + + stop_words = set_stop_words(dataset) + text = string_text + words = re.findall('\w+', text) - filterWords = [] + filter_words = [] for w in words: - if not inStopwords(w, stop_words): - filterWords.append(w) - - myColl = collections.Counter(filterWords) - best10 = myColl.most_common(numWords) - - dicbest = [] - for elem in best10: - elemJson = {} - elemJson['word'] = elem[0] - elemJson['value'] = elem[1] - dicbest.append(elemJson) - #dicbest[elem[0]] = elem[1] - - globalDic = {} - globalDic['words'] = dicbest - #return json.dumps(dicbest, ensure_ascii=False) - return globalDic - -def termsCountMatrix(dataset,datasetUrls,voc): - - stopset = setStopWords(dataset) 
- #returns a dictionary where keys are the pairs (subject,time) - dicEmails = getAllArchiveText(datasetUrls) - - #create array of all emails retrieved - arrEmails = [] - for k in dicEmails.keys(): - tupla = (k,dicEmails[k]) - arrEmails.append(tupla) - - #create corpus from all emails - myCorpus = [] - for textElem in arrEmails: - msgContent = re.sub('[^0-9\'.a-zA-Z]+', ' ', textElem[1]) - myCorpus.append(unicode(msgContent, "utf-8",errors='ignore')) - - vect = CountVectorizer(analyzer='word', vocabulary = list(set(voc)), ngram_range=(1,3), stop_words = stopset) - count_matrix = vect.fit_transform(myCorpus) + if not in_stop_words(w, stop_words): + filter_words.append(w) + + my_coll = collections.Counter(filter_words) + best10 = my_coll.most_common(num_words) + + dicbest = list() + for item in best10: + item_json = dict() + item_json['word'] = item[0] + item_json['value'] = item[1] + dicbest.append(item_json) + + global_dic = dict() + global_dic['words'] = dicbest + return global_dic + + +def terms_count_matrix(dataset, dataset_urls, voc): + + stopset = set_stop_words(dataset) + # returns a dictionary where keys are the pairs (subject,time) + dic_emails = getAllArchiveText(dataset_urls) + + # create array of all emails retrieved + arr_emails = [] + for k in dic_emails.keys(): + tupla = (k, dic_emails[k]) + arr_emails.append(tupla) + + # create corpus from all emails + my_corpus = [] + for textElem in arr_emails: + msg_content = re.sub('[^0-9\'.a-zA-Z]+', ' ', textElem[1]) + my_corpus.append(u'{}'.format(msg_content)) + + vect = CountVectorizer(analyzer='word', vocabulary=list( + set(voc)), ngram_range=(1, 3), stop_words=stopset) + count_matrix = vect.fit_transform(my_corpus) feature_names = vect.get_feature_names() dense = count_matrix.todense() - - i=0 - dicTerms = {} - for timeItem in dense : - - #array of all terms freq counts for each email + + i = 0 + dic_terms = dict() + for timeItem in dense: + + # array of all terms freq counts for each email email_scores = 
timeItem.tolist()[0] num_terms = len(email_scores) - - #phrases_scores: creates an array of pairs (term_array_index, term_score) for every email - phrase_scores = [pair for pair in zip(range(0, num_terms), email_scores) if pair[1] > 0] - - #sort the phrases_scores - sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) - + + # phrases_scores: creates an array of pairs (term_array_index, term_score) for every email + phrase_scores = [pair for pair in zip( + range(0, num_terms), email_scores) if pair[1] > 0] + + # sort the phrases_scores + # XXX: sorted_phrase_scores is not used + # sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) + for phrase, score in [(feature_names[word_id], score) for (word_id, score) in phrase_scores]: - data = {'score':score,'subject':arrEmails[i][0][0],'time':arrEmails[i][0][1]} - - if dicTerms.has_key(phrase): - dicTerms[phrase].append(data) + data = {'score': score, + 'subject': arr_emails[i][0][0], 'time': arr_emails[i][0][1]} + + if phrase in dic_terms: + dic_terms[phrase].append(data) else: - dicTerms[phrase] = [data] - - i= i + 1 - - arrResTerms = [] - for termKey in dicTerms: - #arrResTerms[dicTerms[termKey]['name']] = dicTerms[termKey] - arrResTerms.append(dicTerms[termKey]) - - #returns a JSON {phrase_word:[array-of-emails]} - return dicTerms - - -def wordsXtime(dataset,wordsNum,datasetUrls): - - stopset = setStopWords(dataset) - - dicEmails = getAllArchiveText(datasetUrls) - - #create array of all emails retrieved - arrEmails = [] - for k in dicEmails.keys(): - tupla = (k,dicEmails[k]) - arrEmails.append(tupla) - - #create dictionary according to the dates - dicTime = {} - for elem in arrEmails: - #time = elem[0][1] - time = datetime.strptime(elem[0][1],'%Y/%m/%d %H:%M:%S') - stepTime = time.strftime('%Y-%m') - if stepTime in dicTime : - dicTime[stepTime].append(elem[1]) + dic_terms[phrase] = [data] + i += 1 + + arr_res_terms = [] + for termKey in dic_terms: + # 
arr_res_terms[dicTerms[termKey]['name']] = dicTerms[termKey] + arr_res_terms.append(dic_terms[termKey]) + + # returns a JSON {phrase_word:[array-of-emails]} + return dic_terms + + +def words_per_time(dataset, words_num, dataset_urls): + + stopset = set_stop_words(dataset) + dic_emails = getAllArchiveText(dataset_urls) + + # create array of all emails retrieved + arr_emails = [] + for k in dic_emails.keys(): + tupla = (k, dic_emails[k]) + arr_emails.append(tupla) + + # create dictionary according to the dates + dic_time = {} + for elem in arr_emails: + time = datetime.strptime(elem[0][1], '%Y/%m/%d %H:%M:%S') + step_time = time.strftime('%Y-%m') + if step_time in dic_time: + dic_time[step_time].append(elem[1]) else: - dicTime[stepTime] = [elem[1]] - - - #convert to arr to keep indexes - arrTimeContent = [] - for k in dicTime.keys(): - tupla = (k,dicTime[k]) - arrTimeContent.append(tupla) - - #create corpus - myCorpus = [] - for elem in arrTimeContent: - allMsgsContent = "" - for textMsg in elem[1]: - msgContent = re.sub('[^0-9\'.a-zA-Z]+', ' ', textMsg) - allMsgsContent = allMsgsContent + " " + msgContent + " " - myCorpus.append(unicode(allMsgsContent, "utf-8",errors='ignore')) - - #tf-idf vars - vect = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = stopset) - tfidf_matrix = vect.fit_transform(myCorpus) + dic_time[step_time] = [elem[1]] + + # convert to arr to keep indexes + arr_time_content = [] + for k in dic_time.keys(): + tupla = (k, dic_time[k]) + arr_time_content.append(tupla) + + # create corpus + my_corpus = [] + for elem in arr_time_content: + all_msgs_content = "" + for text_msg in elem[1]: + msg_content = re.sub('[^0-9\'.a-zA-Z]+', ' ', text_msg) + all_msgs_content = all_msgs_content + " " + msg_content + " " + my_corpus.append(u'{}'.format(all_msgs_content)) + + # tf-idf vars + vect = TfidfVectorizer(analyzer='word', ngram_range=( + 1, 3), min_df=0, stop_words=stopset) + tfidf_matrix = vect.fit_transform(my_corpus) 
feature_names = vect.get_feature_names() dense = tfidf_matrix.todense() - - #build the final arr of Timeseries - globalArrTime = [] - i=0 - for timeItem in dense : + + # build the final arr of Timeseries + global_arr_time = [] + i = 0 + for timeItem in dense: email = timeItem.tolist()[0] - phrase_scores = [pair for pair in zip(range(0, len(email)), email) if pair[1] > 0] + phrase_scores = [pair for pair in zip( + range(0, len(email)), email) if pair[1] > 0] sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) - - dicTime = {} - dicTime['name'] = str(arrTimeContent[i][0]) - dicTime['words'] = [] - - #if dicTime['name'] == "2001-05": - # for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:100]: - # print phrase+" - "+str(score) - - for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:wordsNum * 10]: - data = {'name':phrase,'score':score} - dicTime['words'].append(data) - #print dicTime - - globalArrTime.append(dicTime) + + dic_time = dict() + dic_time['name'] = str(arr_time_content[i][0]) + dic_time['words'] = [] + + for phrase, score in [(feature_names[word_id], score) + for (word_id, score) in sorted_phrase_scores][:words_num * 10]: + data = {'name': phrase, 'score': score} + dic_time['words'].append(data) + # print dicTime + + global_arr_time.append(dic_time) i = i + 1 - - finalArr = globalArrTime - myVoc = [] - #get Best Words Num - finalArr = [] - for dt in globalArrTime: - - dicTime = {} - dicTime['name'] = dt['name'] - dicTime['words'] = [] - wordsXt = wordsNum + + # get Best Words Num + my_voc = [] + final_arr = [] + for dt in global_arr_time: + dic_time = dict() + dic_time['name'] = dt['name'] + dic_time['words'] = [] + words_xt = words_num for wScore in dt['words']: - if checkGramTime(wScore,dt['words']): - dicTime['words'].append(wScore) - myVoc.append(wScore['name']) - wordsXt = wordsXt - 1 - if wordsXt == 0: + if check_gram_time(wScore, 
dt['words']): + dic_time['words'].append(wScore) + my_voc.append(wScore['name']) + words_xt = words_xt - 1 + if words_xt == 0: break - finalArr.append(dicTime) - - globalDic = {} - globalDic['wordsXtime'] = finalArr - globalDic['termsCount'] = termsCountMatrix(dataset,datasetUrls,myVoc) - return globalDic - + final_arr.append(dic_time) + + global_dic = dict() + global_dic['wordsXtime'] = final_arr + global_dic['termsCount'] = terms_count_matrix(dataset, dataset_urls, my_voc) + return global_dic + -def wordClustering(dataset,conceptNum,wordsNum,datasetUrls): +def word_clustering(dataset, conceptNum, wordsNum, datasetUrls): - stopset = setStopWords(dataset) - - dicEmails = getAllArchiveText(datasetUrls) + stop_set = set_stop_words(dataset) - #create array of all emails retrieved + dic_emails = getAllArchiveText(datasetUrls) + + # create array of all emails retrieved arrEmails = [] - for k in dicEmails.keys(): - tupla = (k,dicEmails[k]) + for k in dic_emails.keys(): + tupla = (k, dic_emails[k]) arrEmails.append(tupla) - - myCorpus = [] + + my_corpus = [] for elem in arrEmails: - filteredText = re.sub('[^0-9\'.a-zA-Z]+', ' ', elem[1]) - myCorpus.append(unicode(filteredText, "utf-8",errors='ignore')) - - vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,3), min_df = 0, stop_words = stopset) - X = vectorizer.fit_transform(myCorpus) + filtered_text = re.sub('[^0-9\'.a-zA-Z]+', ' ', elem[1]) + my_corpus.append(u'{}'.format(filtered_text)) + + vectorizer = TfidfVectorizer(use_idf=True, ngram_range=( + 1, 3), min_df=0, stop_words=stop_set) + X = vectorizer.fit_transform(my_corpus) - # Truncate the matrix to a number of components(concepts) lsa = TruncatedSVD(n_components=conceptNum, n_iter=100) lsa.fit(X) - - allData = [] + + all_data = list() terms = vectorizer.get_feature_names() for i, comp in enumerate(lsa.components_): - #merge 2 lists together terms and comp - termsInComp = zip (terms,comp) - sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) 
[:wordsNum*3] - - cluster = {} - #cluster['name'] = "Concept %d" % i - cluster['name'] = str(sortedTerms[0][0]) + # merge 2 lists together terms and comp + terms_in_comp = zip(terms, comp) + sorted_terms = sorted(terms_in_comp, key=lambda x: x[1], reverse=True)[:wordsNum*3] + + cluster = dict() + cluster['name'] = str(sorted_terms[0][0]) cluster['id'] = str(i) - clusterData =[] - #print "Concept %d:" % i - - wordsXclus = wordsNum - for term in sortedTerms: - data = {} + cluster_data = [] + + words_xclus = wordsNum + for term in sorted_terms: + data = dict() data['name'] = term[0] data['score'] = term[1] - if checkGram(data,sortedTerms): - clusterData.append(data) - wordsXclus = wordsXclus - 1 - if wordsXclus == 0: + if check_gram(data, sorted_terms): + cluster_data.append(data) + words_xclus = words_xclus - 1 + if words_xclus == 0: break - #clusterData.append(data) - #print term - cluster['children'] = clusterData - allData.append(cluster) - - arrCosSim = [] - for i in range(0,len(arrEmails)): - cosine_similarities = linear_kernel(X[i:i+1], lsa.components_).flatten() - - objDic = {} - objDic['subject'] = arrEmails[i][0][0] - objDic['time'] = arrEmails[i][0][1] - objDic['cosSim'] = [] - for i in range(0,len(cosine_similarities)-1): - objCosScore = {} - objCosScore['cluster'] = i - objCosScore['score'] = cosine_similarities[i] - objDic['cosSim'].append(objCosScore) - - arrCosSim.append(objDic) - - returnedDic = {} - returnedDic['name'] = 'clusters' - returnedDic['children'] = allData - returnedDic['emailsClusters'] = arrCosSim - - return returnedDic - -def checkGram(data,sortedTerms): - for elem in sortedTerms: + cluster['children'] = cluster_data + all_data.append(cluster) + + arr_cos_sim = [] + for i in range(0, len(arrEmails)): + cosine_similarities = linear_kernel( + X[i:i+1], lsa.components_).flatten() + + obj_dic = {} + obj_dic['subject'] = arrEmails[i][0][0] + obj_dic['time'] = arrEmails[i][0][1] + obj_dic['cosSim'] = [] + for i in range(0, 
len(cosine_similarities)-1): + obj_cos_score = {} + obj_cos_score['cluster'] = i + obj_cos_score['score'] = cosine_similarities[i] + obj_dic['cosSim'].append(obj_cos_score) + + arr_cos_sim.append(obj_dic) + + returned_dic = dict() + returned_dic['name'] = 'clusters' + returned_dic['children'] = all_data + returned_dic['emailsClusters'] = arr_cos_sim + + return returned_dic + + +def check_gram(data, sorted_terms): + for elem in sorted_terms: if data['name'] in elem[0]: if len(elem[0]) > len(data['name']): if data['score'] == elem[1]: return False return True -def checkGramTime(data,sortedTerms): + +def check_gram_time(data, sortedTerms): for elem in sortedTerms: if data['name'] in elem['name']: if len(elem['name']) > len(data['name']): @@ -352,6 +339,7 @@ def checkGramTime(data,sortedTerms): return False return True + def iequal(a, b): try: return a.upper() == b.upper() @@ -359,67 +347,45 @@ def iequal(a, b): return a == b -#When this file is called +# When this file is called cmdargs = str(sys.argv) functionname = str(sys.argv[1]) -#datasetUrl = 'server/data/mbox/enron/enron-smith-m.mbox' +# datasetUrl = 'server/data/mbox/enron/enron-smith-m.mbox' -if(functionname == "bestwords"): +if functionname == "bestwords": mboxUrls = re.split('', sys.argv[2]) datasetName = str(sys.argv[3]) numWords = int(sys.argv[4]) allText = sys.argv[5] - - globalDic = bestwords(allText,numWords,datasetName) - - print json.dumps(globalDic, ensure_ascii=False) - -elif(functionname == "wordsXtime"): + global_dict = best_words(allText, numWords, datasetName) + print(json.dumps(global_dict, ensure_ascii=False)) +elif functionname == "wordsXtime": mboxUrls = re.split('NEWURL', sys.argv[2]) datasetName = str(sys.argv[3]) wordsNum = int(sys.argv[4]) - - globalDic = {} - globalDic['name'] = 'wordsXtime' - #globalDic['columns'] = wordsXtime(datasetName,wordsNum,mboxUrls) - res = wordsXtime(datasetName,wordsNum,mboxUrls) - - print json.dumps(res, ensure_ascii=False) - -elif(functionname == 
"clusterwords"): + global_dict = dict() + global_dict['name'] = 'wordsXtime' + res = words_per_time(datasetName, wordsNum, mboxUrls) + print(json.dumps(res, ensure_ascii=False)) +elif functionname == "clusterwords": mboxUrls = re.split('NEWURL', sys.argv[2]) datasetName = str(sys.argv[3]) conceptNum = int(sys.argv[4]) wordsNum = int(sys.argv[5]) + global_dict = word_clustering(datasetName, conceptNum, wordsNum, mboxUrls) + print(json.dumps(global_dict, ensure_ascii=False)) - globalDic = wordClustering(datasetName,conceptNum,wordsNum,mboxUrls) - - print json.dumps(globalDic, ensure_ascii=False) - -elif(functionname == "getStopWords"): - +elif functionname == "getStopWords": datasetName = str(sys.argv[2]) - - arrSW = [] - for sw in setStopWords(datasetName): - arrSW.append(sw) - - globalDic = {} - globalDic['stopWords'] = arrSW - - print json.dumps(globalDic, ensure_ascii=False) - - -''' -elif(functionname == "termsCountMatrix"): - mboxUrls = re.split('', sys.argv[2]) - datasetName = str(sys.argv[3]) + arr_sw = [] + for sw in set_stop_words(datasetName): + arr_sw.append(sw) + global_dict = dict() + global_dict['stopWords'] = arr_sw + + print(json.dumps(global_dict, ensure_ascii=False)) - globalDic = {} - globalDic['termsCountMatrix'] = termsCountMatrix(datasetName,mboxUrls) - print json.dumps(globalDic, ensure_ascii=False) -''' -#exampleCall: python2.7 server/nlp-proc.py clusterwords server/data/mbox/enron/enron-smith-m.mbox enron 10 10 -#returns : {name:'clusters',children:[{'name':clusterX,'children':childrenX}], -# 'emailsClusters':[{'subject':subjX,'time':timeX,'cosSim':[{}]}]} \ No newline at end of file +# exampleCall: python2.7 server/nlp-proc.py clusterwords server/data/mbox/enron/enron-smith-m.mbox enron 10 10 +# returns : {name:'clusters',children:[{'name':clusterX,'children':childrenX}], +# 'emailsClusters':[{'subject':subjX,'time':timeX,'cosSim':[{}]}]}