    1. Selenium
    2. PyAutoGUI
    3. BeautifulSoup
    The browser used here is Google Chrome, automated with Selenium via ChromeDriver. Running this code opens a command-prompt window from ChromeDriver and launches the Chrome browser. It loads WhatsApp Web by default and shows a QR code; scan it with your phone, and keep the phone logged in until the entire process is completed. This project is intended only for gathering knowledge and not to harm others.
     1. Launch Chrome
     2. Connect to WhatsApp Web
     3. Retrieve the contacts synced with WhatsApp
     4. Retrieve the messages of each contact in sorted order
     5. Retrieve the Time, Type, Content and Date of each message
     6. Time is in the structured format HH:MM
     7. Type is either Sent or Received
     8. Content is unstructured text
     9. Date is a day name or a date in DD/MM/YYYY format
     10. Statuses, images, audio, videos, emojis, etc. are excluded
     11. Only text content is extracted
     This code retrieves on average about 100 chats per contact; this can be increased or decreased either by tweaking the code (see the sketch just below) or by using a faster internet connection. An internet speed of around 100 kbps is needed to execute the code without flaws.
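     A minimal illustration of that tuning point, assuming the scroll-back loop inside findmsg() (shown later) is what limits how much history is loaded per chat:
# Assumed tuning knob, adapted from the scroll-back loop in findmsg() below:
# pressing the up arrow more times loads more history before the page is parsed.
import pyautogui
from time import sleep

SCROLL_BACK = 80      # the original script presses 'up' 40 times, giving roughly 100 messages per chat
for _ in range(SCROLL_BACK):
    sleep(1)
    pyautogui.press('up')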
     The scraped data is stored in chatlist, which can be saved in CSV format; for the purpose of this analysis it has been left as it is. Each entry of chatlist is a semicolon-separated (CSV2-style) record, so it can simply be written to a file and viewed in Excel or any other spreadsheet application, as sketched after the next paragraph.
     The output contains the time elapsed for each step along with a description, as well as the number of messages that have been sent and received.
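     A minimal sketch of that export step (not part of the original script); it assumes chatlist already holds the semicolon-joined records built at the end of the run:
# Hypothetical helper: write the semicolon-separated records collected in
# `chatlist` to a file that Excel and other spreadsheet tools can open.
def save_chatlist(chatlist, path="chatlist.csv"):
    with open(path, "w", encoding="utf-8") as f:
        f.write("Contact;Text;Type;Time;Date\n")   # header row matching the record order used later
        for record in chatlist:
            f.write(record + "\n")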
##Created on 06-Dec-2019
##@author: Sudhan
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import sys
import pyautogui
import datetime
import re
from time import *
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)  # map non-BMP characters (emoji etc.) to the replacement character
st = time()  # script start time, used for the elapsed-time stamps
def eta(seconds):
    # elapsed time since the script started, formatted as H:MM:SS
    sec = seconds - st
    return "ETA: " + str(datetime.timedelta(seconds=sec))
def valid_date(datestring):
    # True if the string is a real calendar date written as DD/MM/YYYY (or with . or - separators)
    try:
        mat = re.match(r'(\d{2})[/.-](\d{2})[/.-](\d{4})$', datestring)
        if mat is not None:
            datetime.datetime(*(map(int, mat.groups()[-1::-1])))
            return True
    except ValueError:
        pass
    return False
def scroll():
    # scroll the chat list down to load more contacts
    for i in range(8):
        pyautogui.press('down')
        sleep(1)
def findmsg(name):
    # open the currently selected chat and scrape its message history into chats
    sleep(2)
    partial = 0
    try:
        driver.find_element_by_class_name('_1ays2').click()
    except:
        return None
    sleep(2)
    # scroll back through the conversation to load older messages
    for i in range(40):
        sleep(1)
        pyautogui.press('up')
    msg = [name]
    prev = 'Unknown'
    sleep(8)
    htmlcode = (driver.page_source).encode('utf-8')
    soup = BeautifulSoup(htmlcode, features="html.parser")
    cnt = 0
    for tag in soup.find_all('span'):
        classid = tag.get('class')
        if classid == ['_F7Vk', 'selectable-text', 'invisible-space', 'copyable-text']:
            # message text: start a new record [text]
            msg.append([tag.text.translate(non_bmp_map).replace('\n', '')])
        if classid == ['_3fnHB']:
            # message time: appended once the type code is already in place
            try:
                if msg[-1][-1] in [1, 2]:
                    msg[-1].append(tag.text)
            except:
                partial = 1
        if classid in [['EopGb', '_3HIqo'], ['EopGb']]:
            # message type code: length of the class list (1 = received, 2 = sent)
            try:
                msg[-1].append(len(classid))
            except:
                partial = 1
        if classid == ['_F7Vk']:
            # date separator: a weekday name, TODAY/YESTERDAY or a DD/MM/YYYY date
            try:
                if tag.text in ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY', 'TODAY', 'YESTERDAY'] or valid_date(tag.text):
                    msg[-1].append(tag.text)
            except:
                if tag.text in ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY', 'TODAY', 'YESTERDAY'] or valid_date(tag.text):
                    prev = tag.text
                partial = 1
    for i in msg[1:]:
        del i[4:]  # keep at most text, type, time and date per record
    for i in msg[1:]:
        # records missing a date inherit the most recent date separator seen
        if len(i) == 3:
            i.append(prev)
        else:
            prev = i[-1]
    chats.append(msg)
driver = webdriver.Chrome()                 # launch Chrome through ChromeDriver
driver.get("https://web.whatsapp.com/")
sleep(13)                                   # allow time for the page to load and the QR code to be scanned
chats=[]                                    # every scraped conversation is collected here
print("Chrome has been automated",eta(time()))
## Chrome has been automated ETA: 0:00:21.044834
elem = driver.find_elements_by_class_name('_3j8Pd')
elem[1].click()
sleep(10)
print("Web Whatsapp Authetication success",eta(time()))
## Web Whatsapp Authetication success ETA: 0:00:32.019598
mycon=set()
while True:
    # keep scrolling the chat list until no new contact names appear
    scroll()
    contacts = driver.find_elements_by_css_selector('._3NWy8 span')
    newcon=set([j.text for j in contacts])
    if len(newcon|mycon)==len(mycon):
        break
    else:
        mycon=newcon|mycon
contact=sorted(list(mycon),key=str.casefold)
rotate=dict()
print(len(contact),"contacts has been retrieved",eta(time()))
## 108 contacts has been retrieved ETA: 0:02:50.282969
match=dict()
for i in contact:
    match[i.lower()]=i
contacts=[i.lower() for i in contact]
for i in range(len(contacts)):
    # count how many earlier contacts contain this name as a substring;
    # used later to pick the right entry in the search results
    cnt=0
    for j in range(i):
        if contacts[i] in contacts[j]:
            cnt+=1
    rotate[match[contacts[i]]]=cnt
driver.find_element_by_class_name('qfKkX').click()
sleep(2)
scrap=0
for i in contact:
    # open the search box, type the contact name and select the matching chat
    driver.find_elements_by_class_name('_3j8Pd')[1].click()
    sleep(2)
    driver.find_element_by_class_name('_1XCAr').click()
    pyautogui.typewrite(i)
    sleep(2)
    for j in range(rotate[i]+1):
        pyautogui.press('down')
    pyautogui.press('enter')
    findmsg(i)
    try:
        driver.find_element_by_class_name('_1XCAr').click()
    except:
        pyautogui.press('esc')
        driver.find_element_by_class_name('_1XCAr').click()
    scrap+=1
    # progress log: one "<name> Success <elapsed time>" entry per contact
    if scrap==1:
        print('['+i,"Success",eta(time()),end='')
    elif scrap==len(contact):
        print(','+i,"Success"+eta(time())+']')
    else:
        print(','+i,"Success",eta(time()),end='')
## [abhisak Success ETA: 0:03:53.117132,Ajay Success ETA: 0:04:53.980288,Ak Success ETA: 0:05:55.073330,Akka Success ETA: 0:06:55.873801,Anu Success ETA: 0:07:56.614228,Aravind Success ETA: 0:08:57.341624,Ashish Success ETA: 0:09:58.305027,Ashit Success ETA: 0:10:59.042801,Ashwathy Mam Success ETA: 0:11:59.745627,Asis Success ETA: 0:13:00.634333,Bala Success ETA: 0:14:01.523082,balaji Success ETA: 0:15:02.410045,blesson cse a Success ETA: 0:16:03.316334,Buvan Eie Success ETA: 0:17:04.254713,DANCES Success ETA: 0:18:05.123447,Daril Success ETA: 0:19:06.012867,Deepak Success ETA: 0:20:06.972385,Deepak A Success ETA: 0:21:07.770144,Dhamu Success ETA: 0:22:08.510272,Dharma Eie Success ETA: 0:23:09.384773,Dhinesh EEE Success ETA: 0:24:10.301919,Dhruvil Success ETA: 0:25:11.189713,Dinesh Success ETA: 0:26:12.061433,divya eie Success ETA: 0:27:13.002720,Dr. Arun Success ETA: 0:28:13.731812,Fiancy Success ETA: 0:29:14.460051,Friend Success ETA: 0:30:15.330926,friends Success ETA: 0:31:16.234849,Ganesh Success ETA: 0:32:17.124591,Ganesh Jio Success ETA: 0:33:17.995259,Gautam Success ETA: 0:34:18.866185,gokul Success ETA: 0:35:21.494336,Gowtham Bro Success ETA: 0:36:22.176876,hari cse Success ETA: 0:37:24.563699,hari krishna raju Success ETA: 0:38:25.266571,hari krishnan Success ETA: 0:39:25.939407,Harshit Success ETA: 0:40:26.713989,HOD Success ETA: 0:41:27.468038,Home Success ETA: 0:42:28.235691,jeeva cse. A Success ETA: 0:43:28.941026,jerin Success ETA: 0:44:29.663998,john sch Success ETA: 0:45:30.350604,Jothi Bro Success ETA: 0:46:31.105386,Kaif Success ETA: 0:47:31.776873,Kanimuthu Sir Success ETA: 0:48:32.506942,Karpagavalli Success ETA: 0:49:33.220556,Karthik Raja Kce Success ETA: 0:50:34.024288,Kce-Anudeep Success ETA: 0:51:34.845723,kohila mam Success ETA: 0:52:35.583319,Kowshi Success ETA: 0:53:36.338328,Kv2 Success ETA: 0:54:37.159393,Maari Success ETA: 0:55:37.882022,maari 2 Success ETA: 0:56:38.618111,Maari Eie Success ETA: 0:57:39.386653,Maddy Success ETA: 0:58:40.311377,Mahesh Jio Success ETA: 0:59:41.295440,mani kandan Success ETA: 1:00:42.029549,Mano Success ETA: 1:01:42.759834,Mapla Success ETA: 1:02:43.647313,Megala Mam Success ETA: 1:03:44.543679,mohana mam Success ETA: 1:04:45.290926,naveen Success ETA: 1:05:46.161672,Naveenhack Success ETA: 1:06:47.049558,Nirmal A Success ETA: 1:07:47.972131,Nishanth Anna Success ETA: 1:08:48.859460,Nithish Success ETA: 1:09:49.884680,Nithish Iit Success ETA: 1:10:50.853637,Pavithra Success ETA: 1:11:51.823512,Pavithran Success ETA: 1:12:52.782465,Pavithran A Success ETA: 1:13:53.699356,periyappa home Success ETA: 1:14:54.587946,pluto Success ETA: 1:15:57.239939,Pradeep Success ETA: 1:16:57.980375,prakash Success ETA: 1:17:58.868630,Pranav Success ETA: 1:19:01.547460,Pranesh Success ETA: 1:20:02.293422,Prasanna Success ETA: 1:21:03.190193,prasanna pay Success ETA: 1:22:03.921945,prem Success ETA: 1:23:04.808216,Ragul Success ETA: 1:24:05.699165,ragul b Success ETA: 1:25:06.567277,Ramamani Tutor Mam Success ETA: 1:26:07.455086,ramesh Success ETA: 1:27:08.460970,Ranjith Success ETA: 1:28:09.385641,Rasi Success ETA: 1:29:10.104570,Richard Success ETA: 1:30:10.974034,Rogith S Success ETA: 1:31:11.862360,Ruby Miss Success ETA: 1:32:12.847662,Sabareesan Akka Success ETA: 1:33:13.554148,Salman Farsi Success ETA: 1:34:14.443640,Sami Success ETA: 1:35:15.314432,samir ece Success ETA: 1:36:16.302520,santhonsh ece Success ETA: 1:37:17.189750,saravana A Success ETA: 1:38:18.078649,Saravana EIE 2 Success ETA: 1:39:18.983479,Sathish Success ETA: 
1:40:19.921243,sister Success ETA: 1:41:20.894183,Sri Hari Kce Success ETA: 1:42:21.864219,Surya Success ETA: 1:43:22.751519,Thangachi Success ETA: 1:44:23.483157,Thirudan Success ETA: 1:45:24.211542,Velu Success ETA: 1:46:24.898911,Vicky Success ETA: 1:47:25.588525,vickyyy Success ETA: 1:48:26.348264,vignesh mech Success ETA: 1:49:27.029506,Vimal Success ETA: 1:50:27.718444,Xerox Success ETA: 1:51:28.389449,Yogesh SuccessETA: 1:52:29.083553]
print("Messages has been successfully retrieved",eta(time()))
## Messages has been successfully retrieved ETA: 1:52:29.144389
chatlist=[]
sen=0;rec=0
for i in chats:
    details=i[0].split('\n')[0]   # contact name
    for j in range(1,len(i)):
        if i[j][1]==1:
            type='Recieved'
            rec+=1
        else:
            type='Sent'
            sen+=1
        # one semicolon-separated record per message: Contact;Text;Type;Time;Date
        chatlist.append(';'.join([details+' '+':',str(i[j][0]),type,str(i[j][2]),str(i[j][3])]))
print("No. of Messages Sent :",sen,eta(time()));
## No. of Messages Sent : 557 ETA: 1:52:29.300971
print("No. of Messages Recieved:",rec,eta(time()))
## No. of Messages Recieved: 369 ETA: 1:52:29.312937
print("The messages have been loaded into chat",eta(time()))
## The messages have been loaded into chat ETA: 1:52:29.323910
  Load the required libraries.
  The libraries that are used in this project are
     1. tidytext
     2. dplyr
     3. ggplot2
     4. viridis
     5. tidyr
     6. fmsb
     7. caret
     8. caTools
     9. wordcloud
     10. wordcloud2
     11. RColorBrewer
    Converting all factor columns to character type and viewing the structure of the dataset.
library(tidytext)
library(dplyr)
library(ggplot2)
library(viridis)
library(tidyr)
library(fmsb)
library(caret)
library(caTools)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
dataset=as.data.frame(py$chatlist,stringsAsFactors=FALSE)
dataset <- data.frame(do.call('rbind', strsplit(as.character(dataset$`py$chatlist`),';',fixed=TRUE)))
dataset <- dataset%>%mutate_all(as.character)
str(dataset)
## 'data.frame': 926 obs. of 5 variables:
## $ X1: chr "Ak :" "Ak :" "Ak :" "Ak :" ...
## $ X2: chr "neenga than na div 1 lam poi mass katuveenga" "apo nan 2yrs ah competitive coding practice pannitu irunden ipo nee 2yrs ah competitive coding practice panni L"| __truncated__ "amana ...." "ana neenga eppavume periya aalu nan namburena" ...
## $ X3: chr "Sent" "Recieved" "Sent" "Sent" ...
## $ X4: chr "19:26" "20:40" "21:43" "21:44" ...
## $ X5: chr "16/12/2019" "16/12/2019" "16/12/2019" "16/12/2019" ...
    The column names are changed to Contact, Text, Type, Time and Date.
colnames(dataset)=c('Contact','Text','Type','Time','Date')
names(dataset)
## [1] "Contact" "Text" "Type" "Time" "Date"
    The Date column contains some misplaced values, so those rows are removed. The Date column should contain only MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY, SUNDAY, TODAY or YESTERDAY, or a date expressed in DD/MM/YYYY format.
dataset=dataset[dataset$Date %in% c('MONDAY', 'TUESDAY', 'WEDNESDAY',
                                    'THURSDAY', 'FRIDAY', 'SATURDAY',
                                    'SUNDAY', 'TODAY', 'YESTERDAY') |
                  !is.na(as.Date(as.character(dataset$Date),format="%d/%m/%Y")) ,]
dataset[,1]=gsub(' :','',dataset[,1])
dataset[c(1:10),]
## Contact
## 1 Ak
## 2 Ak
## 3 Ak
## 4 Ak
## 5 Ak
## 6 Ak
## 7 Ak
## 8 Ak
## 9 Ak
## 10 Ak
## Text
## 1 neenga than na div 1 lam poi mass katuveenga
## 2 apo nan 2yrs ah competitive coding practice pannitu irunden ipo nee 2yrs ah competitive coding practice panni Level 5 pa iruka la
## 3 amana ....
## 4 ana neenga eppavume periya aalu nan namburena
## 5 unga skill enna theriyuma
## 6 nan observe pannathula
## 7 u are very good in math
## 8 python ungaluku periya support
## 9 apdi la illa da namba evlo work panrom apdingrathula dan iruku
## 10 experience matters a lot
## Type Time Date
## 1 Sent 19:26 16/12/2019
## 2 Recieved 20:40 16/12/2019
## 3 Sent 21:43 16/12/2019
## 4 Sent 21:44 16/12/2019
## 5 Sent 21:45 16/12/2019
## 6 Sent 21:45 16/12/2019
## 7 Sent 21:45 16/12/2019
## 8 Sent 21:45 16/12/2019
## 9 Recieved 21:45 16/12/2019
## 10 Recieved 21:46 16/12/2019
    Each row is given a line number because the text is about to be split into words; the line number lets us trace every word back to the message it came from.
data=dataset%>%mutate(Line=row_number())
data[c(1:10),]
## Contact
## 1 Ak
## 2 Ak
## 3 Ak
## 4 Ak
## 5 Ak
## 6 Ak
## 7 Ak
## 8 Ak
## 9 Ak
## 10 Ak
## Text
## 1 neenga than na div 1 lam poi mass katuveenga
## 2 apo nan 2yrs ah competitive coding practice pannitu irunden ipo nee 2yrs ah competitive coding practice panni Level 5 pa iruka la
## 3 amana ....
## 4 ana neenga eppavume periya aalu nan namburena
## 5 unga skill enna theriyuma
## 6 nan observe pannathula
## 7 u are very good in math
## 8 python ungaluku periya support
## 9 apdi la illa da namba evlo work panrom apdingrathula dan iruku
## 10 experience matters a lot
## Type Time Date Line
## 1 Sent 19:26 16/12/2019 1
## 2 Recieved 20:40 16/12/2019 2
## 3 Sent 21:43 16/12/2019 3
## 4 Sent 21:44 16/12/2019 4
## 5 Sent 21:45 16/12/2019 5
## 6 Sent 21:45 16/12/2019 6
## 7 Sent 21:45 16/12/2019 7
## 8 Sent 21:45 16/12/2019 8
## 9 Recieved 21:45 16/12/2019 9
## 10 Recieved 21:46 16/12/2019 10
    The Text column of the chats is split into individual words, each keeping the line number of the message it belongs to.
data=data%>%unnest_tokens(word,Text)
data[c(1:10),]
## Contact Type Time Date Line word
## 1 Ak Sent 19:26 16/12/2019 1 neenga
## 1.1 Ak Sent 19:26 16/12/2019 1 than
## 1.2 Ak Sent 19:26 16/12/2019 1 na
## 1.3 Ak Sent 19:26 16/12/2019 1 div
## 1.4 Ak Sent 19:26 16/12/2019 1 1
## 1.5 Ak Sent 19:26 16/12/2019 1 lam
## 1.6 Ak Sent 19:26 16/12/2019 1 poi
## 1.7 Ak Sent 19:26 16/12/2019 1 mass
## 1.8 Ak Sent 19:26 16/12/2019 1 katuveenga
## 2 Ak Recieved 20:40 16/12/2019 2 apo
    Joining the words with their sentiments. The sentiments are taken from the NRC lexicon, which contains 13,891 words along with their associated sentiments.
textsentiment=data %>%
  inner_join(get_sentiments("nrc"))
textsentiment[c(1:10),]
## Contact Type Time Date Line word sentiment
## 1 Ak Recieved 20:40 16/12/2019 2 practice positive
## 2 Ak Recieved 20:40 16/12/2019 2 practice positive
## 3 Ak Recieved 20:40 16/12/2019 2 level positive
## 4 Ak Recieved 20:40 16/12/2019 2 level trust
## 5 Ak Sent 21:45 16/12/2019 7 good anticipation
## 6 Ak Sent 21:45 16/12/2019 7 good joy
## 7 Ak Sent 21:45 16/12/2019 7 good positive
## 8 Ak Sent 21:45 16/12/2019 7 good surprise
## 9 Ak Sent 21:45 16/12/2019 7 good trust
## 10 Ak Sent 21:47 16/12/2019 16 aye positive
    Drawing a bar plot over dates to show the number of messages on each day.
dataset$Date=as.factor(dataset$Date)
ggplot(data=dataset,aes(x=Date,fill=Date))+geom_bar()+coord_flip()+labs(x="Dates",y="No. of messages",title = "Bar plot over Dates")
    Plotting a line graph for the current week with the count of messages on each day.
new=data%>%filter(Date %in% c('MONDAY', 'TUESDAY', 'WEDNESDAY',
                              'THURSDAY', 'FRIDAY', 'SATURDAY',
                              'SUNDAY', 'TODAY', 'YESTERDAY'))%>%count(Date,Type)
ggplot(new,aes(x=Date,y=n,color=Type,group=Type))+geom_point()+geom_line()
    Plotting a line graph for the current month with the count of messages on each date of the month.
myDate = as.POSIXct(Sys.Date())
month=format(myDate,"%m")
new=data%>%filter(!is.na(as.Date(as.character(data$Date),format="%d/%m/%Y")))%>%count(Date,Type)
new=new%>%filter(month==format(as.Date(Date),"%m"))
new=new%>%mutate(Date=substr(Date, start = 1, stop = 2))
ggplot(new,aes(x=Date,y=n,color=Type,group=Type))+geom_point()+geom_line()
    Plotting a pie chart of my sentiments in increasing order.
sentimentcount=textsentiment%>%
  group_by(sentiment)%>%count()%>%arrange(desc(n))
ggplot(sentimentcount, aes(x=reorder(sentiment,n), y=n,fill=sentiment)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0)+theme_bw()+labs(x='Sorted Sentiments',y='Usage',title = "Sentiments Contribution")
    Plotting the top 20 chats of each Type (Sent and Received) in decreasing order.
msgtype=data%>%group_by(Contact)%>%count(Type)
msgtype=data.frame(msgtype)
colnames(msgtype)=c('Contact','Type','Count')
top20=msgtype%>%group_by(Type)%>%top_n(n=20,wt=Count)%>%ungroup()
# scale_x_reordered() strips the suffix that tidytext::reorder_within() appends to the axis labels
scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
ggplot(top20,aes(x=reorder_within(Contact,Count,Type),y=Count,fill=Contact))+scale_x_reordered()+
  facet_wrap(~Type,scales = "free")+geom_bar(stat = "identity")+coord_flip()+
  labs(x="Contacts",y="Messages Count",title="Top 20 Chats Sent & Received")
    Picking the top 5 contacts in the sent-messages category and finding their top 5 words by usage frequency.
top5sent=dataset[dataset$Type=="Sent",]%>%count(Contact)%>%top_n(5)%>%head(5)
data[data$Contact %in% top5sent$Contact & data$Type=="Sent",]%>%
  count(Contact, word, sort = TRUE) %>%
  group_by(Contact) %>%
  top_n(5) %>%
  ggplot(aes(x = reorder_within(word, n, Contact), y = n, fill = Contact)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~Contact, ncol = 2, scales = "free_y") +
  scale_x_reordered() +
  labs(title = "Most frequently Sent words",x="Words",y="Count")
    Picking the top 5 contacts in the received-messages category and finding their most frequently used words.
top5Recieved=dataset[dataset$Type=="Recieved",]%>%count(Contact)%>%top_n(5)%>%head(5)
data[data$Contact %in% top5Recieved$Contact & data$Type=="Recieved",]%>%
  count(Contact, word, sort = TRUE) %>%
  group_by(Contact) %>%
  top_n(4) %>%
  ggplot(aes(x = reorder_within(word, n, Contact), y = n, fill = Contact)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~Contact, ncol = 2, scales = "free_y") +
  scale_x_reordered() +
  labs(title="Most frequently Received words",x="Words",y="Count")
    Finding the sentiments associated with my words at different times of day: Morning, Afternoon, Evening, Night and Midnight.
times=textsentiment%>%separate(Time,c("Hour","Minute"), sep = ":")%>%unite(Time,Hour, Minute,sep = ".")
times$Time=as.double(unlist(times$Time))
times=times %>% mutate(group = case_when(
  Time > 6 & Time <= 12 ~ 'Morning',
  Time > 12 & Time <= 16 ~ 'Afternoon',
  Time > 16 & Time <= 21 ~ 'Evening',
  Time > 21 & Time <= 24 ~ 'Night',
  Time > 0 & Time <= 6 ~ 'Mid Night'))
times%>%count(sentiment,group)%>%group_by(group)%>%ungroup()%>%
  ggplot(aes(x=reorder(sentiment,n),y=n,fill=group))+geom_bar(stat="identity")+facet_wrap(~group,ncol=2,scales="free")+coord_flip()+labs(x="Sentiments",y="Count",title = "Sentiments Contribution Over Time")
    Plotting a radar chart of my first 5 contacts and their contribution to sentiments such as joy, anticipation, trust, positive and surprise.
radar=as.data.frame.matrix(head(table(textsentiment$Contact,textsentiment$sentiment),5)[,-c(1,3,4,6,8)])
radar <- rbind(rep(30,5) , rep(0,5) , radar)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.4), rgb(0.8,0.2,0.5,0.4) , rgb(0.7,0.5,0.1,0.4) )
radarchart( radar, axistype=1 ,
            pcol=colors_border , pfcol=colors_in , plwd=4 , plty=1,
            cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
            title=paste("Radar chart with Sentiments"), vlcex=0.8)
legend(x=1.0, y=1, legend = rownames(radar[-c(1,2),]), bty = "n", pch=20 ,
       col=colors_in , text.col = "black", cex=1.2, pt.cex=3)
    Trying to learn something about the data.
learndata=dataset
    Let's try to predict the Type of each message, i.e. whether it was sent or received.
    Shuffle the data, since it is arranged contact-wise, in order to get better analysis results.
set.seed(7356)
rows <- sample(nrow(learndata))
learndata=learndata[rows,]
learndata[c(1:10),]
## Contact
## 377 Harshit
## 749 ramesh
## 715 Pranesh
## 838 Ruby Miss
## 334 Gowtham Bro
## 758 Ranjith
## 462 jerin
## 184 Ashit
## 769 Ranjith
## 431 HOD
## Text
## 377 the words u r frequently using and the emotional containers of the top 10 person
## 749 hii da rameshu
## 715 poi paruda
## 838 good night sweet dreams mam
## 334 nan ivalo nalla irukka avunga than karanam so
## 758 regarding WhatsApp
## 462 Athula on pannikoda theva paduratha
## 184 yes bro
## 769 do u have any unimplemented idea ...
## 431 I am suthan
## Type Time Date
## 377 Sent 17:14 17/12/2019
## 749 Sent 22:10 06/12/2019
## 715 Sent 20:02 YESTERDAY
## 838 Sent 23:08 13/12/2019
## 334 Sent 08:37 08/12/2019
## 758 Sent 10:38 WEDNESDAY
## 462 Sent 16:23 YESTERDAY
## 184 Sent 18:25 16/12/2019
## 769 Sent 10:40 WEDNESDAY
## 431 Sent 19:22 19/12/2019
    Replacing the Type column with 0s and 1s (1 represents the Sent category and 0 the Received category).
learndata$Type <- ifelse(learndata$Type=='Sent', 1, 0)
learndata[c(1:10),]
## Contact
## 377 Harshit
## 749 ramesh
## 715 Pranesh
## 838 Ruby Miss
## 334 Gowtham Bro
## 758 Ranjith
## 462 jerin
## 184 Ashit
## 769 Ranjith
## 431 HOD
## Text
## 377 the words u r frequently using and the emotional containers of the top 10 person
## 749 hii da rameshu
## 715 poi paruda
## 838 good night sweet dreams mam
## 334 nan ivalo nalla irukka avunga than karanam so
## 758 regarding WhatsApp
## 462 Athula on pannikoda theva paduratha
## 184 yes bro
## 769 do u have any unimplemented idea ...
## 431 I am suthan
## Type Time Date
## 377 1 17:14 17/12/2019
## 749 1 22:10 06/12/2019
## 715 1 20:02 YESTERDAY
## 838 1 23:08 13/12/2019
## 334 1 08:37 08/12/2019
## 758 1 10:38 WEDNESDAY
## 462 1 16:23 YESTERDAY
## 184 1 18:25 16/12/2019
## 769 1 10:40 WEDNESDAY
## 431 1 19:22 19/12/2019
    The data is divided into training and testing sets. The training set is used for learning and the testing set for evaluating the fitted model. The training set holds about 80% of the original data and the testing set the remaining 20%.
set.seed(300)
split = sample.split(learndata$Type,
                     SplitRatio = 0.80)
training_set =subset(learndata, split==TRUE)
test_set =subset(learndata, split==FALSE)
training_set[c(1:10),]
## Contact Text Type Time
## 749 ramesh hii da rameshu 1 22:10
## 715 Pranesh poi paruda 1 20:02
## 838 Ruby Miss good night sweet dreams mam 1 23:08
## 334 Gowtham Bro nan ivalo nalla irukka avunga than karanam so 1 08:37
## 758 Ranjith regarding WhatsApp 1 10:38
## 462 jerin Athula on pannikoda theva paduratha 1 16:23
## 184 Ashit yes bro 1 18:25
## 769 Ranjith do u have any unimplemented idea ... 1 10:40
## 443 jerin Off aaguthuda service error nu 0 12:52
## 125 Anu I'lla jee 0 20:14
## Date
## 749 06/12/2019
## 715 YESTERDAY
## 838 13/12/2019
## 334 08/12/2019
## 758 WEDNESDAY
## 462 YESTERDAY
## 184 16/12/2019
## 769 WEDNESDAY
## 443 YESTERDAY
## 125 YESTERDAY
test_set[c(1:10),]
## Contact
## 377 Harshit
## 431 HOD
## 702 pluto
## 366 Harshit
## 925 Yogesh
## 198 Ashit
## 579 Megala Mam
## 806 Rasi
## 522 Mahesh Jio
## 174 Ashish
## Text
## 377 the words u r frequently using and the emotional containers of the top 10 person
## 431 I am suthan
## 702 send English paragraphs da
## 366 Ok bro
## 925 If not received communicate in your group.
## 198 with a and b
## 579 Okk
## 806 summada rasi
## 522 https://www.instagram.com/p/B570YeMhtL0/?igshid=128nfrm2zq82g
## 174 Hmm okay buddy
## Type Time Date
## 377 1 17:14 17/12/2019
## 431 1 19:22 19/12/2019
## 702 1 13:03 MONDAY
## 366 0 17:10 17/12/2019
## 925 1 10:40 16/12/2019
## 198 1 18:27 16/12/2019
## 579 0 10:42 WEDNESDAY
## 806 1 18:23 TUESDAY
## 522 0 16:21 16/12/2019
## 174 0 17:46 16/12/2019
    The classifier is a generalized linear model with the binomial family, using Type as the response and Contact and Time as predictors.
options(max.print=30)
classifier = glm(formula = Type ~ Contact+Time,
                 family = binomial(),
                 data = training_set)
summary(classifier)
##
## Call:
## glm(formula = Type ~ Contact + Time, family = binomial(), data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients: (13 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.086e+15 9.746e+07 -1.115e+07 <2e-16 ***
## ContactAkka -3.417e+15 7.067e+07 -4.835e+07 <2e-16 ***
## ContactAnu -4.458e+15 8.004e+07 -5.570e+07 <2e-16 ***
## ContactAshish 1.086e+15 1.183e+08 9.181e+06 <2e-16 ***
## ContactAshit 4.553e+15 1.202e+08 3.787e+07 <2e-16 ***
## ContactAsis -3.303e+15 4.901e+07 -6.739e+07 <2e-16 ***
## [ reached getOption("max.print") -- omitted 349 rows ]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 983.81 on 731 degrees of freedom
## Residual deviance: 13768.68 on 390 degrees of freedom
## AIC: 14453
##
## Number of Fisher Scoring iterations: 25
    Now running the classifier on the test data to predict whether each message belongs to the Sent or the Received category. If the predicted probability is above 0.5 the message is classed as Sent, otherwise as Received.
options(max.print=200)
test_set=test_set[test_set$Contact %in% training_set$Contact & test_set$Time %in% training_set$Time,]
prob_pred = predict(classifier,
                    type='response',
                    newdata=test_set[-3])
y_pred=ifelse(prob_pred>0.5,1,0)
y_pred
## 377 431 702 366 198 579 806 174 605 408 200 141 637 322 465 40 313 801 172 486
## 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1
## 370 614 54 672 721 468 571 191 343 16 168 667 666 470 325 245 368 7 267 622
## 0 1 0 1 1 0 1 1 0 1 0 1 1 0 0 1 0 1 0 0
## 249 103 512 568 673 186 192 281 398 188 335 641 298 148 395 789 143 93 416 158
## 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0
## 195 634 225 170 283 450 367 797 642 204 312 271 180 578 818 84 403 489 819 346
## 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 0 0 0 1 0
## 20 258 765 484 862 909 35 661 324 320 69 479 477 885 651 472 761 328 208 321
## 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1
## 351 61 181 709 752 509 301 815 663 559 747 187 519 546 436 606 531 361 683 692
## 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
## 292 555 347 694 22 341 736 43 111 11 919 251 273 399 260 616 490 656 811 710
## 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0
## 766 162 517 363 21 608 138 239 790 268 619 907
## 0 0 0 1 1 0 1 1 0 0 0 1
    Creating a confusion matrix to check the accuracy of the results.
cm=table(test_set[,3],y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## y_pred
## 0 1
## 0 19 44
## 1 34 55
##
## Accuracy : 0.4868
## 95% CI : (0.405, 0.5692)
## No Information Rate : 0.6513
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : -0.0823
##
## Mcnemar's Test P-Value : 0.3082
##
## Sensitivity : 0.3585
## Specificity : 0.5556
## Pos Pred Value : 0.3016
## Neg Pred Value : 0.6180
## Prevalence : 0.3487
## Detection Rate : 0.1250
## Detection Prevalence : 0.4145
## Balanced Accuracy : 0.4570
##
## 'Positive' Class : 0
##
    Plotting a wordcloud for the sent messages.
datasent <- data %>%filter(Type=="Sent")%>%count(word, sort=TRUE)
datasent[c(1:10),]
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 mam 100
## 2 mmm 75
## 3 good 59
## 4 i 49
## 5 da 47
## 6 night 31
## 7 bro 30
## 8 dreams 27
## 9 sweet 27
## 10 sir 26
set.seed(1234) # for reproducibility
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.8, "Wordcloud for Sent",cex=2.0)
wordcloud(words = datasent$word, freq = datasent$n, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.50, main="Title",
          colors=brewer.pal(12, "Paired"))
    Plotting a wordcloud for the received messages.
datarecieved<- data %>%filter(Type=="Recieved")%>%count(word, sort=TRUE)
datarecieved[c(1:10),]
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 da 45
## 2 ok 27
## 3 good 24
## 4 bro 23
## 5 i 20
## 6 hmm 18
## 7 mmm 17
## 8 you 14
## 9 la 12
## 10 oh 12
set.seed(1234) # for reproducibility
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.8, "Wordcloud for Received",cex=2.0)
wordcloud(words = datarecieved$word, freq = datarecieved$n, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.50,
          colors=brewer.pal(12, "Paired"))