WhatsApp Web Scraping With Python & Analysis With R

Requirements

    1. Selenium
    2. PyAutoGUI
    3. BeautifulSoup
    The browser used here is Google Chrome, automated with Selenium via ChromeDriver. On running this code, a CMD window from ChromeDriver pops up and the Chrome browser opens. It launches WhatsApp Web by default and shows a QR code; scan it with the mobile app, and the phone must remain logged in until the entire process is completed. This project is intended only for gathering knowledge and not to harm others.
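
    The script below waits a fixed 13 seconds after opening WhatsApp Web, which simply gives you time to scan the QR code. Since WebDriverWait and expected_conditions are already imported by the script, an explicit wait could be used instead; the following is only a minimal sketch of that idea, assuming the obfuscated class name '_3j8Pd' (used by the script for the toolbar icons) appears once login has completed. WhatsApp Web class names change over time, so treat the selector as an assumption.

## Sketch: wait until the QR code has been scanned instead of sleeping for a fixed time.
## The class name '_3j8Pd' is borrowed from the script below and may differ in current
## versions of WhatsApp Web.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://web.whatsapp.com/")
WebDriverWait(driver, 120).until(
    EC.presence_of_element_located((By.CLASS_NAME, '_3j8Pd')))
print("Logged in to WhatsApp Web")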

Algorithm

     1. Launch Chrome
     2. Connect to WhatsApp Web
     3. Retrieve the contacts synced with WhatsApp
     4. Retrieve the messages of each contact in sorted order
     5. Retrieve the time, type, content and date of each message
     6. Time is in the structured format HH:MM
     7. Type is either Sent or Received
     8. Content is in unstructured format
     9. Date is given as a day name or in DD/MM/YYYY format
     10. Excluding status updates, images, audio, videos, emojis etc.
     11. Extracting only text content
     This code retrieves on average about 100 messages per contact; this number can be increased or decreased either by modifying the code or by improving the internet speed. An average connection speed of around 100 kbps is enough to execute the code without flaws. The scraped data is stored in chatlist, which can be saved in CSV format (see the sketch below).
     For the purpose of analysis it has been left as it is. The data stored in chatlist is semicolon-separated (CSV2 style), so it can simply be written to a file and viewed in Excel and other spreadsheet applications.
     The output contains the time elapsed for each step along with a description, as well as the number of messages sent and received.
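
     Once the script below has populated chatlist, saving it to disk takes only a few lines, because every record is already a single semicolon-separated string of the form "Contact :;Text;Type;HH:MM;Date". The snippet below is a minimal sketch; the header row and the filename whatsapp_chats.csv are illustrative choices, not part of the original script, and the analysis later in this document actually reads chatlist straight from the Python session instead.

## Sketch: write chatlist to a semicolon-separated (CSV2-style) file that opens in Excel
## and other spreadsheet applications. The header row and the filename are illustrative.
with open('whatsapp_chats.csv', 'w', encoding='utf-8') as f:
    f.write('Contact;Text;Type;Time;Date\n')
    f.write('\n'.join(chatlist))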


##Created on 06-Dec-2019
##@author: Sudhan
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import sys
import pyautogui
import datetime
import re
from time import *
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
st=time()
def eta(seconds):
    sec=seconds-st
    return "ETA: "+str(datetime.timedelta(seconds=sec))
def valid_date(datestring):
        try:
                mat=re.match(r'(\d{2})[/.-](\d{2})[/.-](\d{4})$', datestring)
                if mat is not None:
                        datetime.datetime(*(map(int, mat.groups()[-1::-1])))
                        return True
        except ValueError:
                pass
        return False
def scroll():
    for i in range(8):
        pyautogui.press('down')
        sleep(1)
def findmsg(name):
    sleep(2);partial=0
    try:
        driver.find_element_by_class_name('_1ays2').click()
    except:
        return None
    sleep(2)
    for i in range(40):
        sleep(1)
        pyautogui.press('up')
    msg=[name];prev='Unknown';
    sleep(8)
    htmlcode=(driver.page_source).encode('utf-8')
    soup = BeautifulSoup(htmlcode,features="html.parser")
    cnt=0
    for tag in soup.find_all('span'):
        classid=tag.get('class')
        if classid==['_F7Vk', 'selectable-text', 'invisible-space', 'copyable-text']:
            msg.append([tag.text.translate(non_bmp_map).replace('\n', '')])
        if classid==['_3fnHB']:
            try:
                if msg[-1][-1] in [1,2]:
                    msg[-1].append(tag.text)
            except:
                partial=1
        if classid in [['EopGb', '_3HIqo'],['EopGb']]:
            try:
                msg[-1].append(len(classid))
            except:
                partial=1
        if classid == ['_F7Vk']:
            try:
                if tag.text in ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY', 'TODAY', 'YESTERDAY'] or valid_date(tag.text):
                    msg[-1].append(tag.text)
            except:
                if tag.text in ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY', 'TODAY', 'YESTERDAY'] or valid_date(tag.text):
                    prev=tag.text
                partial=1
    for i in msg[1:]:
        if len(i)>4:
            del i[4:]   # truncate the entry in place; rebinding i would leave msg unchanged
    
    for i in msg[1:]:
        if len(i)==3:
            i.append(prev)
        else:
            prev=i[-1]
    chats.append(msg)
driver = webdriver.Chrome()
driver.get("https://web.whatsapp.com/")
sleep(13)
chats=[]
print("Chrome has been automated",eta(time()))
## Chrome has been automated ETA: 0:00:21.044834
elem = driver.find_elements_by_class_name('_3j8Pd')
elem[1].click()
sleep(10)
print("Web Whatsapp Authetication success",eta(time()))
## Web Whatsapp Authetication success ETA: 0:00:32.019598
mycon=set()
while(True):
    scroll()
    contacts = driver.find_elements_by_css_selector('._3NWy8 span')
    newcon=set([j.text for j in contacts])
    if len(newcon|mycon)==len(mycon):
        break
    else:
        mycon=newcon|mycon
contact=sorted(list(mycon),key=str.casefold)
rotate=dict()
print(len(contact),"contacts have been retrieved",eta(time()))
## 108 contacts have been retrieved ETA: 0:02:50.282969
match=dict()
for i in contact:
    match[i.lower()]=i
contacts=[i.lower() for i in contact]
for i in range(len(contacts)):
    cnt=0
    for j in range(i):
        if contacts[i] in contacts[j]:
            cnt+=1
    rotate[match[contacts[i]]]=cnt
driver.find_element_by_class_name('qfKkX').click()
sleep(2)
scrap=0
for i in contact:
    driver.find_elements_by_class_name('_3j8Pd')[1].click()
    sleep(2)
    driver.find_element_by_class_name('_1XCAr').click()
    pyautogui.typewrite(i)
    sleep(2)
    for j in range(rotate[i]+1):
        pyautogui.press('down')
    pyautogui.press('enter')
    findmsg(i)
    try:
        driver.find_element_by_class_name('_1XCAr').click()
    except:
        pyautogui.press('esc')
        driver.find_element_by_class_name('_1XCAr').click()
    scrap+=1
    if scrap==1:
        print('['+i,"Success",eta(time()),end='')
    elif scrap==len(contact):
        print(','+i,"Success"+eta(time())+']')
    else:
        print(','+i,"Success",eta(time()),end='')
## [abhisak Success ETA: 0:03:53.117132,Ajay Success ETA: 0:04:53.980288,Ak Success ETA: 0:05:55.073330,Akka Success ETA: 0:06:55.873801,Anu Success ETA: 0:07:56.614228,Aravind Success ETA: 0:08:57.341624,Ashish Success ETA: 0:09:58.305027,Ashit Success ETA: 0:10:59.042801,Ashwathy Mam Success ETA: 0:11:59.745627,Asis Success ETA: 0:13:00.634333,Bala Success ETA: 0:14:01.523082,balaji Success ETA: 0:15:02.410045,blesson cse a Success ETA: 0:16:03.316334,Buvan Eie Success ETA: 0:17:04.254713,DANCES Success ETA: 0:18:05.123447,Daril Success ETA: 0:19:06.012867,Deepak Success ETA: 0:20:06.972385,Deepak A Success ETA: 0:21:07.770144,Dhamu Success ETA: 0:22:08.510272,Dharma Eie Success ETA: 0:23:09.384773,Dhinesh EEE Success ETA: 0:24:10.301919,Dhruvil Success ETA: 0:25:11.189713,Dinesh Success ETA: 0:26:12.061433,divya eie Success ETA: 0:27:13.002720,Dr. Arun Success ETA: 0:28:13.731812,Fiancy Success ETA: 0:29:14.460051,Friend Success ETA: 0:30:15.330926,friends Success ETA: 0:31:16.234849,Ganesh Success ETA: 0:32:17.124591,Ganesh Jio Success ETA: 0:33:17.995259,Gautam Success ETA: 0:34:18.866185,gokul Success ETA: 0:35:21.494336,Gowtham Bro Success ETA: 0:36:22.176876,hari cse Success ETA: 0:37:24.563699,hari krishna raju Success ETA: 0:38:25.266571,hari krishnan Success ETA: 0:39:25.939407,Harshit Success ETA: 0:40:26.713989,HOD Success ETA: 0:41:27.468038,Home Success ETA: 0:42:28.235691,jeeva cse. A Success ETA: 0:43:28.941026,jerin Success ETA: 0:44:29.663998,john sch Success ETA: 0:45:30.350604,Jothi Bro Success ETA: 0:46:31.105386,Kaif Success ETA: 0:47:31.776873,Kanimuthu Sir Success ETA: 0:48:32.506942,Karpagavalli Success ETA: 0:49:33.220556,Karthik Raja Kce Success ETA: 0:50:34.024288,Kce-Anudeep Success ETA: 0:51:34.845723,kohila mam Success ETA: 0:52:35.583319,Kowshi Success ETA: 0:53:36.338328,Kv2 Success ETA: 0:54:37.159393,Maari Success ETA: 0:55:37.882022,maari 2 Success ETA: 0:56:38.618111,Maari Eie Success ETA: 0:57:39.386653,Maddy Success ETA: 0:58:40.311377,Mahesh Jio Success ETA: 0:59:41.295440,mani kandan Success ETA: 1:00:42.029549,Mano Success ETA: 1:01:42.759834,Mapla Success ETA: 1:02:43.647313,Megala Mam Success ETA: 1:03:44.543679,mohana mam Success ETA: 1:04:45.290926,naveen Success ETA: 1:05:46.161672,Naveenhack Success ETA: 1:06:47.049558,Nirmal A Success ETA: 1:07:47.972131,Nishanth Anna Success ETA: 1:08:48.859460,Nithish Success ETA: 1:09:49.884680,Nithish Iit Success ETA: 1:10:50.853637,Pavithra Success ETA: 1:11:51.823512,Pavithran Success ETA: 1:12:52.782465,Pavithran A Success ETA: 1:13:53.699356,periyappa home Success ETA: 1:14:54.587946,pluto Success ETA: 1:15:57.239939,Pradeep Success ETA: 1:16:57.980375,prakash Success ETA: 1:17:58.868630,Pranav Success ETA: 1:19:01.547460,Pranesh Success ETA: 1:20:02.293422,Prasanna Success ETA: 1:21:03.190193,prasanna pay Success ETA: 1:22:03.921945,prem Success ETA: 1:23:04.808216,Ragul Success ETA: 1:24:05.699165,ragul b Success ETA: 1:25:06.567277,Ramamani Tutor Mam Success ETA: 1:26:07.455086,ramesh Success ETA: 1:27:08.460970,Ranjith Success ETA: 1:28:09.385641,Rasi Success ETA: 1:29:10.104570,Richard Success ETA: 1:30:10.974034,Rogith S Success ETA: 1:31:11.862360,Ruby Miss Success ETA: 1:32:12.847662,Sabareesan Akka Success ETA: 1:33:13.554148,Salman Farsi Success ETA: 1:34:14.443640,Sami Success ETA: 1:35:15.314432,samir ece Success ETA: 1:36:16.302520,santhonsh ece Success ETA: 1:37:17.189750,saravana A Success ETA: 1:38:18.078649,Saravana EIE 2 Success ETA: 1:39:18.983479,Sathish Success ETA: 
1:40:19.921243,sister Success ETA: 1:41:20.894183,Sri Hari Kce Success ETA: 1:42:21.864219,Surya Success ETA: 1:43:22.751519,Thangachi Success ETA: 1:44:23.483157,Thirudan Success ETA: 1:45:24.211542,Velu Success ETA: 1:46:24.898911,Vicky Success ETA: 1:47:25.588525,vickyyy Success ETA: 1:48:26.348264,vignesh mech Success ETA: 1:49:27.029506,Vimal Success ETA: 1:50:27.718444,Xerox Success ETA: 1:51:28.389449,Yogesh SuccessETA: 1:52:29.083553]
print("Messages has been successfully retrieved",eta(time()))
## Messages has been successfully retrieved ETA: 1:52:29.144389
chatlist=[]
sen=0;rec=0
for i in chats:
    details=i[0].split('\n')[0]
    for j in range(1,len(i)):
        if i[j][1]==1:
            type='Recieved';
            rec+=1
        else:
            type='Sent';
            sen+=1
        chatlist.append(';'.join([details+' '+':',str(i[j][0]),type,str(i[j][2]),str(i[j][3])]))
print("No. of Messages Sent    :",sen,eta(time()));
## No. of Messages Sent    : 557 ETA: 1:52:29.300971
print("No. of Messages Recieved:",rec,eta(time()))
## No. of Messages Recieved: 369 ETA: 1:52:29.312937
print("The messages have been loaded into chat",eta(time()))
## The messages have been loaded into chat ETA: 1:52:29.323910

Dataset Preparation

  Load the required libraries.
  The libraries that are used in this project are
     1. tidytext
     2. dplyr
     3. ggplot2
     4. viridis
     5. tidyr
     6. fmsb
     7. caret
     8. caTools
     9. wordcloud
     10. wordcloud2
     11. RColorBrewer
   The chatlist built by the Python script is read into R through reticulate as py$chatlist. Converting all factors into character type and viewing the structure of the dataset.

library(tidytext)
library(dplyr)
library(ggplot2)
library(viridis)
library(tidyr)
library(fmsb)
library(caret) 
library(caTools)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
dataset=as.data.frame(py$chatlist,stringsAsFactors=FALSE)
dataset <- data.frame(do.call('rbind', strsplit(as.character(dataset$`py$chatlist`),';',fixed=TRUE)))
dataset <- dataset%>%mutate_all(as.character)
str(dataset)
## 'data.frame':    926 obs. of  5 variables:
##  $ X1: chr  "Ak :" "Ak :" "Ak :" "Ak :" ...
##  $ X2: chr  "neenga than na div 1 lam poi mass katuveenga" "apo nan 2yrs ah competitive coding practice pannitu irunden ipo nee 2yrs ah competitive coding practice panni L"| __truncated__ "amana ...." "ana neenga eppavume periya aalu nan namburena" ...
##  $ X3: chr  "Sent" "Recieved" "Sent" "Sent" ...
##  $ X4: chr  "19:26" "20:40" "21:43" "21:44" ...
##  $ X5: chr  "16/12/2019" "16/12/2019" "16/12/2019" "16/12/2019" ...

Modifying Column Names

    The column names are modified to Contact, Text, Type, Time and Date.

colnames(dataset)=c('Contact','Text','Type','Time','Date')
names(dataset)
## [1] "Contact" "Text"    "Type"    "Time"    "Date"

Data cleaning

    The data contains some misplaced values in the Date column, so they are removed. The Date column should contain only MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY, SUNDAY, TODAY or YESTERDAY, or a date expressed in DD/MM/YYYY format.

dataset=dataset[dataset$Date %in% c('MONDAY', 'TUESDAY', 'WEDNESDAY', 
                                    'THURSDAY', 'FRIDAY', 'SATURDAY', 
                                    'SUNDAY', 'TODAY', 'YESTERDAY') |
                  !is.na(as.Date(as.character(dataset$Date),format="%d/%m/%Y")) ,]
dataset[,1]=gsub(' :','',dataset[,1])
dataset[c(1:10),]
##    Contact
## 1       Ak
## 2       Ak
## 3       Ak
## 4       Ak
## 5       Ak
## 6       Ak
## 7       Ak
## 8       Ak
## 9       Ak
## 10      Ak
##                                                                                                                                 Text
## 1                                                                                       neenga than na div 1 lam poi mass katuveenga
## 2  apo nan 2yrs ah competitive coding practice pannitu irunden ipo nee 2yrs ah competitive coding practice panni Level 5 pa iruka la
## 3                                                                                                                         amana ....
## 4                                                                                      ana neenga eppavume periya aalu nan namburena
## 5                                                                                                          unga skill enna theriyuma
## 6                                                                                                             nan observe pannathula
## 7                                                                                                            u are very good in math
## 8                                                                                                     python ungaluku periya support
## 9                                                                     apdi la illa da namba evlo work panrom apdingrathula dan iruku
## 10                                                                                                          experience matters a lot
##        Type  Time       Date
## 1      Sent 19:26 16/12/2019
## 2  Recieved 20:40 16/12/2019
## 3      Sent 21:43 16/12/2019
## 4      Sent 21:44 16/12/2019
## 5      Sent 21:45 16/12/2019
## 6      Sent 21:45 16/12/2019
## 7      Sent 21:45 16/12/2019
## 8      Sent 21:45 16/12/2019
## 9  Recieved 21:45 16/12/2019
## 10 Recieved 21:46 16/12/2019

Allocating line numbers for better understanding

    Giving a line number to each message because we are going to split the text into words. In order to find which message a word belongs to, we allocate a line number to each text.

data=dataset%>%mutate(Line=row_number())
data[c(1:10),]
##    Contact
## 1       Ak
## 2       Ak
## 3       Ak
## 4       Ak
## 5       Ak
## 6       Ak
## 7       Ak
## 8       Ak
## 9       Ak
## 10      Ak
##                                                                                                                                 Text
## 1                                                                                       neenga than na div 1 lam poi mass katuveenga
## 2  apo nan 2yrs ah competitive coding practice pannitu irunden ipo nee 2yrs ah competitive coding practice panni Level 5 pa iruka la
## 3                                                                                                                         amana ....
## 4                                                                                      ana neenga eppavume periya aalu nan namburena
## 5                                                                                                          unga skill enna theriyuma
## 6                                                                                                             nan observe pannathula
## 7                                                                                                            u are very good in math
## 8                                                                                                     python ungaluku periya support
## 9                                                                     apdi la illa da namba evlo work panrom apdingrathula dan iruku
## 10                                                                                                          experience matters a lot
##        Type  Time       Date Line
## 1      Sent 19:26 16/12/2019    1
## 2  Recieved 20:40 16/12/2019    2
## 3      Sent 21:43 16/12/2019    3
## 4      Sent 21:44 16/12/2019    4
## 5      Sent 21:45 16/12/2019    5
## 6      Sent 21:45 16/12/2019    6
## 7      Sent 21:45 16/12/2019    7
## 8      Sent 21:45 16/12/2019    8
## 9  Recieved 21:45 16/12/2019    9
## 10 Recieved 21:46 16/12/2019   10

Converting Messages to Words

    The Text column of the chats is split into words, each word keeping the line number of the message it came from.

data=data%>%unnest_tokens(word,Text)
data[c(1:10),]
##     Contact     Type  Time       Date Line       word
## 1        Ak     Sent 19:26 16/12/2019    1     neenga
## 1.1      Ak     Sent 19:26 16/12/2019    1       than
## 1.2      Ak     Sent 19:26 16/12/2019    1         na
## 1.3      Ak     Sent 19:26 16/12/2019    1        div
## 1.4      Ak     Sent 19:26 16/12/2019    1          1
## 1.5      Ak     Sent 19:26 16/12/2019    1        lam
## 1.6      Ak     Sent 19:26 16/12/2019    1        poi
## 1.7      Ak     Sent 19:26 16/12/2019    1       mass
## 1.8      Ak     Sent 19:26 16/12/2019    1 katuveenga
## 2        Ak Recieved 20:40 16/12/2019    2        apo

Finding sentiments associated with each word

    Joining the words with their sentiments. The sentiments are taken from the NRC lexicon, which has 13,891 words along with their associated sentiments.

textsentiment=data %>%
  inner_join(get_sentiments("nrc"))
textsentiment[c(1:10),]
##    Contact     Type  Time       Date Line     word    sentiment
## 1       Ak Recieved 20:40 16/12/2019    2 practice     positive
## 2       Ak Recieved 20:40 16/12/2019    2 practice     positive
## 3       Ak Recieved 20:40 16/12/2019    2    level     positive
## 4       Ak Recieved 20:40 16/12/2019    2    level        trust
## 5       Ak     Sent 21:45 16/12/2019    7     good anticipation
## 6       Ak     Sent 21:45 16/12/2019    7     good          joy
## 7       Ak     Sent 21:45 16/12/2019    7     good     positive
## 8       Ak     Sent 21:45 16/12/2019    7     good     surprise
## 9       Ak     Sent 21:45 16/12/2019    7     good        trust
## 10      Ak     Sent 21:47 16/12/2019   16      aye     positive

Message statistics

    Drawing a bar plot over the dates to get the count of messages on each day.

dataset$Date=as.factor(dataset$Date)
ggplot(data=dataset,aes(x=Date,fill=Date))+geom_bar()+coord_flip()+labs(x="Dates",y="No. of messages",title = "Bar plot over Dates")


Line graph for this week

    Plotting a line graph for this week with the count of messages on each day.

new=data%>%filter(Date %in% c('MONDAY', 'TUESDAY', 'WEDNESDAY', 
                         'THURSDAY', 'FRIDAY', 'SATURDAY', 
                         'SUNDAY', 'TODAY', 'YESTERDAY'))%>%count(Date,Type)
ggplot(new,aes(x=Date,y=n,color=Type,group=Type))+geom_point()+geom_line()


Line graph for this month

    Plotting a line graph for this month with the count of messages on each date of the current month.

myDate = as.POSIXct(Sys.Date())
month=format(myDate,"%m")
new=data%>%filter(!is.na(as.Date(as.character(data$Date),format="%d/%m/%Y")))%>%count(Date,Type)
new=new%>%filter(month==format(as.Date(Date),"%m"))
new=new%>%mutate(Date=substr(Date, start = 1, stop = 2))
ggplot(new,aes(x=Date,y=n,color=Type,group=Type))+geom_point()+geom_line()


Sentiment statistics

    Plotting a pie chart of my sentiments in increasing order.

sentimentcount=textsentiment%>%
  group_by(sentiment)%>%count()%>%arrange(desc(n))
ggplot(sentimentcount, aes(x=reorder(sentiment,n), y=n,fill=sentiment)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0)+theme_bw()+labs(x='Sorted Sentiments',y='Usage',title = "Sentiments Contribution")


Top 20 chats

    Plotting the top 20 chats of both types (Sent and Received) in decreasing order.

msgtype=data%>%group_by(Contact)%>%count(Type)
msgtype=data.frame(msgtype)
colnames(msgtype)=c('Contact','Type','Count')
top20=msgtype%>%group_by(Type)%>%top_n(n=20,wt=Count)%>%ungroup()
scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
ggplot(top20,aes(x=reorder_within(Contact,Count,Type),y=Count,fill=Contact))+scale_x_reordered()+
  facet_wrap(~Type,scales = "free")+geom_bar(stat = "identity")+coord_flip()+labs(x="Contacts",y="Messages Count",
                                                                                  title="Top 20 Chats Sent & Recieved")


Most frequently sent words

    Picking the top 5 contacts in the sent-messages category and finding their top 5 words based on usage frequency.

top5sent=dataset[dataset$Type=="Sent",]%>%count(Contact)%>%top_n(5)%>%head(5)
data[data$Contact %in% top5sent$Contact & data$Type=="Sent",]%>%
  count(Contact, word, sort = TRUE) %>%
  group_by(Contact) %>%
  top_n(5) %>%
  ggplot(aes(x = reorder_within(word, n, Contact), y = n, fill = Contact)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~Contact, ncol = 2, scales = "free_y") +
  scale_x_reordered() +
  labs(title = "Most oftenly Sent words",x="Words",y="Count")


Most frequently received words

    Picking the top 5 contacts in the received-messages category and finding their most frequently used words.

top5Recieved=dataset[dataset$Type=="Recieved",]%>%count(Contact)%>%top_n(5)%>%head(5)
data[data$Contact %in% top5Recieved$Contact & data$Type=="Recieved",]%>%
  count(Contact, word, sort = TRUE) %>%
  group_by(Contact) %>%
  top_n(4) %>%
  ggplot(aes(x = reorder_within(word, n, Contact), y = n, fill = Contact)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~Contact, ncol = 2, scales = "free_y") +
  scale_x_reordered() +
  labs(title="Most oftenly Recieved words",x="Words",y="Count")


Sentiment Contribution

    Finding the sentiments associated with my words at different times of day such as Morning, Afternoon, Evening, Night and Mid Night.

times=textsentiment%>%separate(Time,c("Hour","Minute"), sep = ":")%>%unite(Time,Hour, Minute,sep = ".")
times$Time=as.double(unlist(times$Time))
times=times %>% mutate(group = case_when(
  Time > 6 & Time <= 12~ 'Morning',
  Time > 12 & Time <= 16 ~ 'Afternoon', 
  Time > 16 & Time <= 21 ~ 'Evening',
  Time > 21 & Time <= 24 ~ 'Night',
  Time > 0 & Time <= 6~ 'Mid Night'))
times%>%count(sentiment,group)%>%group_by(group)%>%ungroup()%>%
  ggplot(aes(x=reorder(sentiment,n),y=n,fill=group))+geom_bar(stat="identity")+facet_wrap(~group,ncol=2,scales="free")+coord_flip()+labs(x="Sentiments",y="Count",title = "Sentiments Contribution Over Time")


Radar chart with sentiments

    Plotting a radar chart of my first 5 contacts and their contribution to my sentiments such as joy, anticipation, trust, positive and surprise.

radar=as.data.frame.matrix(head(table(textsentiment$Contact,textsentiment$sentiment),5)[,-c(1,3,4,6,8)])
radar <- rbind(rep(30,5) , rep(0,5) , radar)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.4), rgb(0.8,0.2,0.5,0.4) , rgb(0.7,0.5,0.1,0.4) )
radarchart( radar, axistype=1 ,
            pcol=colors_border , pfcol=colors_in , plwd=4 , plty=1,
            cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,title=paste("Radar chart with Sentiments"),
            vlcex=0.8)
legend(x=1.0, y=1, legend = rownames(radar[-c(1,2),]), bty = "n", pch=20 ,
       col=colors_in , text.col = "black", cex=1.2, pt.cex=3)


Machine Learning

    Trying to learn a simple predictive model from the data.

learndata=dataset

Predicting the type of message

    Let's try to predict whether a message was sent or received.
    Shuffle the data, because it is arranged author-wise, in order to get better analysis results.

set.seed(7356)
rows <- sample(nrow(learndata))
learndata=learndata[rows,]
learndata[c(1:10),]
##         Contact
## 377     Harshit
## 749      ramesh
## 715     Pranesh
## 838   Ruby Miss
## 334 Gowtham Bro
## 758     Ranjith
## 462       jerin
## 184       Ashit
## 769     Ranjith
## 431         HOD
##                                                                                 Text
## 377 the words u r frequently using and the emotional containers of the top 10 person
## 749                                                                   hii da rameshu
## 715                                                                       poi paruda
## 838                                                     good night sweet dreams  mam
## 334                                    nan ivalo nalla irukka avunga than karanam so
## 758                                                               regarding WhatsApp
## 462                                              Athula on pannikoda theva paduratha
## 184                                                                          yes bro
## 769                                             do u have any unimplemented idea ...
## 431                                                                      I am suthan
##     Type  Time       Date
## 377 Sent 17:14 17/12/2019
## 749 Sent 22:10 06/12/2019
## 715 Sent 20:02  YESTERDAY
## 838 Sent 23:08 13/12/2019
## 334 Sent 08:37 08/12/2019
## 758 Sent 10:38  WEDNESDAY
## 462 Sent 16:23  YESTERDAY
## 184 Sent 18:25 16/12/2019
## 769 Sent 10:40  WEDNESDAY
## 431 Sent 19:22 19/12/2019

Encoding categorical variables

    Replacing the Type column with 0s and 1s (1 represents the Sent category and 0 represents the Received category).

learndata$Type <- ifelse(learndata$Type=='Sent', 1, 0)
learndata[c(1:10),]
##         Contact
## 377     Harshit
## 749      ramesh
## 715     Pranesh
## 838   Ruby Miss
## 334 Gowtham Bro
## 758     Ranjith
## 462       jerin
## 184       Ashit
## 769     Ranjith
## 431         HOD
##                                                                                 Text
## 377 the words u r frequently using and the emotional containers of the top 10 person
## 749                                                                   hii da rameshu
## 715                                                                       poi paruda
## 838                                                     good night sweet dreams  mam
## 334                                    nan ivalo nalla irukka avunga than karanam so
## 758                                                               regarding WhatsApp
## 462                                              Athula on pannikoda theva paduratha
## 184                                                                          yes bro
## 769                                             do u have any unimplemented idea ...
## 431                                                                      I am suthan
##     Type  Time       Date
## 377    1 17:14 17/12/2019
## 749    1 22:10 06/12/2019
## 715    1 20:02  YESTERDAY
## 838    1 23:08 13/12/2019
## 334    1 08:37 08/12/2019
## 758    1 10:38  WEDNESDAY
## 462    1 16:23  YESTERDAY
## 184    1 18:25 16/12/2019
## 769    1 10:40  WEDNESDAY
## 431    1 19:22 19/12/2019

Splitting the data

    The data is divided into training and test sets. The training set is used for learning and the test set is used to evaluate the fitted model. The training set takes about 80% of the original data and the test set the remaining 20%.

set.seed(300)
split = sample.split(learndata$Type,
                     SplitRatio = 0.80)

training_set =subset(learndata, split==TRUE)
test_set =subset(learndata, split==FALSE)
training_set[c(1:10),]
##         Contact                                          Text Type  Time
## 749      ramesh                                hii da rameshu    1 22:10
## 715     Pranesh                                    poi paruda    1 20:02
## 838   Ruby Miss                  good night sweet dreams  mam    1 23:08
## 334 Gowtham Bro nan ivalo nalla irukka avunga than karanam so    1 08:37
## 758     Ranjith                            regarding WhatsApp    1 10:38
## 462       jerin           Athula on pannikoda theva paduratha    1 16:23
## 184       Ashit                                       yes bro    1 18:25
## 769     Ranjith          do u have any unimplemented idea ...    1 10:40
## 443       jerin                Off aaguthuda service error nu    0 12:52
## 125         Anu                                     I'lla jee    0 20:14
##           Date
## 749 06/12/2019
## 715  YESTERDAY
## 838 13/12/2019
## 334 08/12/2019
## 758  WEDNESDAY
## 462  YESTERDAY
## 184 16/12/2019
## 769  WEDNESDAY
## 443  YESTERDAY
## 125  YESTERDAY
test_set[c(1:10),]
##        Contact
## 377    Harshit
## 431        HOD
## 702      pluto
## 366    Harshit
## 925     Yogesh
## 198      Ashit
## 579 Megala Mam
## 806       Rasi
## 522 Mahesh Jio
## 174     Ashish
##                                                                                 Text
## 377 the words u r frequently using and the emotional containers of the top 10 person
## 431                                                                      I am suthan
## 702                                                       send English paragraphs da
## 366                                                                           Ok bro
## 925                                       If not received communicate in your group.
## 198                                                                     with a and b
## 579                                                                              Okk
## 806                                                                     summada rasi
## 522                    https://www.instagram.com/p/B570YeMhtL0/?igshid=128nfrm2zq82g
## 174                                                                   Hmm okay buddy
##     Type  Time       Date
## 377    1 17:14 17/12/2019
## 431    1 19:22 19/12/2019
## 702    1 13:03     MONDAY
## 366    0 17:10 17/12/2019
## 925    1 10:40 16/12/2019
## 198    1 18:27 16/12/2019
## 579    0 10:42  WEDNESDAY
## 806    1 18:23    TUESDAY
## 522    0 16:21 16/12/2019
## 174    0 17:46 16/12/2019

Creating Classifier

    The classifier is a generalized linear model (GLM) of the binomial family, with Type as the response and Contact and Time as the predictors.

options(max.print=30)
classifier = glm(formula = Type ~ Contact+Time,
                 family = binomial(),
                 data = training_set)
summary(classifier)
## 
## Call:
## glm(formula = Type ~ Contact + Time, family = binomial(), data = training_set)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -8.49    0.00    0.00    0.00    8.49  
## 
## Coefficients: (13 not defined because of singularities)
##                         Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)           -1.086e+15  9.746e+07 -1.115e+07   <2e-16 ***
## ContactAkka           -3.417e+15  7.067e+07 -4.835e+07   <2e-16 ***
## ContactAnu            -4.458e+15  8.004e+07 -5.570e+07   <2e-16 ***
## ContactAshish          1.086e+15  1.183e+08  9.181e+06   <2e-16 ***
## ContactAshit           4.553e+15  1.202e+08  3.787e+07   <2e-16 ***
## ContactAsis           -3.303e+15  4.901e+07 -6.739e+07   <2e-16 ***
##  [ reached getOption("max.print") -- omitted 349 rows ]
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:   983.81  on 731  degrees of freedom
## Residual deviance: 13768.68  on 390  degrees of freedom
## AIC: 14453
## 
## Number of Fisher Scoring iterations: 25

Predicting with classifier

    Now running the classifier on the test data to predict whether a message belongs to the Sent or the Received category. If the predicted probability is above 0.5 it is classified as Sent, otherwise as Received.

options(max.print=200)
test_set=test_set[test_set$Contact %in% training_set$Contact & test_set$Time %in% training_set$Time,]

prob_pred = predict(classifier,
                    type='response',
                    newdata=test_set[-3])
y_pred=ifelse(prob_pred>0.5,1,0)

y_pred
## 377 431 702 366 198 579 806 174 605 408 200 141 637 322 465  40 313 801 172 486 
##   1   1   1   1   1   1   1   1   1   1   1   0   0   1   0   0   1   1   1   1 
## 370 614  54 672 721 468 571 191 343  16 168 667 666 470 325 245 368   7 267 622 
##   0   1   0   1   1   0   1   1   0   1   0   1   1   0   0   1   0   1   0   0 
## 249 103 512 568 673 186 192 281 398 188 335 641 298 148 395 789 143  93 416 158 
##   1   0   0   1   1   1   1   1   1   1   1   0   0   0   1   0   0   0   0   0 
## 195 634 225 170 283 450 367 797 642 204 312 271 180 578 818  84 403 489 819 346 
##   1   1   0   1   1   1   1   0   0   1   0   1   1   1   1   0   0   0   1   0 
##  20 258 765 484 862 909  35 661 324 320  69 479 477 885 651 472 761 328 208 321 
##   1   0   1   0   1   1   1   1   0   1   1   1   0   1   1   1   1   0   0   1 
## 351  61 181 709 752 509 301 815 663 559 747 187 519 546 436 606 531 361 683 692 
##   1   1   1   1   1   1   0   1   1   1   1   1   1   1   1   1   1   1   0   1 
## 292 555 347 694  22 341 736  43 111  11 919 251 273 399 260 616 490 656 811 710 
##   1   1   0   1   1   0   0   1   1   1   1   1   1   1   0   1   0   1   1   0 
## 766 162 517 363  21 608 138 239 790 268 619 907 
##   0   0   0   1   1   0   1   1   0   0   0   1

Confusion matrix

    Creating a confusion matrix to check the accuracy of the results.

cm=table(test_set[,3],y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##      0  1
##   0 19 44
##   1 34 55
##                                          
##                Accuracy : 0.4868         
##                  95% CI : (0.405, 0.5692)
##     No Information Rate : 0.6513         
##     P-Value [Acc > NIR] : 1.0000         
##                                          
##                   Kappa : -0.0823        
##                                          
##  Mcnemar's Test P-Value : 0.3082         
##                                          
##             Sensitivity : 0.3585         
##             Specificity : 0.5556         
##          Pos Pred Value : 0.3016         
##          Neg Pred Value : 0.6180         
##              Prevalence : 0.3487         
##          Detection Rate : 0.1250         
##    Detection Prevalence : 0.4145         
##       Balanced Accuracy : 0.4570         
##                                          
##        'Positive' Class : 0              
## 

WordCloud for Sent

    Plotting a wordcloud for the sent messages.

datasent <- data %>%filter(Type=="Sent")%>%count(word, sort=TRUE)
datasent[c(1:10),]
## # A tibble: 10 x 2
##    word       n
##    <chr>  <int>
##  1 mam      100
##  2 mmm       75
##  3 good      59
##  4 i         49
##  5 da        47
##  6 night     31
##  7 bro       30
##  8 dreams    27
##  9 sweet     27
## 10 sir       26
set.seed(1234) # for reproducibility 
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.8, "Wordcloud for Sent",cex=2.0)
wordcloud(words = datasent$word, freq = datasent$n, min.freq = 1,  
          max.words=200, random.order=FALSE, rot.per=0.50,  main="Title",
          colors=brewer.pal(12, "Paired"))


WordCloud for Received

    Plotting a wordcloud for the received messages.

datarecieved<- data %>%filter(Type=="Recieved")%>%count(word, sort=TRUE)
datarecieved[c(1:10),]
## # A tibble: 10 x 2
##    word      n
##    <chr> <int>
##  1 da       45
##  2 ok       27
##  3 good     24
##  4 bro      23
##  5 i        20
##  6 hmm      18
##  7 mmm      17
##  8 you      14
##  9 la       12
## 10 oh       12
set.seed(1234) # for reproducibility 
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.8, "Wordcloud for Recieved",cex=2.0)
wordcloud(words = datarecieved$word, freq = datarecieved$n, min.freq = 1,  
          max.words=200, random.order=FALSE, rot.per=0.50,  
          colors=brewer.pal(12, "Paired"))