# Load libraries
library(webshot)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(textdata)
library(tidyr)
library(wordcloud2) # For word cloud generation
# Step 1: Load and clean the WhatsApp chat data
# Read the export as UTF-8 so emoji and other non-ASCII text are kept intact.
whatsapp_chat <- readLines("W.txt", encoding = "UTF-8", warn = FALSE)

# A message line looks like "dd/mm/yyyy, hh:mm - Sender: Message". 12-hour
# exports append an am/pm marker and may use a one-digit hour ("9:05 pm"), so
# both are allowed here. Capture groups: 1 = timestamp, 2 = sender, 3 = text.
pattern <- paste0(
  "^(\\d{1,2}/\\d{1,2}/\\d{2,4}, \\d{1,2}:\\d{2}(?:\\s?[AaPp][Mm])?)",
  " - ([^:]+): (.*)$"
)

# Parse every line in one pass. Lines that do not match (system notices
# without a "Sender:" part, continuation lines of multi-line messages) yield
# NA and are dropped.
parsed <- str_match(whatsapp_chat, regex(pattern, dotall = TRUE))
parsed <- parsed[!is.na(parsed[, 1]), , drop = FALSE]

# Raw matched lines, kept under the same name the script used before
messages <- parsed[, 1]

# Extract timestamp, sender, and message
# str_squish() normalises any unusual whitespace before the am/pm marker.
timestamps <- dmy_hm(str_squish(parsed[, 2])) # Convert to date-time format
senders <- str_trim(parsed[, 3])
text_messages <- parsed[, 4]

# Create a data frame for analysis
chat_df <- data.frame(
  Timestamp = timestamps,
  Sender = senders,
  Message = text_messages,
  stringsAsFactors = FALSE
)
chat_df$Hour <- hour(chat_df$Timestamp) # Extract hour for analysis
# Ordered factor (Monday first) so weekday plots follow calendar order
# instead of alphabetical order.
chat_df$DayOfWeek <- wday(
  chat_df$Timestamp,
  label = TRUE,
  abbr = FALSE,
  week_start = 1
)
# Step 2: Message volume by hour of day and by day of the week
# One histogram bin per hour of the day.
ggplot(data = chat_df, mapping = aes(x = Hour)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
  ggtitle("WhatsApp Usage by Hour") +
  xlab("Hour of Day") +
  ylab("Number of Messages") +
  theme_minimal()

# One bar per weekday, bar height = number of messages sent on that day.
ggplot(data = chat_df, mapping = aes(x = DayOfWeek)) +
  geom_bar(fill = "lightblue") +
  ggtitle("Messages by Day of the Week") +
  xlab("Day of Week") +
  ylab("Number of Messages") +
  theme_minimal()

# Step 3: Message length and message frequency
# Count the runs of word characters in each message.
chat_df <- chat_df %>%
  mutate(WordCount = str_count(Message, "\\w+"))

# Distribution of message lengths, in bins of five words.
ggplot(data = chat_df, mapping = aes(x = WordCount)) +
  geom_histogram(binwidth = 5, fill = "darkgreen", color = "black") +
  ggtitle("Distribution of Message Lengths") +
  xlab("Word Count") +
  ylab("Frequency") +
  theme_minimal()

# Messages per clock hour: floor each timestamp to the hour, then tally.
chat_df %>%
  count(RollingHour = floor_date(Timestamp, "hour")) %>%
  ggplot(mapping = aes(x = RollingHour, y = n)) +
  geom_line(color = "blue") +
  ggtitle("Message Frequency Over Time") +
  xlab("Time") +
  ylab("Number of Messages") +
  theme_minimal()

# Step 4: Analyze response time
# Sort chronologically, then measure the gap between each message and the
# one before it (whoever sent it). The gap is requested in minutes directly.
chat_df <- chat_df %>%
  arrange(Timestamp) %>%
  mutate(
    ResponseTime = as.numeric(
      difftime(Timestamp, lag(Timestamp), units = "mins")
    )
  )

# The first message has no predecessor, so its ResponseTime is NA; drop it.
chat_df_cleaned <- chat_df %>% filter(!is.na(ResponseTime))

# Print the first rows so ResponseTime can be checked by eye.
print(head(chat_df_cleaned))

# ResponseTime is already in minutes (units = "mins" above). It must not be
# divided by 60 again: that would silently turn the values into hours while
# the plot below is labelled in minutes.

# Plot the response time distribution with the individual data points.
print(
  ggplot(chat_df_cleaned, aes(x = ResponseTime)) +
    # Histogram with one-minute bins
    geom_histogram(
      binwidth = 1, fill = "purple", color = "black", alpha = 0.6
    ) +
    # Raw observations along the baseline, jittered horizontally
    geom_jitter(
      aes(y = 0), width = 0.1, height = 0, color = "black", alpha = 0.5
    ) +
    labs(
      title = "Response Time Distribution (minutes)",
      x = "Response Time (min)",
      y = "Frequency"
    ) +
    theme_minimal()
)

# Show which columns are available ('Sender' always, 'Receiver' only if the
# data happens to carry one).
names(chat_df)

# Stack the participant column(s) into a single 'Participant' column and tally
# how often each value occurs, largest count first. any_of() picks up
# 'Receiver' only when that column exists.
chat_df_long <- chat_df %>%
  pivot_longer(
    cols = c(Sender, any_of("Receiver")),
    names_to = "Role",
    values_to = "Participant"
  ) %>%
  count(Participant, sort = TRUE)

# View the resulting data frame
head(chat_df_long)

# Horizontal bar chart of message counts, one bar per participant.
ggplot(
  data = chat_df_long,
  mapping = aes(x = reorder(Participant, n), y = n, fill = Participant)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() + # Draw the bars horizontally
  labs(
    title = "Message Counts by Participants (Sender + Receiver)",
    x = "Participant",
    y = "Total Messages"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(), # Remove x-axis text labels
    axis.title.x = element_blank() # Remove x-axis title
  )

# Step 6: Sentiment analysis
# Tokenize each message into one word per row and drop common stop words.
# The join key is spelled out so the join does not depend on column guessing.
chat_words <- chat_df %>%
  unnest_tokens(word, Message) %>%
  anti_join(stop_words, by = "word")

# Bing lexicon: one row per word, labelled "positive" or "negative".
bing_sentiment <- get_sentiments("bing")

# Keep only words found in the lexicon and count positive and negative words
# per timestamp. Both counts are computed explicitly, so the `negative` and
# `positive` columns always exist, even when the chat contains words of only
# one class (a reshape would leave the other column missing and the
# subtraction below would fail).
chat_sentiment <- chat_words %>%
  inner_join(bing_sentiment, by = "word") %>%
  group_by(Timestamp) %>%
  summarise(
    negative = sum(sentiment == "negative"),
    positive = sum(sentiment == "positive"),
    .groups = "drop"
  ) %>%
  mutate(sentiment_score = positive - negative)

# Plot the net sentiment score over time.
ggplot(chat_sentiment, aes(x = Timestamp, y = sentiment_score)) +
  geom_line(color = "red") +
  labs(title = "Sentiment Over Time", x = "Time", y = "Sentiment Score") +
  theme_minimal()

# Step 7: Keyword or topic analysis (most common words)
# "media" and "omitted" come from the "<Media omitted>" placeholder rather
# than from anything a participant typed; they are excluded here just as they
# are for the word cloud. slice_max() names the ranking column explicitly and
# keeps ties, like the superseded top_n() did.
chat_words %>%
  filter(!word %in% c("omitted", "media")) %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "cyan") +
  coord_flip() +
  labs(title = "Most Common Words", x = "Word", y = "Frequency") +
  theme_minimal()

# Step 8: Emoji usage
# Character class covering three emoji code-point ranges:
# U+1F600-1F64F, U+1F300-1F5FF and U+1F680-1F6FF.
emoji_pattern <- "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF]"

# List column: every emoji matched in each message (empty if none).
chat_df$Emojis <- str_extract_all(chat_df$Message, emoji_pattern)

# Flatten to one vector. unlist() returns NULL when no message contains an
# emoji, which would give a data frame without an `emoji` column and make
# count() fail, so fall back to an empty character vector.
emoji_list <- unlist(chat_df$Emojis)
if (is.null(emoji_list)) {
  emoji_list <- character(0)
}

emoji_df <- data.frame(emoji = emoji_list, stringsAsFactors = FALSE) %>%
  count(emoji, sort = TRUE)

# Plot the ten most used emojis (ties kept); skip the plot when none exist.
if (nrow(emoji_df) > 0) {
  print(
    emoji_df %>%
      slice_max(order_by = n, n = 10) %>%
      ggplot(aes(x = reorder(emoji, n), y = n)) +
      geom_col(fill = "pink") +
      coord_flip() +
      labs(title = "Most Used Emojis", x = "Emoji", y = "Count") +
      theme_minimal()
  )
} else {
  message("No emojis found in the chat; skipping the emoji plot.")
}

# Step 9: Conversation gaps (inactivity periods)
# Gap to the previous message, explicitly in seconds. diff() on date-times
# picks its own unit (seconds, minutes, hours or days) from the data, and
# c(NA, ...) then discards that unit, so the 3600-second threshold below
# would only be correct by accident.
chat_df <- chat_df %>%
  mutate(
    TimeDiff = as.numeric(
      difftime(Timestamp, lag(Timestamp), units = "secs")
    )
  )

# Keep gaps longer than one hour (the NA in the first row is dropped by
# filter()) and plot their length in hours.
chat_df %>%
  filter(TimeDiff > 3600) %>%
  ggplot(aes(x = Timestamp, y = TimeDiff / 3600)) + # Seconds to hours
  geom_col(fill = "brown") +
  labs(
    title = "Inactivity Periods (Gaps > 1 Hour)",
    x = "Time",
    y = "Gap (Hours)"
  ) +
  theme_minimal()

# Step 10: Word cloud of the most used words
# Word frequencies (rare words included), most frequent first, without the
# "omitted" and "media" tokens.
chat_word_freq <- chat_words %>%
  count(word, sort = TRUE) %>%
  filter(!(word %in% c("omitted", "media")))

# Render the word cloud widget.
wordcloud2(
  chat_word_freq,
  size = 0.5,
  color = "darkblue",
  backgroundColor = "white"
)