# Load libraries
library(webshot)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(textdata)
library(tidyr)
library(wordcloud2)  # For word cloud generation

# Step 1: Load and clean the WhatsApp chat data
whatsapp_chat <- readLines("W.txt")

# Every message line starts with a "dd/mm/yyyy, hh:mm" stamp. Exports taken
# with a 12-hour clock append an am/pm marker, e.g.
# "15/07/2023, 12:14 pm - Sender: Message".
pattern <- "^\\d{2}/\\d{2}/\\d{4}, \\d{2}:\\d{2}"

# Keep only the lines that begin with a timestamp; lines that do not match
# (such as continuation lines of multi-line messages) are dropped.
messages <- whatsapp_chat[grep(pattern, whatsapp_chat)]

# Extract timestamp, sender, and message
timestamps <- sub(" - .*", "", messages)
timestamps <- dmy_hm(timestamps)  # Convert to date-time format

# The sender sits between " - " and the first ": ". The optional am/pm group
# matters: without it the expression never matches a 12-hour export and the
# Sender column ends up holding the entire raw line. Lines that carry no
# "Sender: " part (e.g. system notices) get NA instead of the raw line.
sender_pattern <- paste0(pattern, "(?:\\s?[AaPp]\\.?[Mm]\\.?)? - ([^:]+): ")
senders <- str_match(messages, sender_pattern)[, 2]
text_messages <- sub(".*?: ", "", messages)

# Create a data frame for analysis
chat_df <- data.frame(Timestamp = timestamps, Sender = senders, Message = text_messages)
chat_df$Hour <- hour(chat_df$Timestamp)  # Extract hour for analysis
chat_df$DayOfWeek <- weekdays(chat_df$Timestamp)

# Step 2: Analyze message volume by hour and day of the week
# Histogram of the hour-of-day column, one bin per hour.
ggplot(data = chat_df, mapping = aes(Hour)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "steelblue") +
  ggtitle("WhatsApp Usage by Hour") +
  xlab("Hour of Day") +
  ylab("Number of Messages") +
  theme_minimal()

# Message counts per weekday. DayOfWeek is a character column, which a
# discrete axis would sort alphabetically; converting it to a factor with the
# levels in calendar order (Monday first) keeps the bars in week order.
# 2024-01-01 was a Monday, so weekdays() on that date plus 0:6 yields the seven
# day names in order, in the same locale weekdays() used for DayOfWeek.
week_order <- weekdays(as.Date("2024-01-01") + 0:6)

ggplot(chat_df, aes(x = factor(DayOfWeek, levels = week_order))) +
  geom_bar(fill = "lightblue") +
  labs(title = "Messages by Day of the Week", x = "Day of Week", y = "Number of Messages") +
  theme_minimal()

# Step 3: Analyze message length and frequency
# Word count per message = number of runs of word characters in the text.
chat_df[["WordCount"]] <- str_count(chat_df[["Message"]], pattern = "\\w+")

# Histogram of message lengths in bins of five words.
ggplot(data = chat_df, mapping = aes(WordCount)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "darkgreen") +
  ggtitle("Distribution of Message Lengths") +
  xlab("Word Count") +
  ylab("Frequency") +
  theme_minimal()

# Messages per clock hour: timestamps are floored to the hour and counted,
# then drawn as a line over time.
chat_df %>%
  count(RollingHour = floor_date(Timestamp, unit = "hour")) %>%
  ggplot(mapping = aes(RollingHour, n)) +
  geom_line(colour = "blue") +
  ggtitle("Message Frequency Over Time") +
  xlab("Time") +
  ylab("Number of Messages") +
  theme_minimal()

# Step 4: Analyze response time
# Sort chronologically, then measure the gap (in minutes) between each message
# and the one immediately before it.
chat_df <- arrange(chat_df, Timestamp)
chat_df <- mutate(
  chat_df,
  ResponseTime = as.numeric(difftime(Timestamp, lag(Timestamp), units = "mins"))
)

# The first message has no predecessor, so its gap is NA; drop such rows.
chat_df_cleaned <- filter(chat_df, !is.na(ResponseTime))

# Show the first rows as a sanity check on the computed gaps.
print(head(chat_df_cleaned, n = 6L))
##             Timestamp
## 1 2023-07-15 12:14:00
## 2 2023-07-28 22:28:00
## 3 2023-07-28 22:28:00
## 4 2023-07-28 22:35:00
## 5 2023-08-01 11:49:00
## 6 2023-08-06 11:00:00
##                                                                        Sender
## 1                                           15/07/2023, 12:14 pm - Dhivya: Mm
## 2                                           28/07/2023, 10:28 pm - Dhivya: Mm
## 3 28/07/2023, 10:28 pm - Dhivya: Busula poniya illa appa vanthu cutetu ponara
## 4                                28/07/2023, 10:35 pm - Kaviya...: Bus la tha
## 5                           01/08/2023, 11:49 am - Kaviya...: <Media omitted>
## 6                                       06/08/2023, 11:00 am - Kaviya...: Mmm
##                                        Message Hour DayOfWeek WordCount
## 1                                           Mm   12  Saturday         1
## 2                                           Mm   22    Friday         1
## 3 Busula poniya illa appa vanthu cutetu ponara   22    Friday         7
## 4                                   Bus la tha   22    Friday         3
## 5                              <Media omitted>   11   Tuesday         2
## 6                                          Mmm   11    Sunday         1
##   ResponseTime
## 1        20139
## 2        19334
## 3            0
## 4            7
## 5         5114
## 6         7151
# Step 4: No unit conversion is needed here. ResponseTime was computed with
# difftime(..., units = "mins"), so it is already in minutes; dividing by 60
# again would turn the values into hours while the plot labels say minutes.

# Step 5: Plot the response time distribution with data points
print(
  ggplot(chat_df_cleaned, aes(x = ResponseTime)) +
    geom_histogram(binwidth = 1, fill = "purple", color = "black", alpha = 0.6) +   # Histogram
    # Individual observations drawn along the baseline (y = 0), jittered
    # horizontally only, so the raw values stay visible under the bars.
    geom_jitter(aes(y = 0), width = 0.1, height = 0, color = "black", alpha = 0.5) +
    labs(title = "Response Time Distribution (minutes)", x = "Response Time (min)", y = "Frequency") +
    theme_minimal()
)

# List the available columns (a 'Receiver' column is optional below)
colnames(chat_df)
## [1] "Timestamp"    "Sender"       "Message"      "Hour"         "DayOfWeek"   
## [6] "WordCount"    "ResponseTime"
# Stack the participant column(s) into one long column and count how often
# each participant appears, most frequent first. `Sender` is always used;
# `Receiver` is included only when the data frame has such a column.
chat_df_long <- chat_df %>%
  pivot_longer(
    cols = c(Sender, any_of("Receiver")),
    names_to = "Role",
    values_to = "Participant"
  ) %>%
  count(Participant, sort = TRUE)

# Preview the counts
head(chat_df_long)
## # A tibble: 6 × 2
##   Participant                                           n
##   <chr>                                             <int>
## 1 25/06/2024, 12:07 pm - Kaviya...: <Media omitted>    72
## 2 11/03/2024, 12:46 pm - Dhivya: <Media omitted>        9
## 3 04/03/2024, 11:40 am - Dhivya: <Media omitted>        6
## 4 09/08/2023, 12:05 pm - Kaviya...: <Media omitted>     5
## 5 11/03/2024, 10:31 pm - Kaviya...: <Media omitted>     5
## 6 11/03/2024, 12:47 pm - Dhivya: <Media omitted>        5
# Horizontal bar chart of message counts, one bar per participant, ordered by
# count. The legend is suppressed and the text and title of the theme's
# x axis are blanked.
ggplot(
  data = chat_df_long,
  mapping = aes(x = reorder(Participant, n), y = n, fill = Participant)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Message Counts by Participants (Sender + Receiver)") +
  xlab("Participant") +
  ylab("Total Messages") +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  )

# Step 6: Sentiment analysis
# Step 1: Tokenize messages into one word per row and remove stop words.
# The join key is spelled out so the join does not depend on (or report)
# whichever columns the two tables happen to share.
chat_words <- chat_df %>%
  unnest_tokens(word, Message) %>%
  anti_join(stop_words, by = "word")

# Step 2: Get sentiment from Bing lexicon
bing_sentiment <- get_sentiments("bing")

# Step 3: Label words with their Bing polarity and count positive/negative
# words per timestamp, one column per polarity (missing combinations = 0).
chat_sentiment <- chat_words %>%
  inner_join(bing_sentiment, by = "word") %>%
  count(Timestamp, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)

# A chat may contain words of only one polarity (or none); make sure both
# columns exist so the score below never fails on a missing column.
for (polarity in c("negative", "positive")) {
  if (!polarity %in% names(chat_sentiment)) {
    chat_sentiment[[polarity]] <- rep(0, nrow(chat_sentiment))
  }
}

chat_sentiment <- chat_sentiment %>%
  mutate(sentiment_score = positive - negative)

# Step 4: Plot sentiment over time
ggplot(chat_sentiment, aes(x = Timestamp, y = sentiment_score)) +
  geom_line(color = "red") +
  labs(title = "Sentiment Over Time", x = "Time", y = "Sentiment Score") +
  theme_minimal()

# Step 7: Keyword or topic analysis (most common words)
# "media" and "omitted" come from the "<Media omitted>" placeholder that the
# export writes for attachments, not from what people typed, so they are
# excluded here just as they are for the word cloud.
# slice_max() keeps the 20 most frequent words (ties included) and names the
# ranking column explicitly instead of relying on top_n()'s implicit choice.
chat_words %>%
  filter(!word %in% c("omitted", "media")) %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "cyan") +
  coord_flip() +
  labs(title = "Most Common Words", x = "Word", y = "Frequency") +
  theme_minimal()

# Step 8: Emoji usage
# Character class covering the main emoji blocks:
#   U+1F600-1F64F  emoticons
#   U+1F300-1F5FF  miscellaneous symbols and pictographs
#   U+1F680-1F6FF  transport and map symbols
#   U+1F900-1F9FF  supplemental symbols and pictographs
#   U+2600-27BF    miscellaneous symbols and dingbats (e.g. the red heart)
# The last two ranges are needed so that commonly used emojis outside the
# first three blocks are counted as well.
emoji_pattern <- paste0(
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF",
  "\U0001F900-\U0001F9FF\u2600-\u27BF]"
)

# One character vector of matched emojis per message (list column), then
# flattened into a single vector for counting.
chat_df$Emojis <- str_extract_all(chat_df$Message, emoji_pattern)
emoji_list <- unlist(chat_df$Emojis)

emoji_df <- data.frame(emoji = emoji_list) %>%
  count(emoji, sort = TRUE)

# Ten most frequent emojis (ties included), ranked explicitly by count.
emoji_df %>%
  slice_max(n, n = 10) %>%
  ggplot(aes(x = reorder(emoji, n), y = n)) +
  geom_col(fill = "pink") +
  coord_flip() +
  labs(title = "Most Used Emojis", x = "Emoji", y = "Count") +
  theme_minimal()

# Step 9: Conversation gaps (inactivity periods)
# Gap to the previous message in seconds. The unit is fixed explicitly:
# diff() on date-times picks its unit automatically and c(NA, ...) then drops
# that unit, so the 3600-second threshold below could otherwise be compared
# against minutes, hours or days. The first row has no predecessor and gets NA.
chat_df <- chat_df %>%
  mutate(TimeDiff = as.numeric(difftime(Timestamp, lag(Timestamp), units = "secs")))

# Keep gaps longer than one hour (rows with an NA gap are dropped by filter)
# and plot their length in hours.
chat_df %>%
  filter(TimeDiff > 3600) %>%
  ggplot(aes(x = Timestamp, y = TimeDiff / 3600)) +  # Convert to hours
  geom_col(fill = "brown") +
  labs(title = "Inactivity Periods (Gaps > 1 Hour)", x = "Time", y = "Gap (Hours)") +
  theme_minimal()

# Step 10: Create a Word Cloud for most used words
# Frequency table of every remaining word (rare ones included), most frequent
# first, with the "omitted" and "media" tokens removed afterwards.
excluded_words <- c("omitted", "media")
chat_word_freq <- chat_words %>%
  count(word, sort = TRUE) %>%
  filter(!(word %in% excluded_words))

# Render the cloud
wordcloud2(
  chat_word_freq,
  size = 0.5,
  color = "darkblue",
  backgroundColor = "white"
)