# Load required libraries
library(dplyr)
library(stringi)
library(ggplot2)
library(wordcloud)
library(tm)

# --- Set working directory ---
# Folder holding the three en_US text files. The SWIFTKEY_DATA_DIR environment
# variable, when set, overrides the location; otherwise the original
# machine-specific path is used as the fallback.
data_dir <- Sys.getenv(
  "SWIFTKEY_DATA_DIR",
  unset = "C:/Users/Soniamannepuli/OneDrive/Documents/Coursera-SwiftKey/final/en_US"
)
if (!dir.exists(data_dir)) {
  stop("Data folder not found! Please check your path.")
}
# The reads below use bare file names, so they rely on this directory change.
setwd(data_dir)

# --- Load datasets ---
# Read one text file from the working directory as a character vector with one
# element per line, declaring the input as UTF-8 and skipping embedded NULs.
read_text_lines <- function(file_name) {
  readLines(file_name, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines("en_US.blogs.txt")
news <- read_text_lines("en_US.news.txt")
twitter <- read_text_lines("en_US.twitter.txt")

# --- Summaries ---
# One row per source file: number of lines read and total word count
# (words counted per line with stri_count_words(), then summed).
sources <- list(Blogs = blogs, News = news, Twitter = twitter)
line_totals <- vapply(sources, length, integer(1))
word_totals <- vapply(
  sources,
  function(lines) sum(stri_count_words(lines)),
  integer(1)
)

# unname() keeps the default 1..3 row names instead of the list names
file_summary <- data.frame(
  File = names(sources),
  Lines = unname(line_totals),
  Words = unname(word_totals)
)

print(file_summary)
##      File   Lines    Words
## 1   Blogs  899288 37546806
## 2    News 1010206 34761151
## 3 Twitter 2360148 30096690
knitr::kable(file_summary, caption = "Basic Summary of SwiftKey Datasets")
## Basic Summary of SwiftKey Datasets
## File Lines Words
## Blogs 899288 37546806
## News 1010206 34761151
## Twitter 2360148 30096690
# --- Sampling ---
# Fixed seed so the same lines are drawn on every run
set.seed(123)

# Number of lines to draw from each source
sample_size <- 2000

# Draw up to n lines without replacement. The size is capped at the number of
# available lines so a source shorter than n is used in full instead of making
# sample() fail; for longer sources the draw is the same as sample(lines, n).
draw_lines <- function(lines, n = sample_size) {
  sample(lines, min(n, length(lines)))
}

# Draw order (blogs, news, twitter) is kept so the random stream is unchanged
sample_data <- c(
  draw_lines(blogs),
  draw_lines(news),
  draw_lines(twitter)
)

# Clean corpus
# Every sampled line becomes one document in a volatile corpus
corpus <- VCorpus(VectorSource(sample_data))

# Normalisation steps, applied in this order:
# lower-case, drop punctuation, drop digits, collapse repeated whitespace
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# Token frequency
# Term-by-document counts for the cleaned sample.
# NOTE(review): TermDocumentMatrix() is called with default control settings;
# per the tm documentation the default minimum word length is 3, which would
# explain why no one- or two-letter words appear below -- confirm this is
# intended.
tdm <- TermDocumentMatrix(corpus)

# Total occurrences of each term across all documents.
# NOTE(review): as.matrix() expands the matrix to a dense terms x documents
# table, which may need a lot of memory for larger samples -- confirm
# acceptable.
term_totals <- rowSums(as.matrix(tdm))
freq <- sort(term_totals, decreasing = TRUE)

# One row per term, most frequent first; row names are the terms themselves
freq_df <- data.frame(word = names(freq), freq = freq)
head(freq_df)
##      word freq
## the   the 8666
## and   and 4525
## that that 1911
## for   for 1873
## with with 1289
## you   you 1283
# --- Plots ---
# Bar chart of the 20 most frequent terms. head() is used instead of
# freq_df[1:20, ] so a vocabulary with fewer than 20 terms does not add
# all-NA rows to the plot data; with 20 or more terms the rows are the same.
top_terms <- head(freq_df, 20)
ggplot(top_terms, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency") +
  theme_minimal()

# Word cloud limited to 100 words, coloured with the "Dark2" palette
wordcloud(
  words = freq_df$word,
  freq = freq_df$freq,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)