# Load required libraries
library(dplyr)
library(stringi)
library(ggplot2)
library(wordcloud)
library(tm)
# --- Locate the data folder ---
# Folder that holds the three en_US text files read below.
data_dir <- "C:/Users/Soniamannepuli/OneDrive/Documents/Coursera-SwiftKey/final/en_US"
if (!dir.exists(data_dir)) stop("Data folder not found! Please check your path.")
# --- Load datasets ---
# Each file is read from an explicit path built with file.path() instead of
# calling setwd(), so running this script leaves the session's working
# directory untouched.
# skipNul = TRUE makes readLines() skip embedded NUL bytes in the input.
blogs <- readLines(
  file.path(data_dir, "en_US.blogs.txt"),
  encoding = "UTF-8", skipNul = TRUE
)
news <- readLines(
  file.path(data_dir, "en_US.news.txt"),
  encoding = "UTF-8", skipNul = TRUE
)
twitter <- readLines(
  file.path(data_dir, "en_US.twitter.txt"),
  encoding = "UTF-8", skipNul = TRUE
)
# --- Summaries ---
# One row per source file: number of lines read and total word count
# (words counted per line with stringi::stri_count_words, then summed).
corpora <- list(Blogs = blogs, News = news, Twitter = twitter)
file_summary <- data.frame(
  File = names(corpora),
  # USE.NAMES = FALSE keeps the default 1..3 row names on the data frame.
  Lines = vapply(corpora, length, integer(1), USE.NAMES = FALSE),
  Words = vapply(
    corpora,
    function(text_lines) sum(stri_count_words(text_lines)),
    integer(1),
    USE.NAMES = FALSE
  )
)
print(file_summary)
# Output recorded from a previous run:
## File Lines Words
## 1 Blogs 899288 37546806
## 2 News 1010206 34761151
## 3 Twitter 2360148 30096690
# Same table rendered through knitr for the report.
knitr::kable(
  file_summary,
  caption = "Basic Summary of SwiftKey Datasets"
)
## Table: Basic Summary of SwiftKey Datasets
##
## |File    |   Lines|    Words|
## |:-------|-------:|--------:|
## |Blogs   |  899288| 37546806|
## |News    | 1010206| 34761151|
## |Twitter | 2360148| 30096690|
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(123)
# Draw 2000 lines from each source -- in the order blogs, news, twitter, which
# determines how the random stream is consumed -- and concatenate the draws
# into a single character vector.
lines_per_source <- 2000
sample_data <- unlist(
  lapply(list(blogs, news, twitter), sample, size = lines_per_source),
  use.names = FALSE
)
# Build a tm corpus from the sampled lines (one document per line) and
# normalise it step by step.
corpus <- VCorpus(VectorSource(sample_data))
# tolower is a base function, so it is wrapped with content_transformer()
# before being handed to tm_map().
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Collapse the whitespace runs left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)
# Term frequencies across the whole sample.
# The term-document matrix has one row per term and one column per document;
# summing each row gives the total count of that term.
tdm <- TermDocumentMatrix(corpus)
freq <- tdm %>%
  as.matrix() %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# freq is a named vector, so the resulting data frame also carries the terms
# as row names (visible in the printed output below).
freq_df <- data.frame(word = names(freq), freq = freq)
head(freq_df, n = 6L)
# Output recorded from a previous run:
## word freq
## the the 8666
## and and 4525
## that that 1911
## for for 1873
## with with 1289
## you you 1283
# Bar chart of the 20 most frequent terms.
# The plot takes the first 20 rows of freq_df. head() is used rather than
# freq_df[1:20, ] because indexing past the last row would pad the data with
# all-NA rows whenever fewer than 20 terms are available; with 20 or more rows
# both forms select the same data.
top_terms <- head(freq_df, 20)
ggplot(top_terms, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  # Flip the axes so the words run down the vertical axis.
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency") +
  theme_minimal()

# Word cloud of up to 100 terms, coloured with the 8-colour "Dark2" palette.
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  words = freq_df[["word"]],
  freq = freq_df[["freq"]],
  max.words = 100,
  colors = cloud_palette
)
