The goal of this Capstone Project is to develop a predictive text model similar to SwiftKey. In this Milestone Report, we summarize the exploratory analysis of the data and outline our plan for building the final algorithm and Shiny app.
The dataset is provided by SwiftKey and contains text from three sources: blogs, news, and Twitter. We use the English language files for this project.
```{r setup, include=TRUE}
# Packages used by the analysis in this report.
library(tm)         # VCorpus(), tm_map(), DocumentTermMatrix()
library(stringi)    # stri_count_words()
library(ggplot2)    # bar chart of the most frequent words
library(knitr)      # kable() for the summary table
library(wordcloud)  # wordcloud()
library(dplyr)      # arrange(), desc() and the %>% pipe
```
```{r load-data}
# Read a text file into a character vector with one element per line.
#
# The file is opened through an explicit binary-mode connection: a text-mode
# read on Windows can stop early if the file contains an embedded Ctrl-Z
# (0x1A) byte, which would silently shorten the data. readLines() accepts
# LF, CRLF and CR line endings regardless of the connection mode.
#
# path: path to a UTF-8 encoded text file.
# Returns a character vector marked as UTF-8; embedded NULs are skipped.
read_corpus_file <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_file("final/en_US/en_US.blogs.txt")
news <- read_corpus_file("final/en_US/en_US.news.txt")
twitter <- read_corpus_file("final/en_US/en_US.twitter.txt")
```
```{r data-summary}
# Paths of the three English source files, in the same order as the rows of
# the summary table below (blogs, news, Twitter).
corpus_paths <- c(
  "final/en_US/en_US.blogs.txt",
  "final/en_US/en_US.news.txt",
  "final/en_US/en_US.twitter.txt"
)

# One row per source: number of lines read, total word count, and the size
# of the file on disk.
data_summary <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  ),
  # file.info() is vectorised over paths; size is in bytes, so dividing by
  # 1024^2 gives megabytes.
  Size_MB = file.info(corpus_paths)$size / 1024^2
)

kable(data_summary)
```
```{r sample-data}
# Fix the RNG seed so the same lines are drawn on every knit.
set.seed(123)

# Fraction of each source kept for the exploratory analysis.
sample_fraction <- 0.01

# Draw the samples in a fixed order (blogs, news, Twitter) so the random
# stream is consumed identically on every run. The requested size is floored
# explicitly so it is always a whole number of lines.
sample_blogs <- sample(blogs, floor(length(blogs) * sample_fraction))
sample_news <- sample(news, floor(length(news) * sample_fraction))
sample_twitter <- sample(twitter, floor(length(twitter) * sample_fraction))

# Pool the three samples into a single character vector.
sample_data <- c(sample_blogs, sample_news, sample_twitter)
```
```{r clean-corpus}
# Build a corpus with one document per sampled line.
corpus <- VCorpus(VectorSource(sample_data))

# Lower-case first so the (lower-case) stop-word list matches.
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stop words BEFORE stripping punctuation: the English stop-word list
# contains apostrophe forms such as "don't" and "i'm", which no longer match
# once the apostrophes are gone and would otherwise survive as "dont" / "im".
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Collapse whitespace last, so the gaps left by the removals above are
# cleaned up as well.
corpus <- tm_map(corpus, stripWhitespace)
```
```{r word-frequencies}
# Documents-by-terms count matrix, stored sparsely.
dtm <- DocumentTermMatrix(corpus)

# Total count per term. The sums are taken directly on the sparse matrix:
# converting it with as.matrix() first would allocate a dense
# documents x terms matrix, which can exhaust memory for a corpus this size.
# slam is the sparse-matrix package tm itself is built on, so it is installed
# wherever tm is; it is called with `::` and does not need to be attached.
freq <- slam::col_sums(dtm)

# One row per term, then the 20 most frequent terms in descending order.
freq_df <- data.frame(word = names(freq), freq = freq)
top_words <- freq_df %>%
  arrange(desc(freq)) %>%
  head(20)
```
```{r top-words-plot}
# Horizontal bar chart of the most frequent words. reorder() sorts the bars
# by frequency; coord_flip() puts the words on the vertical axis so the
# labels stay readable. geom_col() draws bar heights straight from `freq`
# (equivalent to geom_bar(stat = "identity")).
ggplot(top_words, aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Word",
    y = "Frequency"
  )
```
```{r word-cloud}
# Word cloud of terms that occur at least 100 times, capped at 100 words.
# random.order = FALSE plots words in decreasing order of frequency.
# brewer.pal() comes from RColorBrewer (expected to be attached as a
# dependency of wordcloud).
wordcloud(
  words = freq_df$word,
  freq = freq_df$freq,
  min.freq = 100,
  max.words = 100,
  colors = brewer.pal(8, "Dark2"),
  random.order = FALSE
)
```
If you need a lighter version (in case RStudio Cloud crashes due to memory), I can simplify it for you.
Let me know once you’re ready to knit and publish!