This is the JHU Data Science Capstone milestone report. We work on the SwiftKey datasets, which comprise blogs, news articles, and Twitter feeds in three languages. For this week, the task is to perform an exhaustive exploratory data analysis and to model the relationships between words, specifically by building an n-gram model that predicts the next word and can also handle unseen n-grams. The goal is: “… to minimize both the size and runtime of the model in order to provide a reasonable experience to the user.”
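The code in this report assumes the following packages are installed and loaded; the list is inferred from the functions used below, so the original session may have loaded them differently (e.g. via library(tidyverse)).
library(tokenizers)   #tokenize_paragraphs(), tokenize_word_stems(), tokenize_ngrams()
library(stopwords)    #English stop word list
library(tm)           #VCorpus(), DocumentTermMatrix(), findFreqTerms(), weightTfIdf()
library(dplyr)        #mutate(), top_n(), the pipe %>%
library(forcats)      #fct_reorder()
library(ggplot2)      #bar charts
library(readr)        #read_rds()
library(wordcloud)    #wordcloud()
library(RColorBrewer) #brewer.pal()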
setwd("/Volumes/Data/Documents HD (Samsung)/Documents/PhD/R training/Data-Science-specialization/Capstone Project/en_US")
EN_blog <- readLines("en_US.blogs.txt")
EN_news <- readLines("en_US.news.txt")
EN_twitter <- readLines("en_US.twitter.txt")
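Before any tokenizing, a quick size check gives a sense of how heavy the full files are. corpus_stats is a small illustrative helper, not part of the saved pipeline, and the exact counts depend on the downloaded version of the data.
corpus_stats <- function(x) c(lines = length(x),
                              words = sum(lengths(strsplit(x, "\\s+"))),
                              max_line_bytes = max(nchar(x, type = "bytes")))
sapply(list(blog = EN_blog, news = EN_news, twitter = EN_twitter), corpus_stats)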
#A. EN_blog
EN_blog_cleaned <- tokenize_paragraphs(EN_blog) #Split each blog entry into paragraphs
EN_blog_cleaned<- tokenize_word_stems(EN_blog_cleaned, stopwords = stopwords::stopwords("en")) #Stem the words and drop English stop words
saveRDS(EN_blog_cleaned, file = "EN_blog_cleaned.rds")
EN_blog_ngram<- tokenize_ngrams(EN_blog, n=3, n_min=2, stopwords = stopwords::stopwords("en")) #Extract 2- and 3-grams for the prediction model
EN_blog_corp<- VCorpus(VectorSource(EN_blog_cleaned))
saveRDS(EN_blog_ngram, file = "EN_blog_ngram.rds")
saveRDS(EN_blog_corp, file = "EN_blog_corp.rds")
EN_blog_tm<- DocumentTermMatrix(EN_blog_corp, control = list(tolower=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, stripWhitespace=TRUE)) #Build the document-term matrix
saveRDS(EN_blog_tm, file = "EN_blog_tm.rds")
#B. EN_news
EN_news_cleaned <- tokenize_paragraphs(EN_news)
EN_news_cleaned<- tokenize_word_stems(EN_news_cleaned, stopwords = stopwords::stopwords("en"))
saveRDS(EN_news_cleaned, file = "EN_news_cleaned.rds")
EN_news_ngram<- tokenize_ngrams(EN_news, n=3, n_min=2, stopwords = stopwords::stopwords("en"))
EN_news_corp<- VCorpus(VectorSource(EN_news_cleaned))
saveRDS(EN_blog_ngram, file = "EN_news_ngram.rds")
saveRDS(EN_news_corp, file = "EN_news_corp.rds")
EN_news_tm<- DocumentTermMatrix(EN_news_corp, control = list(tolower=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, stripWhitespace=TRUE))
saveRDS(EN_news_tm, file = "EN_news_tm.rds")
#C. EN_twitter
EN_twitter_cleaned <- tokenize_paragraphs(EN_twitter)
EN_twitter_cleaned<- tokenize_word_stems(EN_twitter_cleaned, stopwords = stopwords::stopwords("en"))
saveRDS(EN_twitter_cleaned, file = "EN_twitter_cleaned.rds")
EN_twitter_ngram<- tokenize_ngrams(EN_twitter, n=3, n_min=2, stopwords = stopwords::stopwords("en"))
saveRDS(EN_twitter_ngram, file = "EN_twitter_ngram.rds")
EN_twitter_corp<- VCorpus(VectorSource(EN_twitter_cleaned))
saveRDS(EN_twitter_corp, file = "EN_twitter_corp.rds")
EN_twitter_tm<- DocumentTermMatrix(EN_twitter_corp, control = list(tolower=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, stripWhitespace=TRUE))
saveRDS(EN_twitter_tm, file = "EN_twitter_tm.rds")
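As an optional sanity check (not saved to disk), the dimensions and a small corner of each document-term matrix can be inspected before filtering.
dim(EN_blog_tm); dim(EN_news_tm); dim(EN_twitter_tm) #Documents x terms for each source
inspect(EN_blog_tm[1:5, 1:5]) #tm summary of a 5 x 5 corner of the blog DTM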
# A.EN_Blog
EN_blog_freq<-findFreqTerms(EN_blog_tm, 500) #Keep terms that appear at least 500 times
EN_blog_tm2<- EN_blog_tm[,EN_blog_freq] #Create DTM with the most frequent terms
saveRDS(EN_blog_tm2, file = "EN_blog_tm2.rds")
#B. EN_news
EN_news_freq<-findFreqTerms(EN_news_tm, 500)
EN_news_tm2<- EN_news_tm[,EN_news_freq] #Create DTM with the most frequent terms
saveRDS(EN_news_tm2, file = "EN_news_tm2.rds")
#C EN_Twitter
EN_twitter_freq<-findFreqTerms(EN_twitter_tm, 500)
EN_twitter_tm2<- EN_twitter_tm[,EN_twitter_freq] #Create DTM with the most frequent terms
saveRDS(EN_twitter_tm2, file = "EN_twitter_tm2.rds")
#A. EN_Blog
EN_blog_tfidf <- weightTfIdf(EN_blog_tm2,normalize = FALSE) #Re-weight the frequent-term DTM by TF-IDF
EN_blog_tfidf2<- as.matrix(EN_blog_tfidf)
EN_blog_tfidf3<- sort(colSums(EN_blog_tfidf2)) #Total TF-IDF score per term, sorted in increasing order
saveRDS(EN_blog_tfidf3, file = "EN_blog_tfidf3.rds")
#B. EN_news
EN_news_tfidf <- weightTfIdf(EN_news_tm2,normalize = FALSE)
EN_news_tfidf2<- as.matrix(EN_news_tfidf)
EN_news_tfidf3<- sort(colSums(EN_news_tfidf2))
saveRDS(EN_news_tfidf3, file = "EN_news_tfidf3.rds")
#C EN_Twitter
EN_twitter_tfidf <- weightTfIdf(EN_twitter_tm2,normalize = FALSE) #Use the frequent-term DTM, as for the blog and news sets
EN_twitter_tfidf2<- as.matrix(EN_twitter_tfidf)
EN_twitter_tfidf3<- sort(colSums(EN_twitter_tfidf2))
saveRDS(EN_twitter_tfidf3, file = "EN_twitter_tfidf3.rds")
# A.EN_Blog
setwd("~/Desktop/Capstone Project/en_US")
EN_blog_tfidf3 <- read_rds("EN_blog_tfidf3.rds")
EN_blog_tfidf4<- data.frame(words=names(EN_blog_tfidf3), EN_blog_tfidf3)
EN_blog_tfidf4 %>% mutate(words = fct_reorder(words,EN_blog_tfidf3)) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(words, EN_blog_tfidf3, fill= words, label=round(EN_blog_tfidf3,0))) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Words", y = "Number") +
geom_label(aes(fill = words),colour = "white", fontface = "bold", show.legend = FALSE) +
ggtitle("US English Blog") +
coord_flip() + theme_classic()
wordcloud(tail(EN_blog_tfidf4$words,30), tail(EN_blog_tfidf4$EN_blog_tfidf3, 30), random.order = FALSE, colors=brewer.pal(8,"Dark2"))
title("US English Blog", col.main = "grey14")
#B.EN_news
EN_news_tfidf3 <- read_rds("EN_news_tfidf3.rds")
EN_news_tfidf4<- data.frame(words=names(EN_news_tfidf3), EN_news_tfidf3)
EN_news_tfidf4 %>% mutate(words = fct_reorder(words,EN_news_tfidf3)) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(words, EN_news_tfidf3, fill= words, label=round(EN_news_tfidf3,0))) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Words", y = "Number") +
geom_label(aes(fill = words),colour = "white", fontface = "bold", show.legend = FALSE) +
ggtitle("US English News") +
coord_flip() + theme_classic()
wordcloud(tail(EN_news_tfidf4$words,30), tail(EN_news_tfidf4$EN_news_tfidf3, 30), random.order = FALSE, colors=brewer.pal(8,"Dark2"))
title("US English News", col.main = "grey14")
#C.EN_twitter
EN_twitter_tfidf3 <- read_rds("EN_twitter_tfidf3.rds")
EN_twitter_tfidf4<- data.frame(words=names(EN_twitter_tfidf3), EN_twitter_tfidf3)
EN_twitter_tfidf4 %>% mutate(words = fct_reorder(words,EN_twitter_tfidf3)) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(words, EN_twitter_tfidf3, fill= words, label=round(EN_twitter_tfidf3,0))) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Words", y = "Number") +
geom_label(aes(fill = words),colour = "white", fontface = "bold", show.legend = FALSE) +
ggtitle("US English Twitter") +
coord_flip() + theme_classic()
wordcloud(tail(EN_twitter_tfidf4$words,30), tail(EN_twitter_tfidf4$EN_twitter_tfidf3, 30), random.order = FALSE, colors=brewer.pal(8,"Dark2"))
title("US English twitter", col.main = "grey14")
In order to achieve an extensive exploratory analysis, the complete datasets were analysed, which required considerable computation time and memory. On this basis, however, a more focused analysis can now be performed. The plan is to build a model that predicts the next word from the previous two or three words in the same sentence. For this we could use a traditional n-gram system, or we could feed the information into a neural network. The plan for the next update is: 1. to develop a prediction model using traditional ML methods; 2. to try to develop a model using deep learning methods for word prediction.
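As a first illustration of the n-gram approach, the sketch below builds a lookup table from the saved blog n-grams and backs off to a shorter prefix when a phrase has not been seen. The names ngram_tbl and predict_next are illustrative only; the final model would combine counts from all three sources and use a principled smoothing or backoff scheme (e.g. Katz or "stupid backoff") rather than this simplified fallback.
library(dplyr)
library(stringr)

ngram_tbl <- tibble(ngram = unlist(readRDS("EN_blog_ngram.rds"))) %>%
  count(ngram, sort = TRUE) %>%
  mutate(prefix = word(ngram, 1, -2),   #All words of the n-gram except the last
         next_word = word(ngram, -1))   #The last word, i.e. the candidate prediction

predict_next <- function(input, k = 3) {
  input <- tolower(str_squish(input))
  hits <- filter(ngram_tbl, prefix == input)
  if (nrow(hits) == 0) {                 #Unseen prefix: back off by dropping its first word
    shorter <- word(input, 2, -1)
    if (!is.na(shorter) && shorter != input) return(predict_next(shorter, k))
    return(head(ngram_tbl$next_word, k)) #Last resort: most frequent continuations overall
  }
  head(hits$next_word, k)
}

predict_next("happy new") #Should suggest e.g. "year" if the phrase occurs in the corpus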