The goal of this milestone report is to demonstrate that we are comfortable working with the data and on track to build the prediction algorithm.
library(stringi) # string statistics (character and word counts)
library(RWeka)   # n-gram tokenization
library(dplyr)   # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # plotting
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
blogs_numch   <- sum(nchar(blogs))
news_numch    <- sum(nchar(news))
twitter_numch <- sum(nchar(twitter))
blogs_nwd   <- sum(stri_count_words(blogs))
news_nwd    <- sum(stri_count_words(news))
twitter_nwd <- sum(stri_count_words(twitter))
line_blogs   <- length(blogs)
line_news    <- length(news)
line_twitter <- length(twitter)
a_res <- rbind(blogs_numch, news_numch, twitter_numch)
b_res <- rbind(blogs_nwd, news_nwd, twitter_nwd)
c_res <- rbind(line_blogs, line_news, line_twitter) # keep the same source order in every column
d_res <- as.data.frame(cbind(a_res, b_res, c_res))
names(d_res) <- c("nr of characters", "nr of words", "nr of lines")
rownames(d_res) <- c("blogs", "news", "twitter")
d_res
##         nr of characters nr of words nr of lines
## blogs          206824505    37546239      899288
## news           203223159    34762395     1010242
## twitter        162096031    30093372     2360148
set.seed(152)
blog_clean_en    <- iconv(blogs, "latin1", "ASCII", sub = "")
news_clean_en    <- iconv(news, "latin1", "ASCII", sub = "")
twitter_clean_en <- iconv(twitter, "latin1", "ASCII", sub = "")
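The tokenization below relies on an object `all_combi` that the code never defines; given the `set.seed(152)` call above, it was presumably a random sample drawn from the three cleaned corpora and combined. A minimal sketch that reconstructs it, assuming a 1% sample of each corpus (the sampling fraction and the `*_sample` names are our own choices, not taken from the original):

# draw a 1% random sample from each cleaned corpus (fraction is an assumption)
blog_sample    <- sample(blog_clean_en, floor(0.01 * length(blog_clean_en)))
news_sample    <- sample(news_clean_en, floor(0.01 * length(news_clean_en)))
twitter_sample <- sample(twitter_clean_en, floor(0.01 * length(twitter_clean_en)))
all_combi <- c(blog_sample, news_sample, twitter_sample) # combined sample used for tokenization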
In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n words; below we tokenize the combined sample into unigrams, bigrams, and trigrams.
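As a toy illustration (not part of the analysis), tokenizing a short sentence into bigrams with RWeka shows the idea:

NGramTokenizer("I love data science", Weka_control(min = 2, max = 2))
## [1] "I love"       "love data"    "data science"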
u_gram_word <- NGramTokenizer(all_combi, Weka_control(min = 1, max = 1)) # unigrams
b_gram_word <- NGramTokenizer(all_combi, Weka_control(min = 2, max = 2)) # bigrams
t_gram_word <- NGramTokenizer(all_combi, Weka_control(min = 3, max = 3)) # trigrams
u_allcomb <- data.frame(table(u_gram_word)) %>% arrange(desc(Freq))
b_allcomb <- data.frame(table(b_gram_word)) %>% arrange(desc(Freq))
t_allcomb <- data.frame(table(t_gram_word)) %>% arrange(desc(Freq))
df_n_gram <- as.data.frame(cbind(u_allcomb[1:15, ], b_allcomb[1:15, ], t_allcomb[1:15, ]))
names(df_n_gram)[c(2, 4, 6)] <- c("Freq1", "Freq2", "Freq3")
df_n_gram
## u_gram_word Freq1 b_gram_word Freq2 t_gram_word Freq3
## 1 the 42202 of the 4196 I don t 413
## 2 to 26660 in the 3825 one of the 320
## 3 a 22846 to the 1975 a lot of 269
## 4 and 22713 for the 1948 I can t 202
## 5 of 19832 on the 1919 to be a 182
## 6 I 18107 I m 1737 going to be 161
## 7 in 15418 to be 1535 I m not 159
## 8 is 10611 at the 1308 as well as 153
## 9 for 10471 and the 1177 the end of 151
## 10 that 10325 in a 1149 out of the 145
## 11 you 8987 don t 1085 I want to 142
## 12 it 8757 is a 1020 I didn t 141
## 13 s 8079 it s 1007 Thanks for the 137
## 14 on 7852 with the 1005 the U S 137
## 15 with 7063 for a 874 I ve been 130
### Creating And Displaying Unigrams
ggplot(df_n_gram, aes(x = reorder(u_gram_word, Freq1), y = Freq1)) +
  geom_bar(stat = "identity", fill = "#7F5217", color = "darkred") +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("15 Most Common Unigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
### Creating And Displaying Bigrams
ggplot(df_n_gram, aes(x = reorder(b_gram_word, Freq2), y = Freq2)) +
  geom_bar(stat = "identity", fill = "#FFFF00", color = "darkred") +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("15 Most Common Bigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
### Creating And Displaying Trigrams
ggplot(df_n_gram, aes(x = reorder(t_gram_word, Freq3), y = Freq3)) +
  geom_bar(stat = "identity", fill = "#4AA02C", color = "darkred") +
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("15 Most Common Trigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Conclusion & Next Steps
After exploring the data, we have the information needed to plan the next steps: build a predictive n-gram algorithm, wrap it in a Shiny app that suggests the next word as the user types, and deploy the app to the shinyapps.io server.
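As a rough sketch of how the frequency tables above could feed that algorithm, the following illustrates a simple backoff lookup: try the trigram table on the last two typed words, fall back to the bigram table on the last word, then to the most frequent unigram. This is a minimal illustration, not the final algorithm; `predict_next` is a hypothetical helper and assumes the full `t_allcomb`, `b_allcomb`, and `u_allcomb` tables (already sorted by descending frequency) are available.

predict_next <- function(phrase) {
  # keep the last two words of the typed phrase (assumes plain words, no regex metacharacters)
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  if (length(words) == 2) {
    # look for trigrams that start with the last two words
    hits <- t_allcomb[grepl(paste0("^", words[1], " ", words[2], " "),
                            tolower(t_allcomb$t_gram_word)), ]
    if (nrow(hits) > 0)
      return(tail(strsplit(as.character(hits$t_gram_word[1]), " ")[[1]], 1))
  }
  # back off to bigrams that start with the last word
  hits <- b_allcomb[grepl(paste0("^", tail(words, 1), " "),
                          tolower(b_allcomb$b_gram_word)), ]
  if (nrow(hits) > 0)
    return(tail(strsplit(as.character(hits$b_gram_word[1]), " ")[[1]], 1))
  as.character(u_allcomb$u_gram_word[1]) # last resort: the most common word overall
}
predict_next("one of") # should return "the", given the trigram table above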