Introduction

The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm.

Loading libraries and reading the data

library(stringi) # stats files
library(RWeka)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read each corpus file into a character vector, one element per line.
# Strings are marked as UTF-8 and readLines() warnings are suppressed.
blogs <- readLines(
  con = "en_US.blogs.txt",
  encoding = "UTF-8",
  warn = FALSE
)
news <- readLines(
  con = "en_US.news.txt",
  encoding = "UTF-8",
  warn = FALSE
)
twitter <- readLines(
  con = "en_US.twitter.txt",
  encoding = "UTF-8",
  warn = FALSE
)

Exploratory data analysis

Number of characters

# Total number of characters in each corpus: per-line character counts,
# summed over all lines.
blogs_numch <- sum(nchar(blogs, type = "chars"))
news__numch <- sum(nchar(news, type = "chars"))
twitter_numch <- sum(nchar(twitter, type = "chars"))

Counting the Words (num.words)

# Total number of words in each corpus: stringi counts the words on every
# line, and the per-line counts are then summed.
blogs_nwd <- blogs %>%
  stri_count_words() %>%
  sum()
news_nwd <- news %>%
  stri_count_words() %>%
  sum()
twitter_nwd <- twitter %>%
  stri_count_words() %>%
  sum()

Number of lines

# Number of lines in each corpus (each vector holds one element per line).
line_blogs <- length(blogs)
line_news <- length(news)
line_twitter <- length(twitter)

Table of the number of characters, words, and lines

# Summary table: one row per corpus, one column per metric.
# Every metric is stacked in the same corpus order (blogs, news, twitter) and
# the row labels use that order too, so each value sits on the row of the
# corpus it was measured on. Previously the line counts and the row labels
# used news/twitter/blogs while the character and word counts used
# blogs/news/twitter, which attached those two columns to the wrong corpus.
a_res <- rbind(blogs_numch, news__numch, twitter_numch)
b_res <- rbind(blogs_nwd, news_nwd, twitter_nwd)
c_res <- rbind(line_blogs, line_news, line_twitter)
d_res <- as.data.frame(cbind(a_res, b_res, c_res))
names(d_res) <- c("nr of characters", "nr of words", "nr of line")
rownames(d_res) <- c("blogs", "news", "twitter")
d_res
##         nr of characters nr of words nr of line
## blogs          206824505    37546239     899288
## news           203223159    34762395    1010242
## twitter        162096031    30093372    2360148

Use only 1% of the data because the files are too big

Remove non-English characters

# Fix the random seed for the steps that follow (nothing in this block draws
# random numbers itself).
set.seed(152)

# Convert every line to plain ASCII. Input bytes are interpreted as latin1 and
# anything that has no ASCII equivalent is replaced by "" (i.e. removed).
# NOTE(review): the files are read with encoding = "UTF-8" but converted here
# from "latin1" -- confirm that this source encoding is intended.
blog_clean_en <- iconv(blogs, from = "latin1", to = "ASCII", sub = "")
news_clean_en <- iconv(news, from = "latin1", to = "ASCII", sub = "")
twitter_clean_en <- iconv(twitter, from = "latin1", to = "ASCII", sub = "")

Sampling by getting only 1% of data

Build N-Grams

In Natural Language Processing (NLP), an N-gram is a sequence of N consecutive words.

# `all_combi` (the sampled text fed to the tokenizer) is not defined anywhere
# in the visible document, so the tokenizer calls below would stop with
# "object 'all_combi' not found". Build it here when it does not exist yet:
# a 1% random sample of each cleaned corpus, concatenated.
# NOTE(review): reconstructed from the prose ("only 1% of data") -- confirm it
# matches the sampling originally used, since the printed frequencies depend
# on it. Reproducibility relies on the set.seed(152) call issued earlier.
if (!exists("all_combi")) {
  sample_fraction <- 0.01
  take_sample <- function(lines) {
    sample(lines, size = round(length(lines) * sample_fraction))
  }
  all_combi <- c(
    take_sample(blog_clean_en),
    take_sample(news_clean_en),
    take_sample(twitter_clean_en)
  )
}

# Tokenize the sample into unigrams, bigrams and trigrams; min == max pins
# each call to a single n-gram length.
u_gram_word <- NGramTokenizer(all_combi, Weka_control(min = 1, max = 1))
b_gram_word <- NGramTokenizer(all_combi, Weka_control(min = 2, max = 2))
t_gram_word <- NGramTokenizer(all_combi, Weka_control(min = 3, max = 3))

# Frequency table for each n-gram length: count every distinct n-gram, turn
# the counts into a data frame (n-gram column + Freq column) and sort it with
# the most frequent n-grams first.
u_allcomb <- arrange(data.frame(table(u_gram_word)), desc(Freq))
b_allcomb <- arrange(data.frame(table(b_gram_word)), desc(Freq))
t_allcomb <- arrange(data.frame(table(t_gram_word)), desc(Freq))

# Put the 15 most frequent unigrams, bigrams and trigrams side by side in a
# single data frame.
df_n_gram <- as.data.frame(
  cbind(
    u_allcomb[seq_len(15), ],
    b_allcomb[seq_len(15), ],
    t_allcomb[seq_len(15), ]
  )
)

# The three count columns (positions 2, 4 and 6) are all called "Freq" after
# binding; give each one a distinct name.
names(df_n_gram)[seq(2, 6, by = 2)] <- c("Freq1", "Freq2", "Freq3")
df_n_gram
##    u_gram_word Freq1 b_gram_word Freq2    t_gram_word Freq3
## 1          the 42202      of the  4196        I don t   413
## 2           to 26660      in the  3825     one of the   320
## 3            a 22846      to the  1975       a lot of   269
## 4          and 22713     for the  1948        I can t   202
## 5           of 19832      on the  1919        to be a   182
## 6            I 18107         I m  1737    going to be   161
## 7           in 15418       to be  1535        I m not   159
## 8           is 10611      at the  1308     as well as   153
## 9          for 10471     and the  1177     the end of   151
## 10        that 10325        in a  1149     out of the   145
## 11         you  8987       don t  1085      I want to   142
## 12          it  8757        is a  1020       I didn t   141
## 13           s  8079        it s  1007 Thanks for the   137
## 14          on  7852    with the  1005        the U S   137
## 15        with  7063       for a   874      I ve been   130

Plot (Graphs & Visualizations)

Creating And Displaying Unigram

# Bar chart of the 15 most frequent unigrams; bars are ordered by frequency
# and the x-axis labels are rotated 90 degrees.
ggplot(
  data = df_n_gram,
  mapping = aes(x = reorder(u_gram_word, Freq1), y = Freq1)
) +
  geom_bar(stat = "Identity", fill = "#7F5217", color = "darkred") +
  labs(
    x = "Unigrams",
    y = "Frequency",
    title = "Most common 15 Unigrams"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

### Creating And Displaying Bigram

# Bar chart of the 15 most frequent bigrams; bars are ordered by frequency
# and the x-axis labels are rotated 90 degrees.
ggplot(
  data = df_n_gram,
  mapping = aes(x = reorder(b_gram_word, Freq2), y = Freq2)
) +
  geom_bar(stat = "Identity", fill = "#FFFF00", color = "darkred") +
  labs(
    x = "Bigram",
    y = "Frequency",
    title = "Most common 15 Bigram"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

### Creating And Displaying Trigram

# Bar chart of the 15 most frequent trigrams; bars are ordered by frequency
# and the x-axis labels are rotated 90 degrees.
ggplot(
  data = df_n_gram,
  mapping = aes(x = reorder(t_gram_word, Freq3), y = Freq3)
) +
  geom_bar(stat = "Identity", fill = "#4AA02C", color = "darkred") +
  labs(
    x = "Trigram",
    y = "Frequency",
    title = "Most common 15 Trigram"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

## Conclusion & Next Steps

After exploring the data, we found some important information that leads to recommendations for the next steps: build a predictive algorithm, create a Shiny app that predicts the next word as the user types, and deploy the app to the shinyapps.io server.