Introduction

The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. This report will show the exploratory analysis and will explain my goals for the eventual app and algorithm.

Downloading and reading files

setwd('C:/Users/mmartins/Downloads/Coursera-SwiftKey/final/en_US/Capstone')
destfile = "./Coursera-SwiftKey.zip"
destfile
## [1] "./Coursera-SwiftKey.zip"
if(!file.exists(destfile)){
  url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  file <- basename(url)
  download.file(url, file, method="curl")
  unzip(file)
}
news <- readLines("en_US.news.txt", encoding = 'UTF-8',warn = FALSE)
twitter <- readLines("en_US.twitter.txt", encoding = 'UTF-8',warn = FALSE)
blogs <- readLines("en_US.blogs.txt", encoding = 'UTF-8',warn = FALSE)

Exploratory data analysis

library(ngram)
line_news<-length(news)
line_twitter<-length(twitter)
line_blogs<-length(blogs)

wc_news<-wordcount(news)
wc_twitter<-wordcount(twitter)
wc_blogs<-wordcount(blogs)

a<-rbind(line_news,line_twitter,line_blogs)
b<-rbind(wc_news,wc_twitter,wc_blogs)
c<-as.data.frame(cbind(a,b))
names(c)<-c("nr of lines","nr of words")
rownames(c)<-c("news","twitter","blogs")
c
##         nr of lines nr of words
## news          77259     2643969
## twitter     2360148    30373543
## blogs        899288    37334131

Files are too large to process. Therefore, 1% sample is taken of each and then combine the files.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
set.seed(11000)
c_blogs <- sample(blogs, length(blogs)*0.01)
c_news <- sample(news, length(news)*0.01)
c_twitter <- sample(twitter, length(twitter)*0.01)
c_combi=c(c_blogs,c_news,c_twitter)

1,2,3 ngrams

library('rJava')
## java.home option:
## JAVA_HOME environment variable: C:/Users/mmartins/Downloads/jdk-18_linux-aarch64_bin.tar.gz
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.
library('RWeka')
unigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 1, max = 1))
bigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 2, max = 2)) 
trigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 3, max = 3)) 

library(magrittr) 

unigram_combi<-data.frame(table(unigram_combi))%>%arrange(desc(Freq))
bigram_combi<-data.frame(table(bigram_combi))%>%arrange(desc(Freq))
trigram_combi<-data.frame(table(trigram_combi))%>%arrange(desc(Freq))

df_ngram<-as.data.frame(cbind(unigram_combi[1:15,],bigram_combi[1:15,],trigram_combi[1:15,]))
names(df_ngram)[c(2,4,6)]<-c("Freq1","Freq2","Freq3")
df_ngram
##    unigram_combi Freq1 bigram_combi Freq2  trigram_combi Freq3
## 1            the 26535       of the  2526        I don t   358
## 2             to 18621       in the  2364        I can t   211
## 3              I 16354          I m  1547       a lot of   181
## 4              a 14869      for the  1376 Thanks for the   180
## 5            and 14792       to the  1327     one of the   168
## 6             of 12985       on the  1214        I m not   159
## 7             in  9583        to be  1152        to be a   148
## 8            you  7957        don t   872    going to be   124
## 9             is  7906       at the   860      I want to   123
## 10           for  7372      and the   736     be able to   121
## 11          that  7121       I have   725     don t know   107
## 12            it  6911         is a   723       I have a   106
## 13            on  5445         it s   717       I didn t   104
## 14            my  4975        I was   699     the end of   102
## 15             s  4680         in a   691      I ve been   101

Plots

library(ggplot2)
ggplot(df_ngram, aes(x=reorder(unigram_combi,Freq1), y=(Freq1))) +
  geom_bar(stat="Identity", fill="#AAAAAA",color="darkred")+
  xlab("Unigrams") + ylab("Frequency")+
  ggtitle("Most common 15 Unigrams")+
  theme(axis.text.x=element_text(angle=90, hjust=1))

ggplot(df_ngram, aes(x=reorder(bigram_combi,Freq2), y=(Freq2))) +
  geom_bar(stat="Identity", fill="#AAAAAA", color="blue")+
  xlab("Bigrams") + ylab("Frequency")+
  ggtitle("Most common 15 Bigrams")+
  theme(axis.text.x=element_text(angle=90, hjust=1))

ggplot(df_ngram, aes(x=reorder(trigram_combi,Freq3), y=(Freq3))) +
  geom_bar(stat="Identity", fill="#AAAAAA", color="green")+
  xlab("Trigrams") + ylab("Frequency")+
  ggtitle("Most common 15 Trigrams")+
  theme(axis.text.x=element_text(angle=90, hjust=1))

Conclusions

We now have some interesting findings so its time to move to train and create models.

Ideas: Use a text input box as the user interface of the Shiny app.

Next Steps: 1.Build a predictive algorithm 2.Build a a Shiny app, that suggest the most likely next word after a phrase is typed 3.Prepare a pitch about the app and publish it at “shinyapps.io” server.

It is important to note that each of the steps are important and each steps need to be re-evaluated continuosly to get really working and accurate ML model for our predictive text app.