The goal of this project is to demonstrate that you have become familiar with the data and are on track to create the prediction algorithm. This report presents the exploratory analysis and explains my goals for the eventual app and algorithm.
setwd('C:/Users/mmartins/Downloads/Coursera-SwiftKey/final/en_US/Capstone')
destfile <- "./Coursera-SwiftKey.zip"
destfile
## [1] "./Coursera-SwiftKey.zip"
# Download and unzip the dataset only if it is not already present
if (!file.exists(destfile)) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url, destfile, method = "curl")
  unzip(destfile)
}
# Read the three English corpora into memory
news    <- readLines("en_US.news.txt",    encoding = "UTF-8", warn = FALSE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", warn = FALSE)
blogs   <- readLines("en_US.blogs.txt",   encoding = "UTF-8", warn = FALSE)
library(ngram)
# Line and word counts for each corpus
line_news    <- length(news)
line_twitter <- length(twitter)
line_blogs   <- length(blogs)
wc_news    <- wordcount(news)
wc_twitter <- wordcount(twitter)
wc_blogs   <- wordcount(blogs)
lines <- rbind(line_news, line_twitter, line_blogs)
words <- rbind(wc_news, wc_twitter, wc_blogs)
corpus_summary <- as.data.frame(cbind(lines, words))
names(corpus_summary)    <- c("nr of lines", "nr of words")
rownames(corpus_summary) <- c("news", "twitter", "blogs")
corpus_summary
## nr of lines nr of words
## news 77259 2643969
## twitter 2360148 30373543
## blogs 899288 37334131
The full files are too large to process comfortably, so a 1% sample is taken from each corpus and the samples are combined.
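As a rough check of the scale (a minimal sketch using base R's object.size(); the exact values depend on the session), the in-memory size of each corpus can be inspected:

# Approximate in-memory size of each corpus, in megabytes
format(object.size(blogs),   units = "MB")
format(object.size(news),    units = "MB")
format(object.size(twitter), units = "MB")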
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(11000)
c_blogs   <- sample(blogs, length(blogs) * 0.01)
c_news    <- sample(news, length(news) * 0.01)
c_twitter <- sample(twitter, length(twitter) * 0.01)
c_combi   <- c(c_blogs, c_news, c_twitter)
Next, 1-, 2-, and 3-grams (unigrams, bigrams, and trigrams) are extracted from the combined sample using the RWeka tokenizer.
library('rJava')
library('RWeka')
unigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 1, max = 1))
bigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 2, max = 2))
trigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 3, max = 3))
library(magrittr)
# Frequency tables, sorted by descending frequency
unigram_combi <- data.frame(table(unigram_combi)) %>% arrange(desc(Freq))
bigram_combi  <- data.frame(table(bigram_combi))  %>% arrange(desc(Freq))
trigram_combi <- data.frame(table(trigram_combi)) %>% arrange(desc(Freq))
df_ngram <- as.data.frame(cbind(unigram_combi[1:15, ], bigram_combi[1:15, ], trigram_combi[1:15, ]))
names(df_ngram)[c(2, 4, 6)] <- c("Freq1", "Freq2", "Freq3")
df_ngram
## unigram_combi Freq1 bigram_combi Freq2 trigram_combi Freq3
## 1 the 26535 of the 2526 I don t 358
## 2 to 18621 in the 2364 I can t 211
## 3 I 16354 I m 1547 a lot of 181
## 4 a 14869 for the 1376 Thanks for the 180
## 5 and 14792 to the 1327 one of the 168
## 6 of 12985 on the 1214 I m not 159
## 7 in 9583 to be 1152 to be a 148
## 8 you 7957 don t 872 going to be 124
## 9 is 7906 at the 860 I want to 123
## 10 for 7372 and the 736 be able to 121
## 11 that 7121 I have 725 don t know 107
## 12 it 6911 is a 723 I have a 106
## 13 on 5445 it s 717 I didn t 104
## 14 my 4975 I was 699 the end of 102
## 15 s 4680 in a 691 I ve been 101
library(ggplot2)
ggplot(df_ngram, aes(x = reorder(unigram_combi, Freq1), y = Freq1)) +
  geom_bar(stat = "identity", fill = "#AAAAAA", color = "darkred") +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("15 most common unigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(df_ngram, aes(x = reorder(bigram_combi, Freq2), y = Freq2)) +
  geom_bar(stat = "identity", fill = "#AAAAAA", color = "blue") +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("15 most common bigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(df_ngram, aes(x = reorder(trigram_combi, Freq3), y = Freq3)) +
  geom_bar(stat = "identity", fill = "#AAAAAA", color = "green") +
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("15 most common trigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
We now have some interesting findings, so it's time to move on to training and building models. A first sketch of a prediction function follows.
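As a first illustration (a minimal sketch, not the final algorithm), a simple backoff predictor can be built directly on the unigram_combi, bigram_combi, and trigram_combi tables above: look for the most frequent trigram starting with the last two typed words, fall back to bigrams on the last word, and finally to the most frequent word overall. The function name predict_next_word is my own placeholder.

predict_next_word <- function(phrase) {
  words <- tolower(strsplit(trimws(phrase), "\\s+")[[1]])
  n <- length(words)
  # Try trigrams first: most frequent trigram starting with the last two words.
  # This relies on the tables being sorted by descending Freq (done above).
  if (n >= 2) {
    prefix <- paste0(words[n - 1], " ", words[n], " ")
    tri <- as.character(trigram_combi$trigram_combi)
    hit <- tri[startsWith(tolower(tri), prefix)][1]
    if (!is.na(hit)) return(tail(strsplit(hit, " ")[[1]], 1))
  }
  # Back off to bigrams on the last word only
  if (n >= 1) {
    bi <- as.character(bigram_combi$bigram_combi)
    hit <- bi[startsWith(tolower(bi), paste0(words[n], " "))][1]
    if (!is.na(hit)) return(tail(strsplit(hit, " ")[[1]], 1))
  }
  # Last resort: the most frequent unigram
  as.character(unigram_combi$unigram_combi[1])
}
predict_next_word("one of")  # should give "the", per the trigram table above

A production version would need proper smoothing (e.g. Kneser-Ney or a weighted "stupid backoff") and a much larger sample, but this shows the shape of the algorithm.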
Ideas: use a text input box as the user interface of the Shiny app.
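One way that interface could look (a minimal sketch; predict_next_word() is the placeholder function sketched above):

library(shiny)

# A text box plus a text output showing the predicted next word
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Type a phrase:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    req(input$phrase)                 # wait until something is typed
    predict_next_word(input$phrase)   # suggest the most likely next word
  })
}
shinyApp(ui = ui, server = server)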
Next steps:
1. Build the predictive algorithm, refining the backoff sketch above.
2. Build a Shiny app that suggests the most likely next word after a phrase is typed.
3. Prepare a pitch about the app and publish it on the shinyapps.io server.
It is important to note that each of these steps matters, and each needs to be re-evaluated continuously to arrive at a genuinely working and accurate ML model for our predictive text app.