The goal of this project is to show that we have become familiar with the data and are on track to create the prediction algorithm. The motivation for this project is to:
1. Demonstrate that the data have been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Outline the plan for creating a prediction algorithm and Shiny app.
## Datafile
DATAFILE <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## Download the file for the first time
if(!file.exists("./Coursera-SwiftKey.zip")) {
download.file(DATAFILE, destfile="./Coursera-SwiftKey.zip", method="curl")
## Unzip file Coursera-SwiftKey.zip
unzip("./Coursera-SwiftKey.zip")
}
## setting the working directory
setwd("./final")
## Listing all the folders
list.dirs('.', recursive=FALSE)
## [1] "./de_DE" "./en_US" "./fi_FI" "./ru_RU"
So we have four directories with data files for German (de_DE), English (en_US), Finnish (fi_FI), and Russian (ru_RU).
As English is my second language, I will focus on the content of the en_US folder to build the prediction model.
## Reading English files in the folder en_US
setwd("./final/en_US")
list.files('.', recursive=FALSE)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
blogs <- readLines("./en_US.blogs.txt", encoding = 'UTF-8', skipNul=TRUE, warn=FALSE)
news <- readLines("./en_US.news.txt", encoding = 'UTF-8', skipNul=TRUE, warn=FALSE)
twitter <- readLines("./en_US.twitter.txt", encoding = 'UTF-8', skipNul=TRUE, warn=FALSE)
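Note: on some platforms readLines() stops early in en_US.news.txt because of an embedded control character, which would explain a comparatively low news line count in the summary below. A possible workaround (a sketch, only needed if the count looks too low) is to read the file through a binary connection:
## Re-read en_US.news.txt through a binary connection so readLines() does not
## stop at an embedded control character (assumption: this is the cause of a low line count)
con <- file("./en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(con)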
## Gathering basic information about the files in the folder en_US
## File sizes in megabytes
A1=file.info("./en_US.blogs.txt")$size/(1024^2)
A2=file.info("./en_US.news.txt")$size/(1024^2)
A3=file.info("./en_US.twitter.txt")$size/(1024^2)
## Line counts
B1=length(blogs)
B2=length(news)
B3=length(twitter)
## Approximate word counts (one more than the number of non-word gaps per line)
C1=sum(sapply(gregexpr("\\W+", blogs), length) + 1)
C2=sum(sapply(gregexpr("\\W+", news), length) + 1)
C3=sum(sapply(gregexpr("\\W+", twitter), length) + 1)
## Summary Statistics
FilesInfo=matrix(c(A1,A2,A3,B1,B2,B3,C1,C2,C3),nrow=3,ncol=3)
colnames(FilesInfo) = c('File Size (MB)','Line Count','Word Count')
rownames(FilesInfo) = c('Blogs','News','Twitter')
FilesInfo
##         File Size (MB) Line Count Word Count
## Blogs         200.4242     899288   39121566
## News          196.2775      77259    2825330
## Twitter       159.3641    2360148   32793432
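The word counts above come from a simple regular-expression split, so they are only rough approximations. As a sketch (assuming the stringi package is installed), they could be cross-checked with a dedicated word counter:
## Optional cross-check of the approximate word counts with stringi
library(stringi)
c(Blogs = sum(stri_count_words(blogs)),
  News = sum(stri_count_words(news)),
  Twitter = sum(stri_count_words(twitter)))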
library(NLP)
library(tm)
## Warning: package 'tm' was built under R version 3.3.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.1
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.1
library(SnowballC)
library(openNLP)
## Taking samples from the files as a training set
set.seed(10)
Blogs_S <- blogs[sample(1:length(blogs),15000)]
New_S <- news[sample(1:length(news),15000)]
Twitter_S <- twitter[sample(1:length(twitter),15000)]
TrainingSet <- list(Blogs_S,New_S,Twitter_S)
rm(Blogs_S,Twitter_S,New_S)
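For context, 15,000 lines is only a small fraction of each file. A quick sketch of the sampling rate, using the full objects that are still in memory:
## Percentage of each file covered by the 15,000-line samples
round(15000 / c(Blogs = length(blogs), News = length(news), Twitter = length(twitter)) * 100, 2)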
## Preparing Corpus file
Corpus_Data<- Corpus(VectorSource(TrainingSet))
rm(TrainingSet)
## Cleaning the Corpus file
## Lower-case first so that later steps see consistent text
Corpus_Data <- tm_map(Corpus_Data, content_transformer(tolower))
## Drop non-ASCII characters
Corpus_Data <- tm_map(Corpus_Data, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub=" ")))
## Replace slashes, @ and | with spaces before stripping punctuation,
## so that joined tokens like "and/or" are split rather than merged
SpaceRm <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus_Data <- tm_map(Corpus_Data, SpaceRm, "/|@|\\|")
Corpus_Data <- tm_map(Corpus_Data, removeNumbers)
Corpus_Data <- tm_map(Corpus_Data, removePunctuation)
## Remove English stop words (the standard and SMART lists)
Corpus_Data <- tm_map(Corpus_Data, removeWords, stopwords("english"))
Corpus_Data <- tm_map(Corpus_Data, removeWords, stopwords("SMART"))
## Collapse extra whitespace and stem once at the end
Corpus_Data <- tm_map(Corpus_Data, stripWhitespace)
Corpus_Data <- tm_map(Corpus_Data, stemDocument)
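Before tokenizing, it is worth a quick sanity check that the cleaning steps behaved as expected (a sketch; it just prints a few cleaned lines from the first document, the blogs sample):
## Peek at a few cleaned lines from the first document (the blogs sample)
head(as.character(Corpus_Data[[1]]), 3)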
## Showing the frequencies of 1-grams, 2-grams and 3-grams in the dataset
## NGramTokenizer expects a character vector, so flatten the corpus first
Corpus_Text <- unlist(lapply(Corpus_Data, as.character))
I_gram <- NGramTokenizer(Corpus_Text, Weka_control(min=1, max=1))
II_gram <- NGramTokenizer(Corpus_Text, Weka_control(min=2, max=2))
III_gram <- NGramTokenizer(Corpus_Text, Weka_control(min=3, max=3))
One <- data.frame(table(I_gram))
Two <- data.frame(table(II_gram))
Three <- data.frame(table(III_gram))
colnames(One) <-c("OneGram","Freq")
colnames(Two) <-c("TwoGram","Freq")
colnames(Three) <-c("ThreeGram","Freq")
One <- One[order(One$Freq, decreasing = TRUE),]
Two <- Two[order(Two$Freq, decreasing = TRUE),]
Three <- Three[order(Three$Freq, decreasing = TRUE),]
## Get the top 30 n-grams of each order
OneT <- One[1:30,]
TwoT <- Two[1:30,]
ThreeT <- Three[1:30,]
OneT<- transform(OneT, OneGram = reorder(OneGram, order(Freq, decreasing = TRUE)))
TwoT<- transform(TwoT, TwoGram = reorder(TwoGram, order(Freq, decreasing = TRUE)))
ThreeT<-transform(ThreeT, ThreeGram = reorder(ThreeGram, order(Freq, decreasing = TRUE)))
## Top 30 most frequent 1-grams in the sampled corpus
barplot(OneT$Freq,names.arg=OneT$OneGram, col="green",main ="Top 30 1-grams",xlab="Words",ylab = "Frequency")
## Top 30 most frequent 2-grams in the sampled corpus
barplot(TwoT$Freq,names.arg=TwoT$TwoGram, col="red",main ="Top 30 2-grams",xlab="Word pairs",ylab = "Frequency")
## Top 30 most frequent 3-grams in the sampled corpus
barplot(ThreeT$Freq,names.arg=ThreeT$ThreeGram, col="black",main ="Top 30 3-grams",xlab="Word triples",ylab = "Frequency")
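The wordcloud package is loaded above but not yet used; as an additional sketch, the unigram frequencies could also be visualized as a word cloud:
## Word cloud of the most frequent 1-grams (wordcloud and RColorBrewer are loaded above)
wordcloud(words = as.character(One$OneGram), freq = One$Freq,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))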
Next, I will build a prediction model based on the n-gram frequencies computed above. My proposal is to use the information obtained so far to develop a word prediction model and to deploy it as a Shiny app.
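As a first step toward that model, the sketch below shows how the One, Two, and Three frequency tables could drive a very simple backoff lookup: try the last two words against the 3-gram table, fall back to the last word against the 2-gram table, and finally to the most frequent 1-gram. The function name predict_next is a hypothetical placeholder; the final model will add smoothing and handle unseen words more carefully.
## Minimal backoff lookup over the n-gram frequency tables (a sketch, not the final model)
predict_next <- function(phrase) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n >= 2) {
    ## Try the trigram table: match "w1 w2 " at the start of each 3-gram.
    ## Three is already sorted by decreasing frequency, so the first hit is the best one.
    key <- paste(words[n - 1], words[n])
    hits <- Three[startsWith(as.character(Three$ThreeGram), paste0(key, " ")), ]
    if (nrow(hits) > 0) return(substring(as.character(hits$ThreeGram[1]), nchar(key) + 2))
  }
  if (n >= 1) {
    ## Back off to the bigram table
    key <- words[n]
    hits <- Two[startsWith(as.character(Two$TwoGram), paste0(key, " ")), ]
    if (nrow(hits) > 0) return(substring(as.character(hits$TwoGram[1]), nchar(key) + 2))
  }
  ## Last resort: the single most frequent word
  as.character(One$OneGram[1])
}
## Example call
predict_next("at the end of")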