The goal of this project is to show that we have become familiar with the data and are on track to create the prediction algorithm. The motivation for this project is to:
1. Demonstrate that the data have been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Outline the plan for creating a prediction algorithm and Shiny app.
## Datafile
DATAFILE <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## Download the file for the first time
if(!file.exists("./Coursera-SwiftKey.zip")) {
download.file(DATAFILE, destfile="./Coursera-SwiftKey.zip", method="curl")
## Unzip file Coursera-SwiftKey.zip
unzip("./Coursera-SwiftKey.zip")
}
## setting the working directory
setwd("./final")
## Listing all the folders
list.dirs('.', recursive=FALSE)
## [1] "./de_DE" "./en_US" "./fi_FI" "./ru_RU"
So we have four directories with data files for German (de_DE), English (en_US), Finnish (fi_FI), and Russian (ru_RU).
As English is my second language, I will focus on the content of the en_US folder to build the prediction model.
## Reading English files in the folder en_US
setwd("./final/en_US")
list.files('.', recursive=FALSE)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
blogs <- readLines("./en_US.blogs.txt", encoding = 'UTF-8', skipNul=TRUE, warn=FALSE)
news <- readLines("./en_US.news.txt", encoding = 'UTF-8', skipNul=TRUE, warn=FALSE)
twitter <- readLines("./en_US.twitter.txt", encoding = 'UTF-8', skipNul=TRUE, warn=FALSE)
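Note: on some platforms readLines() stops early in en_US.news.txt because of an embedded control character, which would explain a comparatively low news line count in the summary below. A possible workaround (a sketch, only needed if the count looks too low) is to read the file through a binary connection:
## Re-read en_US.news.txt through a binary connection so readLines() does not
## stop at an embedded control character (assumption: this is the cause of a low line count)
con <- file("./en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(con)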
## Gathering basic information about the files in the folder en_US
## File sizes in megabytes
A1=file.info("./en_US.blogs.txt")$size/(1024^2)
A2=file.info("./en_US.news.txt")$size/(1024^2)
A3=file.info("./en_US.twitter.txt")$size/(1024^2)
## Line counts
B1=length(blogs)
B2=length(news)
B3=length(twitter)
## Approximate word counts (one more than the number of non-word gaps per line)
C1=sum(sapply(gregexpr("\\W+", blogs), length) + 1)
C2=sum(sapply(gregexpr("\\W+", news), length) + 1)
C3=sum(sapply(gregexpr("\\W+", twitter), length) + 1)
## Summary Statistics
FilesInfo=matrix(c(A1,A2,A3,B1,B2,B3,C1,C2,C3),nrow=3,ncol=3)
colnames(FilesInfo) = c('File Size (MB)','Line Count','Word Count')
rownames(FilesInfo) = c('Blogs','News','Twitter')
FilesInfo
##         File Size (MB) Line Count Word Count
## Blogs         200.4242     899288   39121566
## News          196.2775      77259    2825330
## Twitter       159.3641    2360148   32793432
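The word counts above come from a simple regular-expression split, so they are only rough approximations. As a sketch (assuming the stringi package is installed), they could be cross-checked with a dedicated word counter:
## Optional cross-check of the approximate word counts with stringi
library(stringi)
c(Blogs = sum(stri_count_words(blogs)),
  News = sum(stri_count_words(news)),
  Twitter = sum(stri_count_words(twitter)))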
library(NLP)
library(tm)
## Warning: package 'tm' was built under R version 3.3.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.1
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.1
library(SnowballC)
library(openNLP)
## Taking samples from the files as a training set
set.seed(10)
Blogs_S <- blogs[sample(1:length(blogs),15000)]
New_S <- news[sample(1:length(news),15000)]
Twitter_S <- twitter[sample(1:length(twitter),15000)]
TrainingSet <- list(Blogs_S,New_S,Twitter_S)
rm(Blogs_S,Twitter_S,New_S)
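For context, 15,000 lines is only a small fraction of each file. A quick sketch of the sampling rate, using the full objects that are still in memory:
## Percentage of each file covered by the 15,000-line samples
round(15000 / c(Blogs = length(blogs), News = length(news), Twitter = length(twitter)) * 100, 2)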
## Preparing Corpus file
Corpus_Data<- Corpus(VectorSource(TrainingSet))
rm(TrainingSet)
## Cleaning the Corpus file
## Lower-case first so that later steps see consistent text
Corpus_Data <- tm_map(Corpus_Data, content_transformer(tolower))
## Drop non-ASCII characters
Corpus_Data <- tm_map(Corpus_Data, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub=" ")))
## Replace slashes, @ and | with spaces before stripping punctuation,
## so that joined tokens like "and/or" are split rather than merged
SpaceRm <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus_Data <- tm_map(Corpus_Data, SpaceRm, "/|@|\\|")
Corpus_Data <- tm_map(Corpus_Data, removeNumbers)
Corpus_Data <- tm_map(Corpus_Data, removePunctuation)
## Remove English stop words (the standard and SMART lists)
Corpus_Data <- tm_map(Corpus_Data, removeWords, stopwords("english"))
Corpus_Data <- tm_map(Corpus_Data, removeWords, stopwords("SMART"))
## Collapse extra whitespace and stem once at the end
Corpus_Data <- tm_map(Corpus_Data, stripWhitespace)
Corpus_Data <- tm_map(Corpus_Data, stemDocument)
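Before tokenizing, it is worth a quick sanity check that the cleaning steps behaved as expected (a sketch; it just prints a few cleaned lines from the first document, the blogs sample):
## Peek at a few cleaned lines from the first document (the blogs sample)
head(as.character(Corpus_Data[[1]]), 3)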
## Showing the frequencies of 1-grams, 2-grams and 3-grams in the dataset
## NGramTokenizer expects a character vector, so flatten the corpus first
Corpus_Text <- unlist(lapply(Corpus_Data, as.character))
I_gram <- NGramTokenizer(Corpus_Text, Weka_control(min=1, max=1))
II_gram <- NGramTokenizer(Corpus_Text, Weka_control(min=2, max=2))
III_gram <- NGramTokenizer(Corpus_Text, Weka_control(min=3, max=3))
One <- data.frame(table(I_gram))
Two <- data.frame(table(II_gram))
Three <- data.frame(table(III_gram))
colnames(One) <-c("OneGram","Freq")
colnames(Two) <-c("TwoGram","Freq")
colnames(Three) <-c("ThreeGram","Freq")
One <- One[order(One$Freq, decreasing = TRUE),]
Two <- Two[order(Two$Freq, decreasing = TRUE),]
Three <- Three[order(Three$Freq, decreasing = TRUE),]
## Get the top 30 n-grams of each order
OneT <- One[1:30,]
TwoT <- Two[1:30,]
ThreeT <- Three[1:30,]
OneT<- transform(OneT, OneGram = reorder(OneGram, order(Freq, decreasing = TRUE)))
TwoT<- transform(TwoT, TwoGram = reorder(TwoGram, order(Freq, decreasing = TRUE)))
ThreeT<-transform(ThreeT, ThreeGram = reorder(ThreeGram, order(Freq, decreasing = TRUE)))
## Top 30 most frequent 1-grams in the sampled corpus
barplot(OneT$Freq,names.arg=OneT$OneGram, col="green",main ="Top 30 1-grams",xlab="Words",ylab = "Frequency")
## Top 30 most frequent 2-grams in the sampled corpus
barplot(TwoT$Freq,names.arg=TwoT$TwoGram, col="red",main ="Top 30 2-grams",xlab="Word pairs",ylab = "Frequency")
## Top 30 most frequent 3-grams in the sampled corpus
barplot(ThreeT$Freq,names.arg=ThreeT$ThreeGram, col="black",main ="Top 30 3-grams",xlab="Word triples",ylab = "Frequency")
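The wordcloud package is loaded above but not yet used; as an additional sketch, the unigram frequencies could also be visualized as a word cloud:
## Word cloud of the most frequent 1-grams (wordcloud and RColorBrewer are loaded above)
wordcloud(words = as.character(One$OneGram), freq = One$Freq,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))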
Next, I will build a prediction model based on the n-gram frequencies computed above. My proposal is to use the information obtained so far to develop a word prediction model and to deploy it as a Shiny app.
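As a first step toward that model, the sketch below shows how the One, Two, and Three frequency tables could drive a very simple backoff lookup: try the last two words against the 3-gram table, fall back to the last word against the 2-gram table, and finally to the most frequent 1-gram. The function name predict_next is a hypothetical placeholder; the final model will add smoothing and handle unseen words more carefully.
## Minimal backoff lookup over the n-gram frequency tables (a sketch, not the final model)
predict_next <- function(phrase) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n >= 2) {
    ## Try the trigram table: match "w1 w2 " at the start of each 3-gram.
    ## Three is already sorted by decreasing frequency, so the first hit is the best one.
    key <- paste(words[n - 1], words[n])
    hits <- Three[startsWith(as.character(Three$ThreeGram), paste0(key, " ")), ]
    if (nrow(hits) > 0) return(substring(as.character(hits$ThreeGram[1]), nchar(key) + 2))
  }
  if (n >= 1) {
    ## Back off to the bigram table
    key <- words[n]
    hits <- Two[startsWith(as.character(Two$TwoGram), paste0(key, " ")), ]
    if (nrow(hits) > 0) return(substring(as.character(hits$TwoGram[1]), nchar(key) + 2))
  }
  ## Last resort: the single most frequent word
  as.character(One$OneGram[1])
}
## Example call
predict_next("at the end of")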