Summary

This is the Milestone Report for the week 2 assignment of the Coursera Data Science Capstone project.

The objective of this report is to develop an understanding of the various statistical properties of the data set that can later be used when building the prediction model for the final data product - the Shiny application. Using exploratory data analysis, this report describes the major features of the training data and then summarizes my plans for creating the predictive model.

The motivation for this project is to demonstrate that the data has been downloaded and successfully loaded, to produce a basic report of summary statistics about the data sets, to report any interesting findings, and to gather feedback on the plans for the prediction algorithm and the Shiny app.

Environment Setup and Loading Data

The data was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and unzipped.
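For reproducibility, this step could also be scripted; a minimal sketch is shown below (the destination file name is illustrative, and the chunk is skipped when the archive is already present).

zipUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFile)) {
  download.file(zipUrl, destfile = zipFile, mode = "wb")  # download the raw zip archive
}
unzip(zipFile)  # extracts the final/ directory containing the language sub-folders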

Load the files for the English language

myPath <- "/Users/janenyandele/Desktop/Coursera 01/10. Capstone_Project/final/en_US"
blogsPath <- paste(myPath, "en_US.blogs.txt", sep="/")
twitterPath <- paste(myPath, "en_US.twitter.txt", sep="/")
newsPath <- paste(myPath, "en_US.news.txt", sep="/")

con <- file(blogsPath, open = "r")
blogsFile <- readLines(con)
close(con) 

con <- file(twitterPath, open="r")
twitterFile <- readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)

con <- file(newsPath, open="r")
newsFile <- readLines(con)
close(con)
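The embedded-nul warnings from the Twitter file are harmless for this analysis. If desired, they can be avoided by re-reading the file with skipNul = TRUE, as in the sketch below.

con <- file(twitterPath, open = "r")
twitterFile <- readLines(con, skipNul = TRUE)  # silently drop embedded nul characters
close(con)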

Basic summary of the downloaded files

File Size

blogs_size <- (file.info(blogsPath)$size)/1024^2
blogs_size
## [1] 200.4242
news_size <- (file.info(newsPath)$size)/1024^2
news_size
## [1] 196.2775
twitter_size <- (file.info(twitterPath)$size)/1024^2
twitter_size
## [1] 159.3641

Line Count

blogs_lc <- length(blogsFile)
blogs_lc
## [1] 899288
news_lc <- length(newsFile)
news_lc
## [1] 1010242
twitter_lc <- length(twitterFile)
twitter_lc
## [1] 2360148
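Character counts are not computed here; if they are also of interest, stringi::stri_stats_general() reports line and character counts for a character vector in a single call. A minimal sketch for the blogs file (output not shown):

library(stringi)
stri_stats_general(blogsFile)  # reports Lines, LinesNEmpty, Chars, CharsNWhite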

Word Count

library(stringi)
## Warning: package 'stringi' was built under R version 4.1.2
blogs_wc <- sum(stri_count_words(blogsFile))
blogs_wc
## [1] 37546250
news_wc <- sum(stri_count_words(newsFile))
news_wc
## [1] 34762395
twitter_wc <- sum(stri_count_words(twitterFile))
twitter_wc
## [1] 30093372

Table aggregating the summary statistics above

files <- c("blogsFile", "newsFile", "twitterFile")
lc <- c(blogs_lc, news_lc, twitter_lc)
wc <- c(blogs_wc, news_wc, twitter_wc)
size <- c(blogs_size, news_size, twitter_size)
summary_table <- data.frame(files, lc, wc, size)
colnames(summary_table) <- c("File Name", "Line Count", "Word Count", "File Size (MB)")
print(summary_table)
##     File Name Line Count Word Count File Size (MB)
## 1   blogsFile     899288   37546250       200.4242
## 2    newsFile    1010242   34762395       196.2775
## 3 twitterFile    2360148   30093372       159.3641
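For a cleaner presentation in the knitted report, the same data frame could be rendered with knitr::kable(); a minimal sketch, assuming the knitr package is available:

library(knitr)
kable(summary_table, digits = 1)  # render the summary as a markdown table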

Plotting Histograms of word counts

Blogs

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
words_blogs <- stri_count_words(blogsFile)
summary(words_blogs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
qplot(words_blogs, bins=500) + coord_cartesian(xlim = c(0, 500))
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
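Since qplot() is deprecated as of ggplot2 3.4.0, the same histogram can be drawn with the standard ggplot() interface; a minimal sketch for the blog word counts:

ggplot(data.frame(words = words_blogs), aes(x = words)) +
  geom_histogram(bins = 500) +          # same binning as the qplot() call above
  coord_cartesian(xlim = c(0, 500))     # focus on documents of up to 500 words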

News

words_news <- stri_count_words(newsFile)
summary(words_news)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
qplot(words_news, bins=500) + coord_cartesian(xlim = c(0, 500))

Twitter

words_twitter <- stri_count_words(twitterFile)
summary(words_twitter)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
qplot(words_twitter, bins=40) + coord_cartesian(xlim = c(0, 40))

Data Sampling and Cleaning

Sampling

Due to the large file sizes and system memory limitations, only a sample of each dataset is used to build the prediction model. The code below draws a random 1% sample from each dataset.

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
set.seed(2345)

blogs_sample <- sample(blogsFile, length(blogsFile)*0.01, replace = FALSE)
news_sample <- sample(newsFile, length(newsFile)*0.01, replace = FALSE)
twitter_sample <- sample(twitterFile, length(twitterFile)*0.01, replace = FALSE)

sample_data <- c(blogs_sample, news_sample, twitter_sample)
corpus <- VCorpus(VectorSource(sample_data)) 

Cleaning

Convert all text to lowercase, remove punctuation and numbers, strip extra whitespace, and convert the documents to plain text documents.

cleanCorpus <- tm_map(corpus, content_transformer(tolower)) # Convert all to lower case
cleanCorpus <- tm_map(cleanCorpus, removePunctuation) # Remove punctuation marks
cleanCorpus <- tm_map(cleanCorpus, removeNumbers) # Remove numbers
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace) # Remove whitespace
cleanCorpus <- tm_map(cleanCorpus, PlainTextDocument) # Convert all to plain text document
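As a quick sanity check on the cleaning steps, a few documents from the cleaned corpus can be inspected; a minimal sketch (output not shown):

inspect(cleanCorpus[1:2])  # print the content of the first two cleaned documents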