Loading the necessary libraries:
library(plyr)
library(dplyr)
library(tm)
library(RWeka)
library(ggplot2)
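If any of these packages are missing, they can be installed once beforehand (RWeka additionally needs a working Java installation):
install.packages(c("plyr", "dplyr", "tm", "RWeka", "ggplot2"))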
The dataset is already downloaded, so we can start reading the data:
path <- file.path("E:\\RStudio\\projects\\DataScienceCapstoneProject-SwiftKey\\final\\en_US")
files <- list.files(path, recursive = TRUE)
con <- file(file.path(path, "en_US.twitter.txt"))
twitterLine <- readLines(con, skipNul = TRUE)
close(con)
con <- file(file.path(path, "en_US.blogs.txt"))
blogsLine <- readLines(con, skipNul = TRUE)
close(con)
con <- file(file.path(path, "en_US.news.txt"))
newsLine <- readLines(con, skipNul = TRUE)
## Warning in readLines(con, skipNul = TRUE): incomplete final line found on
## 'E:\RStudio\projects\DataScienceCapstoneProject-SwiftKey\final\en_US\en_US.news.txt'
close(con)
The warning only means that en_US.news.txt does not end with a newline character; readLines still returns the final line intact.
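Since the three files are read with the same pattern, the reads could also be wrapped in a small helper; a minimal sketch, where readCorpusFile is just an illustrative name and path is the directory defined above:
readCorpusFile <- function(name) {
  con <- file(file.path(path, name))
  on.exit(close(con))  # make sure the connection is closed even on error
  readLines(con, skipNul = TRUE)
}
allLines <- lapply(c(twitter = "en_US.twitter.txt",
                     blogs   = "en_US.blogs.txt",
                     news    = "en_US.news.txt"), readCorpusFile)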
Summaries of the number of characters per line in each file we have read:
summary(nchar(twitterLine))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     2.0    37.0    64.0    68.8   100.0   213.0
summary(nchar(blogsLine))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     1.0    47.0   157.0   231.7   331.0 40835.0
summary(nchar(newsLine))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##       2     111     186     203     270    5760
Number of lines in each file:
length(twitterLine)
## [1] 2360148
length(blogsLine)
## [1] 899288
length(newsLine)
## [1] 77259
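For an overview, the line and character counts can be collected into one small table; a quick sketch using only what was computed above:
data.frame(file  = c("twitter", "blogs", "news"),
           lines = c(length(twitterLine), length(blogsLine), length(newsLine)),
           chars = c(sum(nchar(twitterLine)), sum(nchar(blogsLine)), sum(nchar(newsLine))))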
The first three lines of each file:
head(twitterLine,3)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
head(blogsLine,3)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan â\200œgodsâ\200\235."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
head(newsLine,3)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
To keep processing tractable, we take a 1% random sample of each file, with a fixed seed for reproducibility:
set.seed(12345)
samp <- 0.01
blogs   <- sample(seq_len(length(blogsLine)),   length(blogsLine) * samp)
news    <- sample(seq_len(length(newsLine)),    length(newsLine) * samp)
twitter <- sample(seq_len(length(twitterLine)), length(twitterLine) * samp)
subBlogs   <- blogsLine[blogs]
subNews    <- newsLine[news]
subTwitter <- twitterLine[twitter]
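A quick sanity check that each sample is about 1% of its source:
sapply(list(blogs = subBlogs, news = subNews, twitter = subTwitter), length)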
Creating the corpus and cleaning the text (removing URLs, Twitter handles, stopwords, punctuation, and numbers):
corpus <- VCorpus(VectorSource(c(subBlogs, subNews, subTwitter)))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://\\S+")  # remove URLs
corpus <- tm_map(corpus, toSpace, "@\\S+")              # remove Twitter handles
corpus <- tm_map(corpus, content_transformer(tolower))  # wrap tolower so documents stay PlainTextDocuments
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
head(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 6
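To verify the cleaning worked, we can print the content of a single document (the exact text depends on the random sample):
as.character(corpus[[1]])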
Next, we look at the 50 most frequent terms in the sampled corpus, as single words, bigrams, and trigrams.
For single words (unigrams):
f50 <- removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)
f50 <- sort(rowSums(as.matrix(f50)), decreasing = TRUE)
ff50 <- data.frame(word = names(f50), frequency = f50)
g2 <- ggplot(ff50[1:50, ], aes(reorder(word, -frequency), frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  ggtitle("50 Most Frequently Used Words") +
  xlab("Words") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5))
g2
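For reference, the top of the unigram frequency table can also be printed directly:
head(ff50, 10)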
For two-word sequences (bigrams):
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bf50 <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)), 0.9999)
bigramf50 <- sort(rowSums(as.matrix(bf50)), decreasing = TRUE)
bigramf50 <- data.frame(word = names(bigramf50), frequency = bigramf50)
g3 <- ggplot(bigramf50[1:50, ], aes(reorder(word, -frequency), frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  ggtitle("50 Most Frequently Used Bigrams") +
  xlab("Bigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5))
g3
For three-word sequences (trigrams):
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tf50 <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer)), 0.9999)
trif50 <- sort(rowSums(as.matrix(tf50)), decreasing = TRUE)
trif50 <- data.frame(word = names(trif50), frequency = trif50)
g4 <- ggplot(trif50[1:50, ], aes(reorder(word, -frequency), frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  ggtitle("50 Most Frequently Used Trigrams") +
  xlab("Trigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5))
g4
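The unigram, bigram, and trigram blocks above repeat the same steps, so they could be factored into a single helper; a minimal sketch (ngramFreq is an illustrative name; for unigrams it uses the Weka tokenizer rather than tm's default, so counts may differ slightly):
ngramFreq <- function(corpus, n, sparse = 0.9999) {
  tok <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = tok)), sparse)
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), frequency = freq)
}
# e.g. bigramf50 <- ngramFreq(corpus, 2)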
Finally, we save the n-gram frequency tables for later use in the prediction model:
saveRDS(ff50, file = "unigram.rds")
saveRDS(bigramf50, file = "bigram.rds")
saveRDS(trif50, file = "trigram.rds")
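These tables can then be reloaded in the prediction application with readRDS:
unigram <- readRDS("unigram.rds")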