Introduction

The purpose of this work is to:

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that you amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Review criteria

Does the link lead to an HTML page describing the exploratory analysis of the training data set? Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables? Has the data scientist made basic plots, such as histograms to illustrate features of the data? Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

Part 1

Loading the required libraries

library(RWeka)
library(SnowballC)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
## Registered S3 methods overwritten by 'qdap':
##   method               from
##   t.DocumentTermMatrix tm  
##   t.TermDocumentMatrix tm
## 
## Attaching package: 'qdap'
## The following object is masked from 'package:base':
## 
##     Filter
library(stringi)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
library(tokenizers)
library(downloader)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:qdapTools':
## 
##     id
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:qdap':
## 
##     %>%
## The following object is masked from 'package:qdapTools':
## 
##     id
## The following object is masked from 'package:qdapRegex':
## 
##     explain
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## The following object is masked from 'package:qdap':
## 
##     ngrams
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
library(wordcloud)

Part 2

Getting and processing the data

A - Download and unzip the dataset

if(!file.exists("./projectData")){
  dir.create("./projectData")
}
UrlAdress <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if(!file.exists("./projectData/Coursera-SwiftKey.zip")){
  download.file(UrlAdress,destfile="./projectData/Coursera-SwiftKey.zip",mode = "wb")
}

if(!file.exists("./projectData/final")){
  unzip(zipfile="./projectData/Coursera-SwiftKey.zip",exdir="./projectData")
}

pathFile <- file.path("./projectData/final" , "en_US")
files<-list.files(pathFile, recursive=TRUE)

# Read each file line by line and close the connections when done
connN <- file("./projectData/final/en_US/en_US.twitter.txt", "r")
lineTwitter <- readLines(connN)
## Warning in readLines(connN): line 167155 appears to contain an embedded nul
## Warning in readLines(connN): line 268547 appears to contain an embedded nul
## Warning in readLines(connN): line 1274086 appears to contain an embedded nul
## Warning in readLines(connN): line 1759032 appears to contain an embedded nul
close(connN)
connN2 <- file("./projectData/final/en_US/en_US.blogs.txt", "r")
lineBlogs <- readLines(connN2)
close(connN2)
connN3 <- file("./projectData/final/en_US/en_US.news.txt", "r")
lineNews <- readLines(connN3)
## Warning in readLines(connN3): incomplete final line found on './projectData/
## final/en_US/en_US.news.txt'
close(connN3)
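
The review criteria ask for basic line and word counts of the three files. The table below is a minimal sketch of such a summary, assuming the three vectors read above are still in memory (fileSummary is an illustrative name):

# Basic summary of the three English files: number of lines, total word
# counts (via stringi) and size on disk in megabytes
fileSummary <- data.frame(
  file  = c("blogs", "news", "twitter"),
  lines = c(length(lineBlogs), length(lineNews), length(lineTwitter)),
  words = c(sum(stri_count_words(lineBlogs)),
            sum(stri_count_words(lineNews)),
            sum(stri_count_words(lineTwitter))),
  sizeMB = round(file.size(file.path(pathFile,
             c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))) / 1024^2, 1)
)
kable(fileSummary)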

Sample the data to reduce size

# Take a reproducible 5,000-line sample from each source
set.seed(123)
blogSample <- sample(lineBlogs, 5000)
newsSample <- sample(lineNews, 5000)
sampleTwitter <- sample(lineTwitter, 5000)

# Combine the samples and split them into sentences
sample <- c(blogSample, newsSample, sampleTwitter)
txt <- sent_detect(sample)

# Remove numbers, punctuation and extra whitespace, then convert to lower case
txt <- removeNumbers(txt)
txt <- removePunctuation(txt)
txt <- stripWhitespace(txt)
txt <- tolower(txt)

head(txt)
## [1] "the bruschetta however missed the mark"                                                                                                                                                                                               
## [2] "instead of manageable twobite crostini these were huge slices of grilled bread and heaped with toppings of tomato cannellini beans and roasted peppers with goat cheese"                                                              
## [3] "walden pond mt"                                                                                                                                                                                                                       
## [4] "rainier big sur everglades and so forth despite laws banning cell phones while driving and increased awareness of the dangers of doing so itâ\200\231s a common fact that cell phone use while driving is still a widespread occurrence"    
## [5] "perhaps most discouraging to the issue is that much of this distracted driving occurs amongst young drivers which is not only a safety concern but also might indicate that the problem could be deeply rooted for future generations"
## [6] "ghosts and goblins now i can write in specific post information for each day of the week and preplan things out a bit"
# Remove any remaining non-alphabetic characters (e.g. mis-encoded symbols)
txtNumb <- gsub("[^a-zA-Z ]", "", txt)

head(txtNumb)
## [1] "the bruschetta however missed the mark"                                                                                                                                                                                               
## [2] "instead of manageable twobite crostini these were huge slices of grilled bread and heaped with toppings of tomato cannellini beans and roasted peppers with goat cheese"                                                              
## [3] "walden pond mt"                                                                                                                                                                                                                       
## [4] "rainier big sur everglades and so forth despite laws banning cell phones while driving and increased awareness of the dangers of doing so its a common fact that cell phone use while driving is still a widespread occurrence"       
## [5] "perhaps most discouraging to the issue is that much of this distracted driving occurs amongst young drivers which is not only a safety concern but also might indicate that the problem could be deeply rooted for future generations"
## [6] "ghosts and goblins now i can write in specific post information for each day of the week and preplan things out a bit"

More details of my exploratory analysis

What is the frequency distribution of words? Some words are much more frequent than others.

# Convert the cleaned text to a data frame and extract the individual words
txtNumb <- data.frame(txtNumb, stringsAsFactors = FALSE)
words <- data.frame(word = unlist(stri_extract_all_words(txtNumb)))
## Warning in stri_extract_all_boundaries(str, simplify, omit_no_match,
## opts_brkiter = stri_opts_brkiter(type = "word", : argument is not an atomic
## vector; coercing

# Word cloud of the 20 most frequent words in the sample
wordcloud(words$word, scale = c(5, 0.5), max.words = 20,
          colors = brewer.pal(8, "Dark2"), random.order = FALSE,
          rot.per = 0.35, use.r.layout = FALSE)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
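
To make the frequency distribution explicit alongside the word cloud, the same word vector can be tabulated directly. A short sketch (wordFreq is an illustrative name):

# Count occurrences of each unique word and show the ten most frequent
wordFreq <- sort(table(words[[1]]), decreasing = TRUE)
head(wordFreq, 10)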

2-grams and 3-grams: what are the most frequent word pairs and triples in the dataset?

gram<-NGramTokenizer(txtNumb)

wordGram01<-NGramTokenizer(gram, Weka_control(min=1, max=1))
wordGram01<-data.frame(table(wordGram01))
wordGram01<-wordGram01[order(wordGram01$Freq, decreasing = TRUE),]

wordGram02<-NGramTokenizer(gram, Weka_control(min=2, max=2))
wordGram02<-data.frame(table(wordGram02))
wordGram02<-wordGram02[order(wordGram02$Freq, decreasing = TRUE),]

wordGram03<-NGramTokenizer(gram, Weka_control(min=3,max=3))
wordGram03<-data.frame(table(wordGram03))
wordGram03<-wordGram03[order(wordGram03$Freq, decreasing = TRUE),]

p<-ggplot(data=wordGram01[1:20,], aes(x=reorder(wordGram01,Freq), y=Freq,
                    fill=factor(reorder(wordGram01,-Freq))))+ geom_bar(stat="identity") 
p + xlab("Word") +labs(title = "Most popular words") +theme(legend.title=element_blank()) + coord_flip()

p<-ggplot(data=wordGram02[1:20,], aes(x=reorder(wordGram02,Freq), y=Freq,
                    fill=factor(reorder(wordGram02,-Freq))))+ geom_bar(stat="identity") 
p + xlab("Word") +labs(title = "Most popular two-word phrases") +theme(legend.title=element_blank()) + coord_flip()

p<-ggplot(data=wordGram03[1:20,], aes(x=reorder(wordGram03,Freq), y=Freq,
                    fill=factor(reorder(wordGram03,-Freq))))+ geom_bar(stat="identity") 
p + xlab("Word") +labs(title = "Most popular three-word phrases") +theme(legend.title=element_blank()) + coord_flip()

Curious details

How many unique words, taken in order of decreasing frequency, are needed to cover 40% of all word occurrences in the sample? How many for 80%?

sumMaster <- 0
for(i in 1:length(wordGram01$Freq)) {
  sumMaster <- sumMaster + wordGram01$Freq[i]
  if(sumMaster >= 0.4*sum(wordGram01$Freq)){break}
}
print(i)
## [1] 56
sumMaster <- 0
for(i in 1:length(wordGram01$Freq)) {
  sumMaster <- sumMaster + wordGram01$Freq[i]
  if(sumMaster >= 0.8*sum(wordGram01$Freq)){break}
}
print(i)
## [1] 2432
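
The same coverage question can also be answered without a loop, using a cumulative sum over the sorted unigram frequencies; a brief sketch:

# Cumulative share of all word occurrences covered by the k most frequent words
coverage <- cumsum(wordGram01$Freq) / sum(wordGram01$Freq)
c(words_40pct = which(coverage >= 0.4)[1],   # first k reaching 40% coverage
  words_80pct = which(coverage >= 0.8)[1])   # first k reaching 80% coverage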

Conclusion

This result is encouraging: most word occurrences are repeats of a relatively small set of frequent words (roughly 2,400 words cover 80% of occurrences in this sample), so a compact dictionary should be sufficient for the planned prediction model.
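
As a sketch of how the n-gram tables above could feed the planned prediction algorithm (purpose 4), the function below looks up the last one or two typed words in the trigram and bigram tables and backs off to the most frequent word. It is a hypothetical illustration, not the final model; predictNext and its logic are illustrative only:

# Hypothetical next-word predictor: try trigrams, back off to bigrams,
# then to the single most frequent word. Uses the tables built above.
predictNext <- function(phrase) {
  toks    <- unlist(strsplit(tolower(phrase), "\\s+"))
  lastTwo <- paste(tail(toks, 2), collapse = " ")
  lastOne <- tail(toks, 1)

  # trigrams whose first two words match the last two typed words
  hit3 <- grep(paste0("^", lastTwo, " "), as.character(wordGram03$wordGram03), value = TRUE)
  if (length(hit3) > 0) return(tail(strsplit(hit3[1], " ")[[1]], 1))

  # back off to bigrams whose first word matches the last typed word
  hit2 <- grep(paste0("^", lastOne, " "), as.character(wordGram02$wordGram02), value = TRUE)
  if (length(hit2) > 0) return(tail(strsplit(hit2[1], " ")[[1]], 1))

  as.character(wordGram01$wordGram01[1])   # most frequent word overall
}

predictNext("thanks for the")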