The goal of this project is to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to:
1. Demonstrate that you've downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you have amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Review criteria
Does the link lead to an HTML page describing the exploratory analysis of the training data set?
Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?
Has the data scientist made basic plots, such as histograms to illustrate features of the data?
Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Download and unzip the data set only if it is not already present
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip", exdir = ".")
}
## Loading the necessary libraries.
library(knitr)      # provides kable()
library(stringi)
library(tm)
library(rJava)
library(RWeka)
library(RWekajars)
library(NLP)
library(openNLP)
library(RColorBrewer)
library(ggplot2)
library(SnowballC)
library(qdap)
library(kableExtra)
library(simEd)
library(ngram)
library(slam)
library(xtable)
library(wordcloud)
blogs_ <- file("en_US.blogs.txt", open = "rb")  # open for reading in binary mode
blogs <- readLines(blogs_, encoding = "UTF-8", skipNul = TRUE)
close(blogs_)
news_ <- file("en_US.news.txt", open = "rb")  # open for reading in binary mode
news <- readLines(news_, encoding = "UTF-8", skipNul = TRUE)
close(news_)
twitter_ <- file("en_US.twitter.txt", open = "rb")  # open for reading in binary mode
twitter <- readLines(twitter_, encoding = "UTF-8", skipNul = TRUE)
close(twitter_)
file.info("en_US.blogs.txt")$size / 1024^2
## [1] 200.4242
file.info("en_US.news.txt")$size / 1024^2
## [1] 196.2775
file.info("en_US.twitter.txt")$size / 1024^2
## [1] 159.3641
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148
sum(stri_count_words(blogs))
## [1] 37546239
sum(stri_count_words(news))
## [1] 34762395
sum(stri_count_words(twitter))
## [1] 30093413
max(nchar(blogs))
## [1] 40833
max(nchar(news))
## [1] 11384
max(nchar(twitter))
## [1] 140
summary_table <- data.frame(c("Blog", "News", "Twitter"),
                            c(size_blog, size_news, size_twitter),
                            c(words_blog, words_news, words_twitter),
                            c(lines_blogs, lines_news, lines_twitter),
                            c(max_blogs, max_news, max_twitter))
kable(summary_table, col.names = c('File', 'Size (MB)', 'Words', 'Lines', 'Length of longest line')) %>%
  kable_styling(full_width = F) %>%
  column_spec(1:5, width = "10em")
| File | Size (MB) | Words | Lines | Length of longest line |
|---|---|---|---|---|
| Blog | 200.4242 | 37546239 | 899288 | 40833 |
| News | 196.2775 | 34762395 | 1010242 | 11384 |
| Twitter | 159.3641 | 30093413 | 2360148 | 140 |
set.seed(1357)
# Convert to ASCII, dropping characters that cannot be translated
blogs1 <- iconv(blogs, "latin1", "ASCII", sub = "")
news1 <- iconv(news, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitter, "latin1", "ASCII", sub = "")
rm(blogs, news, twitter)
# Sample only 1% of each file to keep the corpus manageable
sample_data <- c(sample(blogs1, length(blogs1) * 0.01),
                 sample(news1, length(news1) * 0.01),
                 sample(twitter1, length(twitter1) * 0.01))
rm(blogs1, news1, twitter1)
corpus <- VCorpus(VectorSource(sample_data))
# Clean the corpus: remove punctuation, numbers, extra whitespace and English stop words,
# then convert to lower case. Note that stop words are removed before lower-casing, so
# capitalised stop words such as "The" survive and show up in the unigram counts below.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, PlainTextDocument)
An n-gram is a contiguous sequence of n items from a given sample of text or speech. In this case, the items are words.
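As a quick, hypothetical illustration (the sentence below is made up, not drawn from the data), RWeka's NGramTokenizer can be applied to a short character string to show what the bigram tokenizer defined below produces:

# Toy example of bigram tokenization on a hypothetical sentence
NGramTokenizer("I love data science", Weka_control(min = 2, max = 2))
# This should return the three bigrams "I love", "love data" and "data science"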
# Tokenizers for unigrams, bigrams and trigrams
one_g <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
two_g <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
three_g <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Term-document matrices for each n-gram size
one_table <- TermDocumentMatrix(corpus, control = list(tokenize = one_g))
two_table <- TermDocumentMatrix(corpus, control = list(tokenize = two_g))
three_table <- TermDocumentMatrix(corpus, control = list(tokenize = three_g))
# Keep only the n-grams that appear at least a minimum number of times
one_corpus <- findFreqTerms(one_table, lowfreq = 1000)
two_corpus <- findFreqTerms(two_table, lowfreq = 80)
three_corpus <- findFreqTerms(three_table, lowfreq = 10)
# Total counts for the frequent unigrams, sorted in decreasing order of frequency
one_corpus_num <- rowSums(as.matrix(one_table[one_corpus, ]))
one_corpus_table <- data.frame(Word = names(one_corpus_num), frequency = one_corpus_num)
one_corpus_sort <- one_corpus_table[order(-one_corpus_table$frequency), ]
head(one_corpus_sort)
## Word frequency
## the the 5423
## will will 3217
## said said 3108
## just just 3061
## one one 2890
## like like 2627
# Total counts for the frequent bigrams, sorted in decreasing order of frequency
two_corpus_num <- rowSums(as.matrix(two_table[two_corpus, ]))
two_corpus_table <- data.frame(Word = names(two_corpus_num), frequency = two_corpus_num)
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]
head(two_corpus_sort)
## Word frequency
## i dont i dont 575
## i think i think 564
## i love i love 507
## i know i know 400
## i just i just 395
## i can i can 346
# Total counts for the frequent trigrams, sorted in decreasing order of frequency
three_corpus_num <- rowSums(as.matrix(three_table[three_corpus, ]))
three_corpus_table <- data.frame(Word = names(three_corpus_num), frequency = three_corpus_num)
three_corpus_sort <- three_corpus_table[order(-three_corpus_table$frequency), ]
head(three_corpus_sort)
## Word frequency
## i dont know i dont know 134
## i dont think i dont think 83
## i think i i think i 72
## i know i i know i 65
## i feel like i feel like 53
## i wish i i wish i 52
4. Some n-gram plots.
4a. Barplot of the most common words.
one_plot <- ggplot(one_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Unigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip()
one_plot
4b. Barplot of the most common pairs of words.
two_plot <- ggplot(two_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Bigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip()
two_plot
4c. Barplot of the most common trios of words.
three_plot <- ggplot(three_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Trigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip()
three_plot