The goal here is to build a first simple model of the relationship between words, which is the first step toward a predictive text-mining application. We explore simple models here and will move on to more sophisticated modeling techniques later. The purpose of this milestone is simply to demonstrate that we are comfortable working with the data and are on track to create the prediction algorithm. The deliverable is a report published on RPubs (http://rpubs.com/) that explains the exploratory analysis and the goals for the eventual app and algorithm.
setwd("~/Downloads")
fileUrl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(fileUrl,destfile = "~/Downloads/Coursera-SwiftKey.zip",method="curl")
unzip("Coursera-SwiftKey.zip")
setwd("~/Downloads/final/en_US")
#Blog
en_blog <- file("en_US.blogs.txt", "r")
Blog <- readLines(en_blog)
close(en_blog)
#News
en_news <- file("en_US.news.txt", "r")
News <- readLines(en_news)
close(en_news)
#Twitter
en_twitter <- file("en_US.twitter.txt", "r")
Twitter <- readLines(en_twitter, skipNul = TRUE)  # skip the embedded nul characters in this file
close(en_twitter)
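One caveat worth noting (an assumption, not something observed in this run): on some platforms readLines() stops early on en_US.news.txt because of an embedded control character. If the line counts reported below look too low, re-reading the file through a binary-mode connection is a common workaround, sketched here:
# Optional: re-read the news file in binary mode if the text-mode read was truncated.
en_news_bin <- file("en_US.news.txt", open = "rb")
News <- readLines(en_news_bin, encoding = "UTF-8", skipNul = TRUE)
close(en_news_bin)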
library(stringi)
# Summarize the number of lines and characters in each file.
Summary<-data.frame(FilesName=c("Blog","News","Twitter"))
stri_stats_general(Blog)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_general(News)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
stri_stats_general(Twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
# Count the words in each file.
Blogs.words <- stri_count_words(Blog)
News.words <- stri_count_words(News)
Twitter.words <- stri_count_words(Twitter)
Summary$Lines <- c(899288, 1010242, 2360148)          # line counts from the stri_stats_general() output above
Summary$Chars <- c(206824382, 203223154, 162096241)   # character counts from the stri_stats_general() output above
Summary$Words=(c(sum(Blogs.words),sum(News.words),sum(Twitter.words)))
Summary$Words_Average=(c(mean(Blogs.words),mean(News.words),mean(Twitter.words)))
Summary
## FilesName Lines Chars Words Words_Average
## 1 Blog 899288 206824382 37546246 41.75108
## 2 News 1010242 203223154 34762395 34.40997
## 3 Twitter 2360148 162096241 30093410 12.75065
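As a side note, the hard-coded Lines and Chars values above can also be pulled straight from the stri_stats_general() results, which keeps the table in sync with the data. A small sketch (Summary2 is just an illustrative name):
# Rebuild the same summary table directly from stri_stats_general(),
# instead of typing the counts in by hand.
stats <- rbind(stri_stats_general(Blog),
               stri_stats_general(News),
               stri_stats_general(Twitter))
Summary2 <- data.frame(FilesName     = c("Blog", "News", "Twitter"),
                       Lines         = stats[, "Lines"],
                       Chars         = stats[, "Chars"],
                       Words         = c(sum(Blogs.words), sum(News.words), sum(Twitter.words)),
                       Words_Average = c(mean(Blogs.words), mean(News.words), mean(Twitter.words)))
Summary2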
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.4
# One bar per file for total lines, total words, and average words per line.
g1 <- qplot(FilesName, Lines, data = Summary, stat = "summary", fun.y = "sum",
            xlab = "Type", ylab = "", main = "Lines", geom = "bar", fill = factor(FilesName))
g1 <- g1 + theme(legend.position = "none")
g2 <- qplot(FilesName, Words, data = Summary, stat = "summary", fun.y = "sum",
            xlab = "Type", ylab = "", main = "Words", geom = "bar", fill = factor(FilesName))
g2 <- g2 + theme(legend.position = "none")
g3 <- qplot(FilesName, Words_Average, data = Summary, stat = "summary", fun.y = "sum",
            xlab = "Type", ylab = "", main = "Words_Average", geom = "bar", fill = factor(FilesName))
g3 <- g3 + theme(legend.position = "none")
grid.arrange(g1,g2,g3,ncol=3)
To build the models we do not need to load and use all of the data; a relatively small random sample of lines is usually enough to give a good approximation of the results we would get from the full corpus. Here we draw 10,000 lines at random from each file.
set.seed(1)
samplesize=10000
sample_Blog<-sample(Blog,samplesize)
sample_News<-sample(News,samplesize)
sample_Twitter<-sample(Twitter,samplesize)
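As a quick sanity check on how small these samples are relative to the full files (using the line counts already stored in Summary):
# Percentage of each file's lines captured by the 10,000-line samples:
# roughly 1% of the blog and news files and under 0.5% of the tweets.
round(samplesize / Summary$Lines * 100, 2)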
library(tm)
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.2.4
# Combine the three samples into one character vector and build a corpus from it.
datasets <- c(sample_Blog, sample_News, sample_Twitter)
datasets2 <- VectorSource(datasets)
Corpus <- Corpus(datasets2)
Corpus <- tm_map(Corpus, content_transformer(tolower))       # lower-case all text
Corpus <- tm_map(Corpus, PlainTextDocument)
Corpus <- tm_map(Corpus, removePunctuation)                  # strip punctuation
Corpus <- tm_map(Corpus, removeWords, stopwords("english"))  # drop common English stop words
Corpus <- tm_map(Corpus, removeNumbers)                      # drop digits
Corpus <- tm_map(Corpus, stripWhitespace)                    # collapse repeated whitespace
Corpus <- tm_map(Corpus, PlainTextDocument)
Corpus <- tm_map(Corpus, stemDocument)                       # reduce words to their stems
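A quick spot check that the cleaning steps behaved as expected (the exact text depends on the random sample, so this is only illustrative):
# Look at the start of the first cleaned document: it should be lower-case,
# stemmed, and free of punctuation, numbers, and stop words.
substr(as.character(Corpus[[1]]), 1, 200)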
# N-gram tokenizers: unigrams through 4-grams for the term-document matrices.
gram1Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 1))}
gram2Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))}
gram3Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))}
gram4Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 4, max = 4))}
dtm1<- TermDocumentMatrix(Corpus, control = list(tokenize = gram1Tokenizer))
dtm2 <- TermDocumentMatrix(Corpus, control = list(tokenize = gram2Tokenizer))
dtm3<- TermDocumentMatrix(Corpus, control = list(tokenize = gram3Tokenizer))
dtm4<- TermDocumentMatrix(Corpus, control = list(tokenize = gram4Tokenizer))
# Drop terms that appear in fewer than 1% of the documents to keep the matrices manageable.
dtm1_1 <- removeSparseTerms(dtm1, sparse = 0.99)
dtm2_2 <- removeSparseTerms(dtm2, sparse = 0.99)
dtm3_3 <- removeSparseTerms(dtm3, sparse = 0.99)
dtm4_4 <- removeSparseTerms(dtm4, sparse = 0.99)
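With the term-document matrices built, the most frequent n-grams give a first look at the word relationships the prediction model will rely on. A minimal sketch (freq1 and freq2 are illustrative names):
# Total frequency of each unigram and bigram across the sampled corpus.
freq1 <- sort(rowSums(as.matrix(dtm1_1)), decreasing = TRUE)
freq2 <- sort(rowSums(as.matrix(dtm2_2)), decreasing = TRUE)
head(freq1, 10)  # ten most common single words
head(freq2, 10)  # ten most common word pairs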
The next step is to create a product that highlights the prediction algorithm and provides an interface others can use. Two items will be submitted:
1. A Shiny app that takes a phrase (multiple words) typed into a text box as input and outputs a prediction of the next word.
2. A slide deck of no more than 5 slides, created with RStudio Presenter (https://support.rstudio.com/hc/en-us/articles/200486468-Authoring-R-Presentations), pitching the algorithm and app as if presenting to a boss or an investor.
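As a preview of how the app's prediction step might work, here is a minimal sketch of a next-word lookup built on the bigram counts from the sample (freq2 and predict_next are illustrative names only; the eventual algorithm will likely use higher-order n-grams with backoff and smoothing):
# Given the last word typed (lower-cased and stemmed to match the cleaned
# corpus), find the bigrams that start with it and return the most frequent
# continuations. This is a toy lookup, not the final model.
predict_next <- function(word, bigram_freq = freq2, n = 3) {
  candidates <- bigram_freq[grepl(paste0("^", word, " "), names(bigram_freq))]
  if (length(candidates) == 0) return(character(0))
  top <- head(sort(candidates, decreasing = TRUE), n)
  sapply(strsplit(names(top), " "), `[`, 2)  # keep the second word of each top bigram
}
predict_next("love")  # e.g. the three words that most often follow "love" in the sample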