Introduction

The goal of this project is to demonstrate the knowledge I have gained from working with the data and to show how I will create my prediction algorithm.

The motivation for this project is to:

1. Demonstrate that I have downloaded the data and successfully loaded it in.

2. Create a basic report of summary statistics about the data sets.

3. Report any interesting findings gathered so far.

4. Get feedback on my plans for creating a prediction algorithm and Shiny app.

Loading Data

setwd("~/R/Capstone/final/en_US")
con <- file("en_US.twitter.txt", "r")
twitter<-readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)

con <- file("en_US.blogs.txt","r")
blogs<-readLines(con)
close(con)

con <- file("en_US.news.txt","r")
news<-readLines(con)
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
close(con)
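
The warnings above indicate embedded nul characters in the Twitter file and an incomplete final line in the news file; the news file in particular appears to be cut short when read this way (its line count in the summary table below is far lower than the other two files). A possible refinement, not used for the results in this report, is to open the files in binary mode and pass skipNul = TRUE to readLines. The helper read_corpus_file below is purely illustrative.

# Sketch: read a file in binary mode, skipping embedded nuls
read_corpus_file <- function(path) {
    con <- file(path, "rb")
    lines <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
    close(con)
    lines
}
# e.g. news <- read_corpus_file("en_US.news.txt")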

Exploratory Analyses

Count the size of the data

library(stringi)
## Warning: package 'stringi' was built under R version 3.4.1
twitter_words<-stri_count_words(twitter)
blogs_words<-stri_count_words(blogs)
news_words<-stri_count_words(news)

#line counts for each file
nline_twitter<-length(twitter_words)
nline_blogs<-length(blogs_words)
nline_news<-length(news_words)
line<-c(nline_twitter,nline_blogs,nline_news)

#Word counts for each file
twitter_words<-sum(twitter_words)
blogs_words<-sum(blogs_words)
news_words<-sum(news_words)
words<-c(twitter_words,blogs_words,news_words)

#Average word counts per line for each file
ave_twitter_words<-twitter_words/nline_twitter
ave_blogs_words<-blogs_words/nline_blogs
ave_news_words<-news_words/nline_news
ave_words<-c(ave_twitter_words,ave_blogs_words,ave_news_words)

#Summary Table
table<-cbind(words,line,ave_words)
rownames(table) <- c("twitter", "blogs", "news")
table
##            words    line ave_words
## twitter 30279349 2360148  12.82943
## blogs   38872243  899288  43.22558
## news     2710587   77259  35.08442
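
The physical size of each file can also be reported. A minimal sketch, assuming the three files sit in the working directory set above:

# Sketch: file sizes on disk in megabytes
files <- c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")
size_MB <- round(file.size(files) / 1024^2, 1)
names(size_MB) <- c("twitter", "blogs", "news")
size_MB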

Subset and Clean the data

library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
#We only use a 1% random sample of each file to keep processing manageable
subTwitter <- sample(twitter, length(twitter)*0.01)
subBlogs <- sample(blogs, length(blogs)*0.01)
subNews <- sample(news, length(news)*0.01)

#The subsetted data are combined to build the corpus
combined_data <- c(subBlogs, subNews, subTwitter)
combined_data <- iconv(combined_data, "UTF-8", "ASCII", sub = "")
corpus <- VCorpus(VectorSource(combined_data))

toSpace <- content_transformer(function(x, pattern) 
                                {
                                    return (gsub(pattern, " ", x))
                                })


corpus <- tm_map(corpus, toSpace, "-")
corpus <- tm_map(corpus, toSpace, ":")
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, toSpace, "'")
corpus <- tm_map(corpus, toSpace, " -")
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
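
Additional cleaning could further improve the n-grams, for example lowercasing the text and filtering profanity. The sketch below assumes a plain-text word list, profanity.txt (one word per line), which is not part of the course data set:

# Sketch: lowercase the corpus and remove words from a (hypothetical) profanity list
corpus <- tm_map(corpus, content_transformer(tolower))
profanity <- readLines("profanity.txt", warn = FALSE)   # user-supplied list, one word per line
corpus <- tm_map(corpus, removeWords, profanity)
corpus <- tm_map(corpus, stripWhitespace)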

N-gram Tokenization and Frequency Analyses

library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.1
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
#Extract the cleaned text from the corpus before tokenizing with RWeka
corpus_text <- sapply(corpus, as.character)
unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))
bigram <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))
trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))

#Unigram frequency distribution
unigram.df <- data.frame(table(unigram))
unigram.df <- unigram.df[order(unigram.df$Freq, decreasing = TRUE),]

plot1<-ggplot(unigram.df[1:15,], aes(x=reorder(unigram, -Freq), y=Freq)) +
    geom_bar(stat = "identity")+ 
    xlab("Unigram") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90, hjust=1))+
    labs(title = "Most common words")
plot1

bigram.df <- data.frame(table(bigram))
bigram.df <- bigram.df[order(bigram.df$Freq, decreasing = TRUE),]

plot2<-ggplot(bigram.df[1:15,], aes(x=reorder(bigram, -Freq), y=Freq)) +
    geom_bar(stat = "identity")+ 
    xlab("Bigram") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90, hjust=1))+
    labs(title = "Most common word pairs")
plot2

trigram.df <- data.frame(table(trigram))
trigram.df <- trigram.df[order(trigram.df$Freq, decreasing = TRUE),]

plot3<-ggplot(trigram.df[1:15,], aes(x=reorder(trigram, -Freq), y=Freq)) +
    geom_bar(stat = "identity")+ 
    xlab("Trigram") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90, hjust=1))+
    labs(title = "Most common word triplets")

plot3

Future Work

Based on the results of the exploratory and n-gram analyses, the prediction algorithm and Shiny app should include the following improvements, features, and functions:

1. A prediction model based on the n-gram frequencies (a minimal sketch follows this list)

2. Building the n-gram model on the complete data set

3. Further cleaning the data to improve prediction accuracy

4. Automatically updating the data set to personalize the prediction model
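
As a starting point for the prediction model in item 1, the n-gram frequency tables built above can drive a simple backoff predictor: look for trigrams that start with the last two words of the input, back off to bigrams that start with the last word, and finally fall back to the most frequent unigram. The sketch below is only an outline; the function name predict_next_word is my own, and it assumes the unigram.df, bigram.df and trigram.df data frames created earlier and that the corpus text has been lowercased.

# Sketch: simple backoff prediction from the n-gram frequency tables
predict_next_word <- function(phrase,
                              uni = unigram.df, bi = bigram.df, tri = trigram.df) {
    words <- unlist(strsplit(tolower(phrase), "\\s+"))
    n <- length(words)
    last_word_of <- function(ngram) tail(strsplit(as.character(ngram), " ")[[1]], 1)

    # 1. Trigrams whose first two words match the last two input words
    if (n >= 2) {
        hits <- tri[grepl(paste0("^", words[n - 1], " ", words[n], " "), tri$trigram), ]
        if (nrow(hits) > 0)
            return(last_word_of(hits$trigram[which.max(hits$Freq)]))
    }

    # 2. Back off to bigrams whose first word matches the last input word
    if (n >= 1) {
        hits <- bi[grepl(paste0("^", words[n], " "), bi$bigram), ]
        if (nrow(hits) > 0)
            return(last_word_of(hits$bigram[which.max(hits$Freq)]))
    }

    # 3. Fall back to the single most frequent word
    as.character(uni$unigram[which.max(uni$Freq)])
}

predict_next_word("thanks for the")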