Data Science Capstone Milestone Report

Introduction

This is the milestone report for the capstone project. We have data in three text files, from blogs, news and Twitter. Our aim here is to perform exploratory data analysis on this data and obtain summary statistics.

Data

# Set the default knitr chunk options: echo = TRUE and warning = FALSE.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)


library(R.utils)

## Warning: package 'R.utils' was built under R version 3.5.3

## Loading required package: R.oo

## Warning: package 'R.oo' was built under R version 3.5.3

## Loading required package: R.methodsS3

## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.

## R.oo v1.23.0 successfully loaded. See ?R.oo for help.

## 
## Attaching package: 'R.oo'

## The following object is masked from 'package:R.methodsS3':
## 
##     throw

## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods

## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save

## R.utils v2.9.2 successfully loaded. See ?R.utils for help.

## 
## Attaching package: 'R.utils'

## The following object is masked from 'package:utils':
## 
##     timestamp

## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings

library(stringi)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Set the working directory to the folder holding the three text files.
# NOTE(review): this is a machine-specific absolute path; anyone re-running
# the report elsewhere has to adjust it.
setwd("D:/profile/documents/en_Us")

# Read the blogs and twitter files (skipNul is set for the twitter read).
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8")

twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# Read the news file in binary mode, as the file has special characters.
# The lines must be read from `con` itself: passing the file name again makes
# readLines() open its own connection and leaves the binary one unused.
con <- file("en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8")

## Warning in readLines("en_US.news.txt", encoding = "UTF-8"): incomplete
## final line found on 'en_US.news.txt'

# Close the connection opened for the news file and remove its handle.
close(con)
rm(con)

Observing Data

# Size of each file on disk, converted from bytes to megabytes
blog_size <- file.info("en_US.blogs.txt")$size / 1024^2
twitter_size <- file.info("en_US.twitter.txt")$size / 1024^2
news_size <- file.info("en_US.news.txt")$size / 1024^2

# Line counts, taken from the files on disk
blog_lines <- countLines("en_US.blogs.txt")
twitter_lines <- countLines("en_US.twitter.txt")
news_lines <- countLines("en_US.news.txt")

# Word counts of the text held in memory; "Words" is the fourth element
# of the vector returned by stri_stats_latex()
blogs_words <- stri_stats_latex(blogs)["Words"]
twitter_words <- stri_stats_latex(twitter)["Words"]
news_words <- stri_stats_latex(news)["Words"]

# Character counts of the text held in memory
blogs_char <- sum(nchar(blogs))
twitter_char <- sum(nchar(twitter))
news_char <- sum(nchar(news))

# One row per source; sizes are truncated to whole megabytes
data.frame(
  "File Name" = c("blogs", "twitter", "news"),
  "size" = as.integer(c(blog_size, twitter_size, news_size)),
  "lines" = c(blog_lines, twitter_lines, news_lines),
  "words" = c(blogs_words, twitter_words, news_words),
  "characters" = c(blogs_char, twitter_char, news_char)
)

##   File.Name size   lines    words characters
## 1     blogs  200  899288 37570839  206824505
## 2   twitter  159 2360148 30451170  162096241
## 3      news  196 1010242  2651432   15639408

Sampling

As our data is large and processing it in full would take a lot of computational time and resources, we will create a sample.

# Fix the RNG seed so the same lines are drawn on every run.
set.seed(123)

# Draw 15000 lines from each source. seq_along() replaces 1:length(x),
# which would yield c(1, 0) for an empty input instead of an empty index.
blog_sample <- blogs[sample(seq_along(blogs), 15000)]

twitter_sample <- twitter[sample(seq_along(twitter), 15000)]

news_sample <- news[sample(seq_along(news), 15000)]

# Create the output folder; stay quiet if it already exists from an
# earlier run.
dir.create("data_sample", showWarnings = FALSE)

# Persist each sample, one line of text per file line.
write(blog_sample, "data_sample/blog_sample.txt")

write(twitter_sample, "data_sample/twitter_sample.txt")

write(news_sample, "data_sample/news_sample.txt")

Create and Clean Corpus

We will create a corpus from the sample data so that we can clean and manipulate it.

library(NLP)
library(tm)

# Pool the three samples into a single character vector.
data_corpus<- c(blog_sample, twitter_sample, news_sample)
# The vector is wrapped in list() before being handed to VectorSource.
# NOTE(review): presumably this makes the corpus a single document holding
# all sampled lines (consistent with the single output file written by
# writeCorpus later in the script) - confirm.
my_corpus<- VCorpus(VectorSource(list(data_corpus)))

Now that we have created our corpus, we need to clean it. For that, we will transform all characters to lowercase, remove the punctuation, remove the numbers and remove the common English stopwords (and, the, or, etc.).

# Apply the cleaning steps to the corpus one after another, in this order:
# lowercase, strip punctuation, strip digits, drop English stopwords,
# collapse runs of whitespace.
# NOTE(review): punctuation is removed before stopwords, so contractions
# such as "don't" have presumably lost their apostrophe by the time the
# stopword list is applied - confirm whether that is intended.
my_corpus <- Reduce(
  function(corp, step) step(corp),
  list(
    function(corp) tm_map(corp, content_transformer(tolower)),
    function(corp) tm_map(corp, removePunctuation),
    function(corp) tm_map(corp, removeNumbers),
    function(corp) tm_map(corp, removeWords, stopwords("english")),
    function(corp) tm_map(corp, stripWhitespace)
  ),
  my_corpus
)

Now, to remove profanity, we will use the Google bad-words database.

# Load the bad-word list. sep = "" splits on whitespace and there is no
# header row. stringsAsFactors = FALSE keeps the column as character on
# R < 4.0 as well, where read.delim() would otherwise return a factor.
google_badwords <- read.delim(
  "badwords.txt",
  sep = "",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Keep only the first field of each row, as a plain character vector.
google_badwords <- as.character(google_badwords[, 1])

# Drop every listed word from the corpus.
my_corpus <- tm_map(my_corpus, removeWords, google_badwords)

# Write the cleaned corpus to disk and read it back as lines of text.
writeCorpus(my_corpus, filenames = "my_corpus.txt")

my_corpus1 <- readLines("my_corpus.txt")

Exploratory Data Analysis

Unigrams

library(NLP)

library(RWeka)

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

# Delimiter set. NOTE(review): it is not passed to the tokenizer below, so
# NGramTokenizer runs with whatever delimiters it defaults to - confirm
# whether this was meant to be supplied via Weka_control().
delim <- " \\r\\n\\t.,;:\"()?!"

# Tokenizer that emits single words only (n-gram size fixed at 1).
unigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}

unigram_matrix <- TermDocumentMatrix(
  my_corpus,
  control = list(tokenize = unigram)
)

# Terms that occur at least 1000 times.
unigram_corpus <- findFreqTerms(unigram_matrix, lowfreq = 1000)

# Total count per retained term, summed across documents.
unigram_corpus_num <- rowSums(as.matrix(unigram_matrix[unigram_corpus, ]))

unigram_corpus_tab <- data.frame(
  Word = names(unigram_corpus_num),
  frequency = unigram_corpus_num
)

# Most frequent first.
unigram_corpus_tab <- unigram_corpus_tab[order(-unigram_corpus_tab$frequency), ]

# head() rather than [1:30, ]: indexing past the last row pads the result
# with all-NA rows when fewer than 30 terms pass the threshold.
head(unigram_corpus_tab, 30)

##          Word frequency
## said     said      4406
## will     will      4097
## one       one      3852
## just     just      3376
## like     like      3144
## can       can      3054
## time     time      2826
## get       get      2547
## new       new      2407
## now       now      2129
## people people      2021
## good     good      1941
## also     also      1889
## day       day      1874
## first   first      1862
## know     know      1832
## back     back      1680
## make     make      1634
## last     last      1628
## two       two      1601
## year     year      1601
## see       see      1584
## love     love      1567
## think   think      1527
## much     much      1518
## even     even      1517
## going   going      1499
## really really      1487
## well     well      1483
## way       way      1435

# Bar chart of the 15 most frequent unigrams, tallest bar first, with the
# count printed above each bar.
ggplot(
  head(unigram_corpus_tab, 15),
  aes(x = reorder(Word, -frequency), y = frequency)
) +
  geom_bar(stat = "Identity", fill = "blue") +
  geom_text(aes(label = frequency), vjust = -0.5) +
  labs(title = "Unigram Freq", x = "Words", y = "Frequency")

Bigrams

# Tokenizer that emits two-word sequences only (n-gram size fixed at 2).
bigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

bigram_matrix <- TermDocumentMatrix(
  my_corpus,
  control = list(tokenize = bigram)
)

# Bigrams that occur at least 80 times.
bigram_corpus <- findFreqTerms(bigram_matrix, lowfreq = 80)

# Total count per retained bigram, summed across documents.
bigram_corpus_num <- rowSums(as.matrix(bigram_matrix[bigram_corpus, ]))

bigram_corpus_tab <- data.frame(
  Word = names(bigram_corpus_num),
  frequency = bigram_corpus_num
)

# Most frequent first.
bigram_corpus_tab <- bigram_corpus_tab[order(-bigram_corpus_tab$frequency), ]

# head() rather than [1:30, ]: indexing past the last row pads the result
# with all-NA rows when fewer than 30 bigrams pass the threshold.
head(bigram_corpus_tab, 30)

##                            Word frequency
## last year             last year       276
## right now             right now       263
## new york               new york       246
## high school         high school       198
## last week             last week       189
## years ago             years ago       178
## even though         even though       168
## first time           first time       165
## dont know             dont know       159
## last night           last night       147
## cant wait             cant wait       135
## new jersey           new jersey       131
## feel like             feel like       130
## st louis               st louis       128
## im going               im going       124
## united states     united states       116
## can get                 can get       114
## dont want             dont want       112
## make sure             make sure       110
## los angeles         los angeles       107
## every day             every day       106
## san francisco     san francisco       105
## one day                 one day       104
## said                    said        104
## many people         many people       103
## two years             two years       101
## looking forward looking forward        98
## looks like           looks like        98
## next week             next week        95
## can see                 can see        90

# Bar chart of the 15 most frequent bigrams, tallest bar first, with the
# count printed above each bar; x labels are rotated by 60 degrees.
ggplot(
  head(bigram_corpus_tab, 15),
  aes(x = reorder(Word, -frequency), y = frequency)
) +
  geom_bar(stat = "Identity", fill = "blue") +
  geom_text(aes(label = frequency), vjust = -0.5) +
  labs(title = "Bigram Freq", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

Trigrams

# Tokenizer that emits three-word sequences only (n-gram size fixed at 3).
trigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

trigram_matrix <- TermDocumentMatrix(
  my_corpus,
  control = list(tokenize = trigram)
)

# Trigrams that occur at least 10 times.
trigram_corpus <- findFreqTerms(trigram_matrix, lowfreq = 10)

# Total count per retained trigram, summed across documents.
trigram_corpus_num <- rowSums(as.matrix(trigram_matrix[trigram_corpus, ]))

trigram_corpus_tab <- data.frame(
  Word = names(trigram_corpus_num),
  frequency = trigram_corpus_num
)

# Most frequent first.
trigram_corpus_tab <- trigram_corpus_tab[order(-trigram_corpus_tab$frequency), ]

# head() rather than [1:30, ]: fewer than 30 trigrams can pass the
# threshold, and indexing past the last row prints an all-NA row.
head(trigram_corpus_tab, 30)

##                                          Word frequency
## new york city                   new york city        34
## cant wait see                   cant wait see        30
## happy mothers day           happy mothers day        23
## president barack obama president barack obama        21
## two years ago                   two years ago        21
## let us know                       let us know        19
## u u u                                   u u u        18
## dont even know                 dont even know        17
## new york times                 new york times        17
## st louis county               st louis county        16
## will take place               will take place        16
## first time since             first time since        15
## gov chris christie         gov chris christie        15
## four years ago                 four years ago        14
## world war ii                     world war ii        14
## happy new year                 happy new year        13
## dont get wrong                 dont get wrong        12
## high school students     high school students        12
## new years eve                   new years eve        12
## us district judge           us district judge        12
## cant wait get                   cant wait get        11
## john smiths grand         john smiths grand        11
## past two years                 past two years        11
## pates fountain parks     pates fountain parks        11
## cant wait hear                 cant wait hear        10
## im pretty sure                 im pretty sure        10
## martin luther king         martin luther king        10
## new york new                     new york new        10
## smiths grand national smiths grand national        10
## NA                                       <NA>        NA

# Bar chart of the 10 most frequent trigrams, tallest bar first, with the
# count printed above each bar; x labels are rotated by 60 degrees.
ggplot(
  head(trigram_corpus_tab, 10),
  aes(x = reorder(Word, -frequency), y = frequency)
) +
  geom_bar(stat = "Identity", fill = "blue") +
  geom_text(aes(label = frequency), vjust = -0.5) +
  labs(title = "Trigram Freq", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

Conclusion

This concludes our exploratory data analysis for the project. In the next steps we are going to build our predictive algorithm using a Shiny app and other machine learning techniques.