This is the milestone report for the capstone project. We have data in three text files sourced from blogs, news, and Twitter. Our aim here is to perform exploratory data analysis on this data and compute summary statistics.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
library(R.utils)
## Warning: package 'R.utils' was built under R version 3.5.3
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.5.3
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.23.0 successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.9.2 successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
library(stringi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Set the working directory to the folder that holds the en_US text files
setwd("D:/profile/documents/en_Us")
# Read the blogs and twitter files
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Read the news file in binary mode, as the file has special characters.
# The lines must be read from the connection itself: passing the file name to
# readLines() again (as was done before) ignores the binary-mode connection,
# and that read ended with an "incomplete final line" warning.
con <- file("en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
# File sizes on disk, converted from bytes to megabytes
bytes_per_mb <- 1024 * 1024
blog_size <- file.size("en_US.blogs.txt") / bytes_per_mb
twitter_size <- file.size("en_US.twitter.txt") / bytes_per_mb
news_size <- file.size("en_US.news.txt") / bytes_per_mb

# Line counts, taken from the files themselves (R.utils::countLines)
blog_lines <- countLines("en_US.blogs.txt")
twitter_lines <- countLines("en_US.twitter.txt")
news_lines <- countLines("en_US.news.txt")

# Word counts: the 4th entry of stri_stats_latex() is the "Words" statistic
blogs_words <- stri_stats_latex(blogs)[4]
twitter_words <- stri_stats_latex(twitter)[4]
news_words <- stri_stats_latex(news)[4]

# Character counts, summed over every line that was read into memory
blogs_char <- sum(nchar(blogs))
twitter_char <- sum(nchar(twitter))
news_char <- sum(nchar(news))

# One row per source; sizes are truncated to whole megabytes for display
data.frame(
  "File Name" = c("blogs", "twitter", "news"),
  size = as.integer(c(blog_size, twitter_size, news_size)),
  lines = c(blog_lines, twitter_lines, news_lines),
  words = c(blogs_words, twitter_words, news_words),
  characters = c(blogs_char, twitter_char, news_char)
)
## File.Name size lines words characters
## 1 blogs 200 899288 37570839 206824505
## 2 twitter 159 2360148 30451170 162096241
## 3 news 196 1010242 2651432 15639408
Because the data set is large and processing it in full would take a lot of computation time and resources, we will create a sample.
# Draw a reproducible random sample of lines from each source and save it.
set.seed(123)
# Number of lines to draw per source
sample_size <- 15000

# sample() on a character vector picks elements directly, so no 1:length(x)
# index vector is needed. For a source with at least `sample_size` lines this
# draws exactly the same lines as indexing with sample(1:length(x), n);
# min() keeps the draw from failing on a source with fewer lines than that.
blog_sample <- sample(blogs, min(sample_size, length(blogs)))
twitter_sample <- sample(twitter, min(sample_size, length(twitter)))
news_sample <- sample(news, min(sample_size, length(news)))

# showWarnings = FALSE: do not warn when the folder is left over from an
# earlier run
dir.create("data_sample", showWarnings = FALSE)
write(blog_sample, "data_sample/blog_sample.txt")
write(twitter_sample, "data_sample/twitter_sample.txt")
write(news_sample, "data_sample/news_sample.txt")
We will create a corpus from the sample data so that we can clean and manipulate it.
library(NLP)
library(tm)
# Pool the three samples into a single character vector
data_corpus <- c(blog_sample, twitter_sample, news_sample)
# Wrapping the vector in list() hands VectorSource() a one-element list, so
# the pooled text goes into the corpus as a single entry
my_corpus <- data_corpus %>%
  list() %>%
  VectorSource() %>%
  VCorpus()
Now that we have created our corpus, we need to clean it. To do that, we will transform all characters to lowercase, remove punctuation, remove numbers, and remove common English stopwords (and, the, or, etc.).
# Normalise the corpus: lowercase, drop stopwords, punctuation and numbers,
# then collapse the whitespace left behind.
my_corpus <- tm_map(my_corpus, content_transformer(tolower))
# Stopwords must be removed BEFORE punctuation: the English stopword list
# contains apostrophe forms such as "don't", "can't" and "i'm". Once the
# apostrophes are stripped these become "dont", "cant" and "im", which no
# longer match the list and would stay in the corpus.
# NOTE(review): contractions typed with a typographic apostrophe are
# presumably still not matched - confirm against the data.
my_corpus <- tm_map(my_corpus, removeWords, stopwords("english"))
my_corpus <- tm_map(my_corpus, removePunctuation)
my_corpus <- tm_map(my_corpus, removeNumbers)
my_corpus <- tm_map(my_corpus, stripWhitespace)
Now, to remove profanity, we will use the Google bad-words database.
# Load the bad-word list (whitespace-separated, no header row) and keep only
# its first column as the set of words to strip from the corpus
google_badwords <- read.delim("badwords.txt", sep = "", header = FALSE)[, 1]
my_corpus <- tm_map(my_corpus, removeWords, google_badwords)

# Write the cleaned corpus out to a text file, then read that file back in as
# plain lines
writeCorpus(my_corpus, filenames = "my_corpus.txt")
my_corpus1 <- readLines("my_corpus.txt")
library(NLP)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Delimiter characters; not referenced by the tokenizers in this section
delim <- " \\r\\n\\t.,;:\"()?!"

# Tokenizer that emits single words (1-grams)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigram_matrix <- TermDocumentMatrix(my_corpus, control = list(tokenize = unigram))

# Keep only the terms that occur at least 1000 times and total their counts
unigram_corpus <- findFreqTerms(unigram_matrix, lowfreq = 1000)
unigram_corpus_num <- rowSums(as.matrix(unigram_matrix[unigram_corpus, ]))
unigram_corpus_tab <- data.frame(Word = names(unigram_corpus_num), frequency = unigram_corpus_num)
unigram_corpus_tab <- unigram_corpus_tab[order(-unigram_corpus_tab$frequency), ]

# Show the 30 most frequent terms. head() is used instead of [1:30, ] so that
# no all-NA filler rows are printed when fewer than 30 terms pass the threshold.
head(unigram_corpus_tab, 30)
## Word frequency
## said said 4406
## will will 4097
## one one 3852
## just just 3376
## like like 3144
## can can 3054
## time time 2826
## get get 2547
## new new 2407
## now now 2129
## people people 2021
## good good 1941
## also also 1889
## day day 1874
## first first 1862
## know know 1832
## back back 1680
## make make 1634
## last last 1628
## two two 1601
## year year 1601
## see see 1584
## love love 1567
## think think 1527
## much much 1518
## even even 1517
## going going 1499
## really really 1487
## well well 1483
## way way 1435

# Bar chart of the 15 most frequent terms, tallest bar first
ggplot(head(unigram_corpus_tab, 15), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "Identity", fill = "blue") +
  ggtitle("Unigram Freq") +
  geom_text(aes(label = frequency), vjust = -0.5) +
  ylab("Frequency") +
  xlab("Words")
# Tokenizer that emits two-word sequences (2-grams)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram_matrix <- TermDocumentMatrix(my_corpus, control = list(tokenize = bigram))

# Keep only the bigrams that occur at least 80 times and total their counts
bigram_corpus <- findFreqTerms(bigram_matrix, lowfreq = 80)
bigram_corpus_num <- rowSums(as.matrix(bigram_matrix[bigram_corpus, ]))
bigram_corpus_tab <- data.frame(Word = names(bigram_corpus_num), frequency = bigram_corpus_num)
bigram_corpus_tab <- bigram_corpus_tab[order(-bigram_corpus_tab$frequency), ]

# Show the 30 most frequent bigrams. head() is used instead of [1:30, ] so that
# no all-NA filler rows are printed when fewer than 30 bigrams pass the
# threshold.
head(bigram_corpus_tab, 30)
## Word frequency
## last year last year 276
## right now right now 263
## new york new york 246
## high school high school 198
## last week last week 189
## years ago years ago 178
## even though even though 168
## first time first time 165
## dont know dont know 159
## last night last night 147
## cant wait cant wait 135
## new jersey new jersey 131
## feel like feel like 130
## st louis st louis 128
## im going im going 124
## united states united states 116
## can get can get 114
## dont want dont want 112
## make sure make sure 110
## los angeles los angeles 107
## every day every day 106
## san francisco san francisco 105
## one day one day 104
## said said 104
## many people many people 103
## two years two years 101
## looking forward looking forward 98
## looks like looks like 98
## next week next week 95
## can see can see 90

# Bar chart of the 15 most frequent bigrams; x labels are tilted to fit
ggplot(head(bigram_corpus_tab, 15), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "Identity", fill = "blue") +
  ggtitle("Bigram Freq") +
  geom_text(aes(label = frequency), vjust = -0.5) +
  ylab("Frequency") +
  xlab("Words") +
  theme(axis.text.x = element_text(angle = 60))
# Tokenizer that emits three-word sequences (3-grams)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigram_matrix <- TermDocumentMatrix(my_corpus, control = list(tokenize = trigram))

# Keep only the trigrams that occur at least 10 times and total their counts
trigram_corpus <- findFreqTerms(trigram_matrix, lowfreq = 10)
trigram_corpus_num <- rowSums(as.matrix(trigram_matrix[trigram_corpus, ]))
trigram_corpus_tab <- data.frame(Word = names(trigram_corpus_num), frequency = trigram_corpus_num)
trigram_corpus_tab <- trigram_corpus_tab[order(-trigram_corpus_tab$frequency), ]

# Show up to 30 of the most frequent trigrams. Indexing with [1:30, ] printed
# an all-NA row whenever fewer than 30 trigrams passed the threshold; head()
# simply stops at the last real row.
head(trigram_corpus_tab, 30)
## Word frequency
## new york city new york city 34
## cant wait see cant wait see 30
## happy mothers day happy mothers day 23
## president barack obama president barack obama 21
## two years ago two years ago 21
## let us know let us know 19
## u u u u u u 18
## dont even know dont even know 17
## new york times new york times 17
## st louis county st louis county 16
## will take place will take place 16
## first time since first time since 15
## gov chris christie gov chris christie 15
## four years ago four years ago 14
## world war ii world war ii 14
## happy new year happy new year 13
## dont get wrong dont get wrong 12
## high school students high school students 12
## new years eve new years eve 12
## us district judge us district judge 12
## cant wait get cant wait get 11
## john smiths grand john smiths grand 11
## past two years past two years 11
## pates fountain parks pates fountain parks 11
## cant wait hear cant wait hear 10
## im pretty sure im pretty sure 10
## martin luther king martin luther king 10
## new york new new york new 10
## smiths grand national smiths grand national 10

# Bar chart of the 10 most frequent trigrams; x labels are tilted to fit
ggplot(head(trigram_corpus_tab, 10), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "Identity", fill = "blue") +
  ggtitle("Trigram Freq") +
  geom_text(aes(label = frequency), vjust = -0.5) +
  ylab("Frequency") +
  xlab("Words") +
  theme(axis.text.x = element_text(angle = 60))
This concludes our exploratory data analysis for the project. In the next steps, we are going to build our predictive algorithm using a Shiny app and other machine learning techniques.