Data Science Capstone Milestone Report Week 2

Assumption

The data file has been unzipped and stored in a local folder.

Set Working directory

Set the local folder as working directory

# Make the folder that holds the unzipped en_US corpus files the working
# directory, so the reads that follow can use bare file names.
setwd(dir = "/Users/ashwini/Desktop/Data Science/final/en_US")

Read Data

Read the data from each file: blogs, news, and Twitter.

# Load each corpus file into a character vector (one element per line).
# encoding = "UTF-8" marks the strings as UTF-8; skipNul = TRUE skips any
# embedded NUL bytes in the files.
blogs <- readLines(
  con = "en_US.blogs.txt",
  skipNul = TRUE,
  encoding = "UTF-8"
)
news <- readLines(
  con = "en_US.news.txt",
  skipNul = TRUE,
  encoding = "UTF-8"
)
twitter <- readLines(
  con = "en_US.twitter.txt",
  skipNul = TRUE,
  encoding = "UTF-8"
)

Load Required Packages

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: NLP

## 
## This data.table install has not detected OpenMP support. It will work but slower in single threaded mode.

## quanteda version 0.9.8.5

## 
## Attaching package: 'quanteda'

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:NLP':
## 
##     ngrams

## The following object is masked from 'package:base':
## 
##     sample

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

Dataset details

Basic data information for each file blog, news, twitter

# Per-corpus summary table: total lines, characters and words, plus the
# smallest and largest number of words found on a single line.
corpora <- list(blogs, news, twitter)

# Exact min / max words per line. range() is used rather than
# summary()[c("Min.", "Max.")] because summary() may round its values to a
# few significant digits (older R releases), which would distort a large
# maximum. vapply() pins the result to a 2-row numeric matrix.
Dataset_info <- vapply(
  corpora,
  function(x) range(stri_count_words(x)),
  numeric(2)
)
rownames(Dataset_info) <- c("Min", "Max")

# Line and character totals; row names ("Lines", "Chars") are taken from the
# named vector returned for the first corpus.
line_char_counts <- vapply(
  corpora,
  function(x) stri_stats_general(x)[c("Lines", "Chars")],
  numeric(2)
)

# Word totals as reported by stri_stats_latex().
word_counts <- vapply(
  corpora,
  function(x) stri_stats_latex(x)[["Words"]],
  numeric(1)
)

# One row per corpus, columns: Dataset, Lines, Chars, Words, Min, Max.
stats <- data.frame(
  Dataset = c("blogs", "news", "twitter"),
  t(rbind(line_char_counts, Words = word_counts, Dataset_info))
)

knitr::kable(stats)

Dataset	Lines	Chars	Words	Min	Max
blogs	899288	206824382	37570839	0	6726
news	1010242	203223154	34494539	1	1796
twitter	2360148	162096241	30451170	1	47

Clean the data and calculate frequency of ngram

Prepare the sample data

# Draw a reproducible 5% random sample of lines from each corpus and strip
# every byte that cannot be represented in ASCII.
set.seed(1024)

# Sample `fraction` of `lines` (without replacement), then drop non-ASCII
# bytes from the sampled lines only.
#
# The sample is taken before iconv() so that just the retained lines are
# converted instead of the whole corpus. iconv() converts each element
# independently and the random draw depends only on the vector length and
# the sample size, so the result matches converting first.
sample_ascii_lines <- function(lines, fraction = 0.05) {
  # floor() makes the truncation of the fractional sample size explicit.
  n_keep <- floor(length(lines) * fraction)
  picked <- sample(lines, n_keep)
  iconv(picked, "latin1", "ASCII", sub = "")
}

# Order matters: each call advances the random number stream.
sampledataset_blogs <- sample_ascii_lines(blogs)
sampledataset_news <- sample_ascii_lines(news)
sampledataset_twitter <- sample_ascii_lines(twitter)

1-GRAM

1- GRAM Word and Frequency Matrix - Blog

# Tabulate 15 unigram features from the blog sample.
# NOTE(review): [seq_len(15)] keeps the first 15 features in the order the
# dfm stores them, not the 15 most frequent ones -- confirm this is intended.
# NOTE(review): docfreq() is presumably quanteda's document frequency (how
# many sampled lines contain the term) rather than a total count -- confirm.
unigram_blogs <- dfm(
  sampledataset_blogs,
  verbose = FALSE,
  toLower = TRUE,
  removePunct = TRUE,
  removeSeparators = TRUE
)
unigram_blogs <- docfreq(unigram_blogs)[seq_len(15)]
unigram_blogs <- data.frame(
  word = names(unigram_blogs),
  frequency = unigram_blogs
)
knitr::kable(unigram_blogs)

	word	frequency
the	the	27361
glass	glass	168
sees	sees	67
truth	truth	275
and	and	23453
is	is	12745
creation	creation	79
it	it	11980
what	what	4554
we	we	5292
want	want	1562
to	to	22675
see	see	2306
yourself	yourself	321
not	not	6575

1- GRAM Word and Frequency Matrix - News

# Tabulate 15 unigram features from the news sample.
# NOTE(review): [seq_len(15)] keeps the first 15 features in the order the
# dfm stores them, not the 15 most frequent ones -- confirm this is intended.
# NOTE(review): docfreq() is presumably quanteda's document frequency (how
# many sampled lines contain the term) rather than a total count -- confirm.
unigram_news <- dfm(
  sampledataset_news,
  verbose = FALSE,
  toLower = TRUE,
  removePunct = TRUE,
  removeSeparators = TRUE
)
unigram_news <- docfreq(unigram_news)[seq_len(15)]
unigram_news <- data.frame(
  word = names(unigram_news),
  frequency = unigram_news
)
knitr::kable(unigram_news)

	word	frequency
plate	plate	106
the	the	37282
poached	poached	8
pears	pears	8
and	and	26558
truffles	truffles	7
swept	swept	30
money	money	834
out	out	3291
an	an	5503
employment	employment	66
department	department	597
fund	fund	205
a	a	25474
risk	risk	183

1- GRAM Word and Frequency Matrix - Twitter

# Tabulate 15 unigram features from the Twitter sample.
# NOTE(review): [seq_len(15)] keeps the first 15 features in the order the
# dfm stores them, not the 15 most frequent ones -- confirm this is intended.
# NOTE(review): docfreq() is presumably quanteda's document frequency (how
# many sampled lines contain the term) rather than a total count -- confirm.
unigram_twitter <- dfm(
  sampledataset_twitter,
  verbose = FALSE,
  toLower = TRUE,
  removePunct = TRUE,
  removeSeparators = TRUE
)
unigram_twitter <- docfreq(unigram_twitter)[seq_len(15)]
unigram_twitter <- data.frame(
  word = names(unigram_twitter),
  frequency = unigram_twitter
)
knitr::kable(unigram_twitter)

	word	frequency
hey	hey	1269
if	if	5204
you	you	22079
are	are	7410
feeling	feeling	474
crazy	crazy	528
tk	tk	6
has	has	2278
all	all	5925
the	the	36523
equipment	equipment	23
and	and	19524
ability	ability	49
would	would	2518
need	need	2411

1- GRAM Plots

1 - Gram Plot - Blog

# Plot each word in unigram_blogs against its frequency.
# The y-axis label is spelled out correctly as "frequency".
qplot(
  x = unigram_blogs$word,
  y = unigram_blogs$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "1 Gram Plot - blogs"
)

1 - Gram Plot - News

# Plot each word in unigram_news against its frequency.
# The y-axis label is spelled out correctly as "frequency".
qplot(
  x = unigram_news$word,
  y = unigram_news$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "1 Gram - News"
)

1 - Gram Plot - Twitter

# Plot each word in unigram_twitter against its frequency.
# The y-axis label is spelled out correctly as "frequency".
qplot(
  x = unigram_twitter$word,
  y = unigram_twitter$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "1 Gram - Twitter"
)

2-GRAM

2- GRAM Word and Frequency Matrix - Blog

# Tabulate 15 bigram features (two words joined by a space) from the blog
# sample.
# NOTE(review): [seq_len(15)] keeps the first 15 features in the order the
# dfm stores them, not the 15 most frequent ones -- confirm this is intended.
# NOTE(review): docfreq() is presumably quanteda's document frequency (how
# many sampled lines contain the bigram) rather than a total count -- confirm.
bigram_blogs <- dfm(
  sampledataset_blogs,
  verbose = FALSE,
  toLower = TRUE,
  removePunct = TRUE,
  removeSeparators = TRUE,
  ngrams = 2,
  concatenator = " "
)
bigram_blogs <- docfreq(bigram_blogs)[seq_len(15)]
bigram_blogs <- data.frame(
  word = names(bigram_blogs),
  frequency = bigram_blogs
)
knitr::kable(bigram_blogs)

	word	frequency
the glass	the glass	29
glass sees	glass sees	1
sees truth	sees truth	1
truth and	truth and	35
and truth	and truth	9
truth is	truth is	53
is creation	is creation	1
creation it	creation it	3
it sees	it sees	1
sees what	sees what	2
what we	what we	199
we want	we want	70
want to	want to	1072
to see	to see	828
see see	see see	1

2- GRAM Word and Frequency Matrix - News

# Tabulate 15 bigram features (two words joined by a space) from the news
# sample.
# NOTE(review): [seq_len(15)] keeps the first 15 features in the order the
# dfm stores them, not the 15 most frequent ones -- confirm this is intended.
# NOTE(review): docfreq() is presumably quanteda's document frequency (how
# many sampled lines contain the bigram) rather than a total count -- confirm.
bigram_news <- dfm(
  sampledataset_news,
  verbose = FALSE,
  toLower = TRUE,
  removePunct = TRUE,
  removeSeparators = TRUE,
  ngrams = 2,
  concatenator = " "
)
bigram_news <- docfreq(bigram_news)[seq_len(15)]
bigram_news <- data.frame(
  word = names(bigram_news),
  frequency = bigram_news
)
knitr::kable(bigram_news)

	word	frequency
plate the	plate the	3
the poached	the poached	2
poached pears	poached pears	1
pears and	pears and	3
and truffles	and truffles	1
swept money	swept money	1
money out	money out	6
out an	out an	19
an employment	an employment	3
employment department	employment department	1
department fund	department fund	1
fund a	fund a	6
a risk	a risk	21
risk management	risk management	3
management fund	management fund	1

2- GRAM Word and Frequency Matrix - Twitter

# Tabulate 15 bigram features (two words joined by a space) from the Twitter
# sample.
# NOTE(review): [seq_len(15)] keeps the first 15 features in the order the
# dfm stores them, not the 15 most frequent ones -- confirm this is intended.
# NOTE(review): docfreq() is presumably quanteda's document frequency (how
# many sampled lines contain the bigram) rather than a total count -- confirm.
bigram_twitter <- dfm(
  sampledataset_twitter,
  verbose = FALSE,
  toLower = TRUE,
  removePunct = TRUE,
  removeSeparators = TRUE,
  ngrams = 2,
  concatenator = " "
)
bigram_twitter <- docfreq(bigram_twitter)[seq_len(15)]
bigram_twitter <- data.frame(
  word = names(bigram_twitter),
  frequency = bigram_twitter
)
knitr::kable(bigram_twitter)

	word	frequency
hey if	hey if	10
if you	if you	1578
you are	you are	1128
are feeling	are feeling	8
feeling crazy	feeling crazy	1
crazy tk	crazy tk	1
tk has	tk has	1
has all	has all	6
all the	all the	947
the equipment	the equipment	3
equipment and	equipment and	2
and ability	and ability	4
ability you	ability you	1
you would	you would	147
would need	would need	5

2- GRAM Plots

2 - Gram Plot - Blog

# Plot each bigram in bigram_blogs against its frequency.
# The y-axis label is spelled out correctly as "frequency".
qplot(
  x = bigram_blogs$word,
  y = bigram_blogs$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "2 Gram Plot - blogs"
)

2 - Gram Plot - News

# Plot each bigram in bigram_news against its frequency.
# The y-axis label is spelled out correctly as "frequency".
qplot(
  x = bigram_news$word,
  y = bigram_news$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "2 Gram - News"
)

2 - Gram Plot - Twitter

# Plot each bigram in bigram_twitter against its frequency.
# The y-axis label is spelled out correctly as "frequency".
qplot(
  x = bigram_twitter$word,
  y = bigram_twitter$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "2 Gram - Twitter"
)

Data Science Capstone Milestone Report Week 2

Ashwini

11/27/2016

Assumption

Set Working directory

Read Data

Load Required Packages

Dataset details

Clean the data and calculate frequency of ngram

1-GRAM

1- GRAM Word and Frequency Matrix - Blog

1- GRAM Word and Frequency Matrix - News

1- GRAM Word and Frequency Matrix - Twitter

1- GRAM Plots

1 - Gram Plot - Blog

1 - Gram Plot - News

1 - Gram Plot - Twitter

2-GRAM

2- GRAM Word and Frequency Matrix - Blog

2- GRAM Word and Frequency Matrix - News

2- GRAM Word and Frequency Matrix - Twitter

2- GRAM Plots

2 - Gram Plot - Blog

2 - Gram Plot - News

2 - Gram Plot - Twitter