The data file is unzipped and stored in a local folder.
Set that local folder as the working directory.
# Make the folder that holds the unzipped en_US corpus files the working
# directory, so the relative file names passed to readLines() below resolve.
# NOTE(review): this is an absolute, machine-specific path; the call fails on
# any machine where the folder does not exist.
setwd("/Users/ashwini/Desktop/Data Science/final/en_US")
Read the data from each file: blogs, news, and twitter.
# Read one corpus file into a character vector, one element per line.
#
# The file is opened through an explicit binary-mode connection so that the
# platform's text-mode layer cannot cut the read short at an embedded control
# character (a known problem on Windows with Ctrl-Z / 0x1A bytes). The
# connection is always closed, even if readLines() fails.
#
# path: file name, resolved against the current working directory.
# Returns the lines marked as UTF-8, with embedded NUL bytes skipped.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_lines("en_US.blogs.txt")
news <- read_corpus_lines("en_US.news.txt")
twitter <- read_corpus_lines("en_US.twitter.txt")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: NLP
##
## This data.table install has not detected OpenMP support. It will work but slower in single threaded mode.
## quanteda version 0.9.8.5
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## sample
Basic information about each data file: blogs, news, and twitter.
# Per-corpus summary table: total lines, characters and words, plus the
# shortest and longest line measured in words.

# Kept unnamed on purpose: names would propagate into the row names of `stats`
# and make kable() print an extra row-name column.
corpora <- list(blogs, news, twitter)

# Words-per-line extremes for each corpus. range() returns the exact minimum
# and maximum; picking 'Min.'/'Max.' out of summary() yields values rounded to
# 4 significant digits on older R releases. na.rm = TRUE mirrors summary(),
# which ignores missing values.
Dataset_info <- vapply(
  corpora,
  function(x) range(stri_count_words(x), na.rm = TRUE),
  numeric(2)
)
rownames(Dataset_info) <- c("Min", "Max")

# Line and character totals (rows "Lines" and "Chars"), one column per corpus.
line_char_totals <- vapply(
  corpora,
  function(x) stri_stats_general(x)[c("Lines", "Chars")],
  numeric(2)
)

# Word totals as reported by stri_stats_latex(), one value per corpus.
word_totals <- vapply(
  corpora,
  function(x) stri_stats_latex(x)[["Words"]],
  numeric(1)
)

# Stack the measures as rows, then transpose so each corpus becomes one row.
stats <- data.frame(
  Dataset = c("blogs", "news", "twitter"),
  t(rbind(line_char_totals, Words = word_totals, Dataset_info))
)
knitr::kable(head(stats))
| Dataset | Lines | Chars | Words | Min | Max |
|---|---|---|---|---|---|
| blogs | 899288 | 206824382 | 37570839 | 0 | 6726 |
| news | 1010242 | 203223154 | 34494539 | 1 | 1796 |
| twitter | 2360148 | 162096241 | 30451170 | 1 | 47 |
Prepare the sample data
# Fix the random seed so the 5% samples below are reproducible.
set.seed(1024)

# Draw a random fraction of `lines` and strip every non-ASCII byte from the
# drawn lines.
#
# Sampling happens BEFORE the conversion: iconv() works element by element and
# preserves length, so the result equals sampling the converted corpus, but
# only the sampled lines are converted instead of the whole corpus.
#
# lines:    character vector, one element per line.
# fraction: share of lines to keep; the count is rounded down explicitly
#           (sample() would otherwise truncate the fractional size silently).
# NOTE(review): the source encoding is declared as "latin1" although the files
# are read as UTF-8. With sub = "" this appears to have the same effect (every
# non-ASCII byte is dropped), so it is left unchanged; confirm before relying
# on it.
sample_ascii_lines <- function(lines, fraction = 0.05) {
  picked <- sample(lines, floor(length(lines) * fraction))
  iconv(picked, "latin1", "ASCII", sub = "")
}

# Order matters: each call consumes random numbers, so blogs, news and twitter
# must be drawn in this sequence to reproduce the same samples.
sampledataset_blogs <- sample_ascii_lines(blogs)
sampledataset_news <- sample_ascii_lines(news)
sampledataset_twitter <- sample_ascii_lines(twitter)
# Document frequencies of single words in the blog sample. Only the first 15
# features are kept, in the order the dfm stores them (not sorted by frequency).
unigram_blogs <- local({
  blogs_dfm <- dfm(
    sampledataset_blogs,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(blogs_dfm)[seq_len(15)]
})
unigram_blogs
## the glass sees truth and is creation it
## 27361 168 67 275 23453 12745 79 11980
## what we want to see yourself not
## 4554 5292 1562 22675 2306 321 6575
# Document frequencies of single words in the news sample. Only the first 15
# features are kept, in the order the dfm stores them (not sorted by frequency).
unigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(news_dfm)[seq_len(15)]
})
unigram_news
## plate the poached pears and truffles
## 106 37282 8 8 26558 7
## swept money out an employment department
## 30 834 3291 5503 66 597
## fund a risk
## 205 25474 183
# Document frequencies of single words in the twitter sample. Only the first 15
# features are kept, in the order the dfm stores them (not sorted by frequency).
unigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(twitter_dfm)[seq_len(15)]
})
unigram_twitter
## hey if you are feeling crazy tk
## 1269 5204 22079 7410 474 528 6
## has all the equipment and ability would
## 2278 5925 36523 23 19524 49 2518
## need
## 2411
# Document frequencies of two-word sequences (joined by a space) in the blog
# sample. Only the first 15 features are kept, in the order the dfm stores
# them (not sorted by frequency).
bigram_blogs <- local({
  blogs_dfm <- dfm(
    sampledataset_blogs,
    ngrams = 2,
    concatenator = " ",
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(blogs_dfm)[seq_len(15)]
})
bigram_blogs
## the glass glass sees sees truth truth and and truth truth is
## 29 1 1 35 9 53
## is creation creation it it sees sees what what we we want
## 1 3 1 2 199 70
## want to to see see see
## 1072 828 1
# Document frequencies of two-word sequences (joined by a space) in the news
# sample. Only the first 15 features are kept, in the order the dfm stores
# them (not sorted by frequency).
bigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    ngrams = 2,
    concatenator = " ",
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(news_dfm)[seq_len(15)]
})
bigram_news
## plate the the poached poached pears
## 3 2 1
## pears and and truffles swept money
## 3 1 1
## money out out an an employment
## 6 19 3
## employment department department fund fund a
## 1 1 6
## a risk risk management management fund
## 21 3 1
# Document frequencies of two-word sequences (joined by a space) in the twitter
# sample. Only the first 15 features are kept, in the order the dfm stores
# them (not sorted by frequency).
bigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    ngrams = 2,
    concatenator = " ",
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(twitter_dfm)[seq_len(15)]
})
bigram_twitter
## hey if if you you are are feeling feeling crazy
## 10 1578 1128 8 1
## crazy tk tk has has all all the the equipment
## 1 1 6 947 3
## equipment and and ability ability you you would would need
## 2 4 1 147 5
# Document frequencies of three-word sequences (joined by a space) in the blog
# sample. Only the first 15 features are kept, in the order the dfm stores
# them (not sorted by frequency).
trigram_blogs <- local({
  blogs_dfm <- dfm(
    sampledataset_blogs,
    ngrams = 3,
    concatenator = " ",
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(blogs_dfm)[seq_len(15)]
})
trigram_blogs
## the glass sees glass sees truth sees truth and truth and truth
## 1 1 1 1
## and truth is truth is creation is creation it creation it sees
## 3 1 1 1
## it sees what sees what we what we want we want to
## 1 1 7 45
## want to see to see see see see yourself
## 44 1 1
# Document frequencies of three-word sequences (joined by a space) in the news
# sample. Only the first 15 features are kept, in the order the dfm stores
# them (not sorted by frequency).
trigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    ngrams = 3,
    concatenator = " ",
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(news_dfm)[seq_len(15)]
})
trigram_news
## plate the poached the poached pears
## 1 1
## poached pears and pears and truffles
## 1 1
## swept money out money out an
## 1 1
## out an employment an employment department
## 2 1
## employment department fund department fund a
## 1 1
## fund a risk a risk management
## 1 1
## risk management fund management fund and
## 1 1
## fund and a
## 1
# Document frequencies of three-word sequences (joined by a space) in the
# twitter sample. Only the first 15 features are kept, in the order the dfm
# stores them (not sorted by frequency).
trigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    ngrams = 3,
    concatenator = " ",
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    verbose = FALSE
  )
  docfreq(twitter_dfm)[seq_len(15)]
})
trigram_twitter
## hey if you if you are you are feeling
## 2 122 6
## are feeling crazy feeling crazy tk crazy tk has
## 1 1 1
## tk has all has all the all the equipment
## 1 4 1
## the equipment and equipment and ability and ability you
## 1 1 1
## ability you would you would need would need for
## 1 1 1