An analytics and modelling report on the given dataset of blogs, news and Twitter text from English-writing users.
library(stringr)
library(tm)
library(ngram)
library(knitr)
library(dplyr)
library(ggplot2)
library(data.table)
# Summary statistics for the blogs corpus: file size, word count, longest line
# and number of lines.
# NOTE: size is bytes / 1024 / 1000 (neither exactly MB nor MiB); the divisor is
# kept unchanged so the figure stays comparable with the other sources.
blogssize <- round(file.info("en_US.blogs.txt")$size / 1024 / 1000, 2)
# Binary mode avoids platform-specific text-mode handling of control characters;
# readLines() accepts LF, CRLF and CR line endings in either mode.
blogscon <- file("en_US.blogs.txt", open = "rb")
# skipNul = TRUE drops embedded NUL bytes instead of cutting the line short.
blogslines <- readLines(blogscon, skipNul = TRUE)
# Release the file handle as soon as the text is in memory.
close(blogscon)
# nchar() is vectorised, so no per-line lapply() is needed.
bchars <- nchar(blogslines)
# Despite its name, blogschars holds the number of lines, not characters.
blogschars <- length(bchars)
blogsmaxchars <- max(bchars)
# Words are whitespace-delimited tokens; lengths() counts the tokens per line.
blogswords <- sum(lengths(strsplit(blogslines, "\\s+")))
# Order: size, words, longest line, lines.
blogs <- round(c(blogssize, blogswords, blogsmaxchars, blogschars))
blogs
[1] 205 37334441 40835 899288
# Summary statistics for the twitter corpus: file size, word count, longest line
# and number of lines.
# NOTE: size is bytes / 1024 / 1000 (neither exactly MB nor MiB); the divisor is
# kept unchanged so the figure stays comparable with the other sources.
twittersize <- round(file.info("en_US.twitter.txt")$size / 1024 / 1000, 2)
# Binary mode avoids platform-specific text-mode handling of control characters;
# readLines() accepts LF, CRLF and CR line endings in either mode.
twittercon <- file("en_US.twitter.txt", open = "rb")
# skipNul = TRUE drops embedded NUL bytes; without it readLines() warns and
# ends the current line at the NUL, losing the rest of that line.
twitterlines <- readLines(twittercon, skipNul = TRUE)
# Release the file handle as soon as the text is in memory.
close(twittercon)
# nchar() is vectorised, so no per-line lapply() is needed.
tchars <- nchar(twitterlines)
# Despite its name, twitterchars holds the number of lines, not characters.
twitterchars <- length(tchars)
twittermaxchars <- max(tchars)
# Words are whitespace-delimited tokens; lengths() counts the tokens per line.
twitterwords <- sum(lengths(strsplit(twitterlines, "\\s+")))
# Order: size, words, longest line, lines.
twitter <- round(c(twittersize, twitterwords, twittermaxchars, twitterchars))
twitter
[1] 163 30373792 213 2360148
# Summary statistics for the news corpus: file size, word count, longest line
# and number of lines.
# NOTE: size is bytes / 1024 / 1000 (neither exactly MB nor MiB); the divisor is
# kept unchanged so the figure stays comparable with the other sources.
newssize <- round(file.info("en_US.news.txt")$size / 1024 / 1000, 2)
# Open in binary mode: on Windows a text-mode connection stops reading at a
# SUB (0x1A) control character, which silently truncates the file. Line, word
# and longest-line figures obtained from a text-mode read on Windows can
# therefore understate this corpus.
newscon <- file("en_US.news.txt", open = "rb")
# skipNul = TRUE drops embedded NUL bytes instead of cutting the line short.
newslines <- readLines(newscon, skipNul = TRUE)
# Release the file handle as soon as the text is in memory.
close(newscon)
# nchar() is vectorised, so no per-line lapply() is needed.
nchars <- nchar(newslines)
# Despite its name, newschars holds the number of lines, not characters.
newschars <- length(nchars)
newsmaxchars <- max(nchars)
# Words are whitespace-delimited tokens; lengths() counts the tokens per line.
newswords <- sum(lengths(strsplit(newslines, "\\s+")))
# Order: size, words, longest line, lines.
news <- round(c(newssize, newswords, newsmaxchars, newschars))
news
[1] 201 2643972 5760 77259
# Side-by-side summary of the three corpora: one column per source, one row per
# metric. Rows are labelled in the order the per-source vectors were built
# (size, words, longest line, lines) so the table can be read without
# referring back to the code. All columns are numeric, so no
# stringsAsFactors setting is needed.
datasumm <- data.frame(
  blogs, news, twitter,
  row.names = c("size", "words", "maxchars", "lines")
)
datasumm
## blogs news twitter
## 1 205 201 163
## 2 37334441 2643972 30373792
## 3 40835 5760 213
## 4 899288 77259 2360148
# Re-arrange the per-source metrics column-wise for plotting.
# Every vector follows the same source order: blogs, news, twitter.
source_labels <- c("blogs", "news", "twitter")
sizes <- round(c(blogssize, newssize, twittersize))
# `chars` carries the line counts (the *chars variables store length() results).
chars <- round(c(blogschars, newschars, twitterchars))
words <- round(c(blogswords, newswords, twitterwords))
maxchars <- round(c(blogsmaxchars, newsmaxchars, twittermaxchars))
# One row per source, one column per metric, with the sources as row names.
plotdata <- round(
  data.frame(sizes, chars, words, maxchars, row.names = source_labels)
)
plotdata
## sizes chars words maxchars
## blogs 205 899288 37334441 40835
## news 201 77259 2643972 5760
## twitter 163 2360148 30373792 213
# Draw one bar chart per metric, comparing the three sources.
# Names are plotdata columns; values are the chart titles.
plot_titles <- c(
  sizes = "FileSizes",
  chars = "Lines",
  words = "Words",
  maxchars = "Longest Lines"
)
for (metric in names(plot_titles)) {
  barplot(
    height = plotdata[[metric]],
    names.arg = rownames(plotdata),
    main = plot_titles[[metric]]
  )
}
Looking at both the data summary table and the bar plots, it is evident that, for similar file sizes: (1) Twitter has the largest number of lines, whereas news has the fewest; (2) bloggers have used the most words, users of the microblogging site (the Twitterati) somewhat fewer, and the news material the fewest; (3) blogs undoubtedly win the race for the longest line, while people on Twitter preferred writing short lines.
# Normalise the blog text: lower-case it, then break each line into fragments
# at sentence/clause punctuation and brackets.
blogslines <- unlist(strsplit(tolower(blogslines), "[.,:;!?(){}<>]+"))
# Patterns are applied in this exact order, each match replaced by one space:
#   1. non-alphanumeric runs at the start or end of a fragment
#   2. non-alphanumeric runs directly before whitespace
#   3. non-alphanumeric runs directly after whitespace
#   4. runs of whitespace (collapsed to a single space)
cleanup_patterns <- c(
  "^[^a-z0-9]+|[^a-z0-9]+$",
  "[^a-z0-9]+\\s",
  "\\s[^a-z0-9]+",
  "\\s+"
)
for (pattern in cleanup_patterns) {
  blogslines <- gsub(pattern, " ", blogslines)
}
# Drop the leading/trailing spaces introduced by the substitutions above.
blogslines <- str_trim(blogslines)
head(blogslines)
## [1] "in the years thereafter"
## [2] "most of the oil fields and platforms were named after pagan gods"
## [3] "we love you mr"
## [4] "brown"
## [5] "chad has been awesome with the kids and holding down the fort while i work later than usual"
## [6] "the kids have been busy together playing skylander on the xbox together"
# Split the cleaned fragments on whitespace and count each distinct token.
# NOTE: `words` is reused here and replaces the per-source word totals above.
words <- unlist(strsplit(blogslines, "\\s+"))
word.freq <- table(words)
# One row per distinct word, with the word itself as the row name.
dtfm <- cbind.data.frame(
  Word = names(word.freq),
  Frequency = as.integer(word.freq)
)
rownames(dtfm) <- dtfm[["Word"]]
# Sort so the most frequent words come first.
by_frequency <- order(-dtfm[["Frequency"]])
dtfm <- dtfm[by_frequency, ]
head(dtfm)
Word Frequency
the the 1857387
and and 1088633
to to 1065971
a a 899498
of of 875194
i i 774628
library(ggplot2)
# Ten most frequent words. head() returns only the rows that exist, whereas
# indexing with 1:10 would pad the result with all-NA rows when dtfm has fewer
# than ten distinct words.
pdata <- head(dtfm, 10)
pdata
## Word Frequency
## the the 1857387
## and and 1088633
## to to 1065971
## a a 899498
## of of 875194
## i i 774628
## in in 593964
## that that 460013
## is is 432105
## it it 402682
# Horizontal bar chart of the 40 most frequent words; reorder() sorts the
# words on the y axis by their frequency.
top_words <- dtfm[1:40, ]
ggplot(top_words, aes(x = Frequency, y = reorder(Word, Frequency))) +
  geom_col() +
  labs(x = "Frequency", y = "Words")
I tried to run this code around 5–6 times, but was not able to evaluate it due to system constraints.
# Build the bigram frequency table for the cleaned blog text.
# Keep only fragments that contain whitespace, i.e. at least two tokens, so
# every remaining fragment can yield a 2-gram.
has_whitespace <- str_detect(blogslines, "\\s+")
blogslines <- blogslines[has_whitespace]
gram2 <- ngram(blogslines, n = 2)
# NOTE: `df` masks stats::df for the rest of the session.
df <- get.phrasetable(gram2)
# Written with saveRDS(), so it must be read back with readRDS() (not load()),
# despite the .RData extension.
saveRDS(df, file = "bloggram2.RData")