WordPred.utf8

Data Science Capstone - Week 2

Introduction

An analytics and modelling report on the provided dataset of blogs, news and Twitter text written by English-language users.

IMPORTING LIBRARIES

library(stringr)
library(tm)
library(ngram)
library(knitr)
library(dplyr)
library(ggplot2)
library(data.table)

LOADING and PROCESSING BLOGS DATA

# Summary statistics for the raw blogs corpus: file size, word count,
# longest line (in characters) and number of lines.
blogssize <- round(file.info("en_US.blogs.txt")$size / 1024 / 1000, 2)
blogscon <- file("en_US.blogs.txt", open = "r")
blogslines <- readLines(blogscon)
# Release the file handle; the connection was previously left open.
close(blogscon)
# nchar() is vectorised, so no per-line lapply() is needed.
bchars <- nchar(blogslines)
# NOTE: despite its name, blogschars holds the number of lines read.
blogschars <- length(bchars)
blogsmaxchars <- max(bchars)
# lengths() yields the token count per line and, unlike sapply(), always
# returns an integer vector (even when no lines were read).
blogswords <- sum(lengths(strsplit(blogslines, "\\s+")))

# Order: size, words, longest line, lines.
blogs <- round(c(blogssize, blogswords, blogsmaxchars, blogschars))
blogs

[1]      205 37334441    40835   899288

LOADING and PROCESSING TWITTER DATA

# Summary statistics for the raw twitter corpus: file size, word count,
# longest line (in characters) and number of lines.
twittersize <- round(file.info("en_US.twitter.txt")$size / 1024 / 1000, 2)
twittercon <- file("en_US.twitter.txt", open = "r")
twitterlines <- readLines(twittercon)
# Release the file handle; the connection was previously left open.
close(twittercon)
# nchar() is vectorised, so no per-line lapply() is needed.
tchars <- nchar(twitterlines)
# NOTE: despite its name, twitterchars holds the number of lines read.
twitterchars <- length(tchars)
twittermaxchars <- max(tchars)
# lengths() yields the token count per line and, unlike sapply(), always
# returns an integer vector (even when no lines were read).
twitterwords <- sum(lengths(strsplit(twitterlines, "\\s+")))

# Order: size, words, longest line, lines.
twitter <- round(c(twittersize, twitterwords, twittermaxchars, twitterchars))
twitter

[1]      163 30373792      213  2360148

LOADING and PROCESSING NEWS DATA

# Summary statistics for the raw news corpus: file size, word count,
# longest line (in characters) and number of lines.
newssize <- round(file.info("en_US.news.txt")$size / 1024 / 1000, 2)
# Open in binary mode. A text-mode read of this ~201 MB file returned only
# 77,259 lines / 2.6M words (the blogs file of similar size has 37M words),
# i.e. the read was cut short. NOTE(review): presumably an embedded control
# byte ends text-mode reads early on some platforms -- confirm. Figures
# printed from the earlier text-mode run must be regenerated.
newscon <- file("en_US.news.txt", open = "rb")
newslines <- readLines(newscon)
# Release the file handle; the connection was previously left open.
close(newscon)
# nchar() is vectorised, so no per-line lapply() is needed.
nchars <- nchar(newslines)
# NOTE: despite its name, newschars holds the number of lines read.
newschars <- length(nchars)
newsmaxchars <- max(nchars)
# lengths() yields the token count per line and, unlike sapply(), always
# returns an integer vector (even when no lines were read).
newswords <- sum(lengths(strsplit(newslines, "\\s+")))

# Order: size, words, longest line, lines.
news <- round(c(newssize, newswords, newsmaxchars, newschars))
news

[1]     201 2643972    5760   77259

DATA SUMMARY

# One column per corpus; rows are, in order: file size, word count,
# longest line and line count (the order used when each vector was built).
datasumm <- data.frame(
  blogs = blogs,
  news = news,
  twitter = twitter,
  stringsAsFactors = TRUE
)
print(datasumm)

##      blogs    news  twitter
## 1      205     201      163
## 2 37334441 2643972 30373792
## 3    40835    5760      213
## 4   899288   77259  2360148

DATA FOR PLOTTING

# Per-metric vectors, each ordered blogs / news / twitter.
sizes <- round(c(blogssize, newssize, twittersize))
# NOTE: the *chars variables hold line counts (length of the per-line vector).
chars <- round(c(blogschars, newschars, twitterchars))
words <- round(c(blogswords, newswords, twitterwords))
maxchars <- round(c(blogsmaxchars, newsmaxchars, twittermaxchars))

# One row per corpus, one column per metric; used by the bar plots.
plotdata <- round(
  data.frame(
    sizes = sizes,
    chars = chars,
    words = words,
    maxchars = maxchars
  )
)
row.names(plotdata) <- c("blogs", "news", "twitter")
print(plotdata)

##         sizes   chars    words maxchars
## blogs     205  899288 37334441    40835
## news      201   77259  2643972     5760
## twitter   163 2360148 30373792      213

Basic Plots of filesizes, lines, words and longest lines

# Draw one bar chart per metric column of plotdata, in the order
# sizes, chars (line counts), words, maxchars; bars are labelled by corpus.
invisible(Map(
  function(column, title) {
    barplot(
      height = plotdata[[column]],
      names.arg = rownames(plotdata),
      main = title
    )
  },
  c("sizes", "chars", "words", "maxchars"),
  c("FileSizes", "Lines", "Words", "Longest Lines")
))

ANALYSIS

Looking at both the data summary table and the bar plots, it is evident that, for similar file sizes: (1) Twitter has the largest number of lines, whereas news has the fewest; (2) bloggers used the most words, users of the microblogging site (the Twitterati) used fewer, and the news material used the fewest; (3) blogs clearly contain the longest line, while Twitter users preferred writing short lines.

CLEANING DATA

# Normalise the blog text: lower-case it, break lines into fragments at
# punctuation, then replace runs of non-alphanumeric characters at fragment
# edges / next to whitespace with a single space.
blogslines <- Reduce(
  function(fragments, pattern) gsub(pattern, " ", fragments),
  c(
    "^[^a-z0-9]+|[^a-z0-9]+$", # junk at the start or end of a fragment
    "[^a-z0-9]+\\s",           # junk immediately before whitespace
    "\\s[^a-z0-9]+",           # junk immediately after whitespace
    "\\s+"                     # collapse remaining whitespace runs
  ),
  unlist(strsplit(tolower(blogslines), "[.,:;!?(){}<>]+"))
)
# Drop the leading/trailing space the substitutions above may leave behind.
blogslines <- str_trim(blogslines)
print(head(blogslines))

## [1] "in the years thereafter"                                                                    
## [2] "most of the oil fields and platforms were named after pagan gods"                           
## [3] "we love you mr"                                                                             
## [4] "brown"                                                                                      
## [5] "chad has been awesome with the kids and holding down the fort while i work later than usual"
## [6] "the kids have been busy together playing skylander on the xbox together"

Splitting lines into words and counting their frequencies

# Tokenise the cleaned fragments on whitespace and tabulate each word.
words <- unlist(strsplit(blogslines, "\\s+"))
word.freq <- table(words)

# Word/Frequency table, keyed by word and sorted from most to least frequent.
dtfm <- data.frame(
  Word = names(word.freq),
  Frequency = as.integer(word.freq)
)
row.names(dtfm) <- dtfm[["Word"]]
dtfm <- dtfm[order(-dtfm[["Frequency"]]), ]
print(head(dtfm))

    Word Frequency
the  the   1857387
and  and   1088633
to    to   1065971
a      a    899498
of    of    875194
i      i    774628

Frequency Plot of top 40 words

library(ggplot2)
# Ten most frequent words (dtfm is already sorted by descending frequency).
# head() returns at most 10 rows; indexing with 1:10 would pad the result
# with NA rows if the vocabulary had fewer than 10 words.
pdata <- head(dtfm, 10)
pdata

##      Word Frequency
## the   the   1857387
## and   and   1088633
## to     to   1065971
## a       a    899498
## of     of    875194
## i       i    774628
## in     in    593964
## that that    460013
## is     is    432105
## it     it    402682

# Horizontal bar chart of the 40 most frequent words, most frequent at the
# top. head() avoids the NA rows that dtfm[1:40, ] would produce when the
# vocabulary has fewer than 40 words.
ggplot(head(dtfm, 40), aes(y = reorder(Word, Frequency), Frequency)) +
  geom_col() +
  xlab("Frequency") +
  ylab("Words")

2-gram

I tried running the following code five or six times but was unable to evaluate it because of system constraints.

# Keep only fragments containing at least one whitespace run, i.e. at least
# two words, so every remaining fragment can yield a 2-gram.
blogslines <- blogslines[str_count(blogslines, "\\s+") >= 1]

# Build the bigram model and extract its phrase table.
gram2 <- ngram(blogslines, n = 2)
df <- get.phrasetable(gram2)

# NOTE(review): written with saveRDS() despite the ".RData" extension, so it
# presumably has to be read back with readRDS() rather than load() -- confirm.
saveRDS(df, file = "bloggram2.RData")