Date: 12 June 2016

Loading the Data

con <- file("en_US.twitter.txt", "r") 
dataTwitter <- readLines(con)
close(con)

con <- file("en_US.blogs.txt", "r") 
dataBlogs <- readLines(con)
close(con)

con <- file("en_US.news.txt", "r") 
dataNews <- read.table(con,sep="\n",quote = "", header=F,stringsAsFactors = F)
close(con)

library(tm)
library(RWeka)
library(ggplot2) 

A basic summary of the three datasets:

library(ngram)
# line counts and total word counts for each dataset
dataS <- cbind(c(length(dataTwitter), length(dataBlogs), length(dataNews[, 1])),
               c(wordcount(dataTwitter), wordcount(dataBlogs), wordcount(dataNews[, 1])))
dataSummary <- data.frame(dataset = c("dataTwitter", "dataBlogs", "dataNews"), dataS)
names(dataSummary)[2:3] <- c("Length", "Wordcount")
dataSummary
##       dataset  Length Wordcount
## 1 dataTwitter 2360148  30373543
## 2   dataBlogs  899288  37334131
## 3    dataNews   77258   2643184

Analysis for the Twitter data

Taking a sample and cleaning the data

set.seed(123)
# work with a random sample of 1,000 tweets to keep the analysis fast
test <- sample(dataTwitter, 1000)

corpora <- Corpus(VectorSource(test))

# remove English stop words; note that this runs before the to-lower step,
# so capitalised stop words such as "The" survive and later show up as
# "the" in the frequency counts
corpora <- tm_map(corpora, removeWords, stopwords("english"))
# apply the remaining transformations in a single pass with tm_reduce
funs <- list(stripWhitespace,
             removePunctuation,
             removeNumbers,
             content_transformer(tolower))
corpora <- tm_map(corpora, FUN = tm_reduce, tmFuns = funs)
corpora[[1]][1]
## $content
## [1] "thanks rting"
# replace the remaining special characters with spaces; a character class
# is used so "$" and "^" are matched literally rather than being treated
# as regex anchors
for (j in seq(corpora)) {
    corpora[[j]] <- gsub("[$&^<>/@#]", " ", corpora[[j]])
}
# the gsub calls coerce the documents to character vectors, so rebuild
# the corpus as PlainTextDocuments
corpora <- tm_map(corpora, PlainTextDocument)

Tokenizing single words

# build a term-document matrix of single words (1-grams)
ngramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm <- TermDocumentMatrix(corpora, control = list(tokenize = ngramTokenizer))
tdmM <- as.matrix(tdm)

# the 20 most frequent terms in the sample
frequencies <- sort(rowSums(tdmM), decreasing = T)[1:20]
df <- data.frame(word = names(frequencies), freq = frequencies)

A look at the words

##   just   like   love   good    now   will   know    one    can    the 
##     59     59     49     48     44     43     41     41     40     39 
##    get thanks  today    day    lol   time    got    new   life   want 
##     37     36     33     32     31     31     28     28     23     23

And a bit more information with some ‘tm’ functions:

findFreqTerms(tdm,lowfreq=20)
##  [1] "back"   "beep"   "can"    "day"    "follow" "get"    "good"  
##  [8] "got"    "great"  "just"   "know"   "life"   "like"   "lol"   
## [15] "love"   "make"   "new"    "now"    "one"    "people" "really"
## [22] "see"    "thanks" "the"    "think"  "time"   "today"  "want"  
## [29] "will"
findAssocs(tdm,"just",corlimit=.2)
## $just
##    walk hundred  uswirl   miles 
##    0.28    0.26    0.26    0.23

Plotting the most frequent words

library(ggplot2)
g <- ggplot(df, aes(word,freq))    
g <- g + geom_bar(stat="identity",fill="white",colour = "green") + theme_bw()
g <- g + theme(axis.text.x=element_text(angle=15, hjust=1))   
g 

library(wordcloud)
set.seed(123)
# brewer.pal requires at least 3 colours, so request 3
wordcloud(names(frequencies), frequencies, min.freq = 2, colors = brewer.pal(3, "Pastel2"))

Plotting the 2-grams

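The code that rebuilds the term-document matrix for the 2-grams is not shown in the report; a minimal sketch, assuming the same cleaned corpora object and reusing the tdm name so that the calls below operate on bigrams (bigramTokenizer is only an illustrative name):

# bigram tokenizer: min = max = 2
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(corpora, control = list(tokenize = bigramTokenizer))
tdmM <- as.matrix(tdm)
frequencies <- sort(rowSums(tdmM), decreasing = T)[1:20]
df <- data.frame(word = names(frequencies), freq = frequencies)

The frequency bar plot and word cloud would then be produced exactly as for the single words.
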
findFreqTerms(tdm,lowfreq=7)
##  [1] "beep beep" "feel like" "i can"     "i feel"    "i just"   
##  [6] "i know"    "i love"    "i think"   "i want"    "right now"
findAssocs(tdm,"i love",corlimit=.2)
## $`i love`
##            love i     love somebody      ★party party        almost ten 
##              0.46              0.46              0.23              0.23 
##        bad disney        bday jordy     behind always         coming sr 
##              0.23              0.23              0.23              0.23 
## commercial mchips            day hb       desert view         disney no 
##              0.23              0.23              0.23              0.23 
##      drive almost          east rim    everyday week★      favor tweets 
##              0.23              0.23              0.23              0.23 
##       gichy gichy         gichy goo           girl im           goo not 
##              0.23              0.23              0.23              0.23 
##     grandcanyon i        happy bday             hb rt         hope know 
##              0.23              0.23              0.23              0.23 
##            i east         i morning             i rly         id rather 
##              0.23              0.23              0.23              0.23 
##         im behind        ive smiled   jamming pandora        jordy your 
##              0.23              0.23              0.23              0.23 
##         keep life         know love          let keep        lived hear 
##              0.23              0.23              0.23              0.23 
##           loser i           love â<U+0099>   love commercial       love disney 
##              0.23              0.23              0.23              0.23 
##        love favor           love id      love jamming         love life 
##              0.23              0.23              0.23              0.23 
##        love loser           love ma        love momma            love s 
##              0.23              0.23              0.23              0.23 
##         love view     love watching          love yew          ma youre 
##              0.23              0.23              0.23              0.23 
##          make day          make let            mean i           model i 
##              0.23              0.23              0.23              0.23 
##      morning long            no bad          not mean          now girl 
##              0.23              0.23              0.23              0.23 
##       pandora ðÿ<U+009E>    party everyday       party party          people i 
##              0.23              0.23              0.23              0.23 
##       playrt make        rather see         rim drive         rly think 
##              0.23              0.23              0.23              0.23 
##  rt hoosiernation          s nephew         see lived       smiled much 
##              0.23              0.23              0.23              0.23 
##             sr yr         ten years         think ive       view desert 
##              0.23              0.23              0.23              0.23 
##  view grandcanyon   watching playrt           week★ i         with good 
##              0.23              0.23              0.23              0.23 
##         your role      youre coming            yr now 
##              0.23              0.23              0.23

Plotting the 3-grams

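No trigram output survives in this section; a minimal sketch of how the 3-gram frequencies could be tabulated and plotted, following the same pattern as above and assuming the cleaned Twitter corpora object (tdm3, freq3 and df3 are only illustrative names):

# trigram tokenizer: min = max = 3
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3 <- TermDocumentMatrix(corpora, control = list(tokenize = trigramTokenizer))
freq3 <- sort(rowSums(as.matrix(tdm3)), decreasing = T)[1:20]
df3 <- data.frame(word = names(freq3), freq = freq3)
ggplot(df3, aes(word, freq)) +
    geom_bar(stat = "identity", fill = "white", colour = "green") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
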
Analysis for the Blogs data

Taking a sample and cleaning the data

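The sampling and cleaning code for the blogs (and news) data is not repeated in the report; presumably it mirrors the Twitter pipeline above. A minimal sketch of a helper that bundles those steps so each dataset can be processed the same way (cleanSample and corporaBlogs are only illustrative names, and the sample size and seed are assumptions; the tokenization, frequency tables and plots then proceed exactly as for the Twitter sample):

cleanSample <- function(dataset, n = 1000) {
    set.seed(123)
    corp <- Corpus(VectorSource(sample(dataset, n)))
    corp <- tm_map(corp, removeWords, stopwords("english"))
    corp <- tm_map(corp, FUN = tm_reduce,
                   tmFuns = list(stripWhitespace, removePunctuation,
                                 removeNumbers, content_transformer(tolower)))
    for (j in seq(corp)) {
        corp[[j]] <- gsub("[$&^<>/@#]", " ", corp[[j]])
    }
    tm_map(corp, PlainTextDocument)
}
corporaBlogs <- cleanSample(dataBlogs)
# the first cleaned document can be inspected with corporaBlogs[[1]][1]
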
## $content
## [1] "to sum unjustly wounded men let us overlook wickedness worsen pain sharpen minds revenge remember mount god learn believe certain whatever enemy wickedly committed us permitted sent godâ<U+0080><U+0099>s just dispensation calvin â<U+0080><U+0093> institutes "

Tokenizing single words

A look at the words

##    the   will    one   just    can   time   like people   know    now 
##    212    159    135    125    108    100     96     83     72     72 
##    see    get really    and  think    way   also    day   want   back 
##     69     65     62     61     61     61     60     60     59     56

Plotting the most frequent words

Plotting the 2-grams

Plotting the 3-grams

Analysis for the News data

Taking a sample and cleaning the data

## $content
## [1] "riveras works beatnik studios show bold color expressionistic handling full references precolumbian cultures work depicts male figure subtitled nahuatl aztec word yaotl nahuatl warrior depicts fearsome visage"

Tokenizing single words

A look at the words

##    the   said   will    one    new   also    can    but    two  years 
##    524    502    228    155    142    141    133    125    115    115 
##  first   just   time   year   last   like      —   city people  state 
##    113    112    106    100     97     92     87     87     84     83

Plotting the most frequent words

Plotting the 2-grams

Plotting the 3-grams