setwd("/Users/subasishdas1/Copy/Rpubs/rpubs/kdix")
kdx <- read.csv("kdx.csv")
names(kdx)
## [1] "Year" "Paper" "Abstract"
options(warn=-1) # suppress warnings in the knitted output
head(kdx[-c(3)])
## Year
## 1 2001-2009
## 2 2010-2014
## 3 2001-2009
## 4 1996-2000
## 5 1996-2000
## 6 2010-2014
## Paper
## 1 Effect of Wireless Communication and Entertainment Devices on Simulated Driving Performance
## 2 \nAssessing How Drivers of Through Vehicles React to Driveway Activity
## 3 Urban Roadside Safety Cluster-Crash Evaluation
## 4 Capacity for North Carolina Freeway Work Zones
## 5 Estimating Free-Flow Speeds for Rural Multilane Highways
## 6 How Far from Optimal Are Current Advisory Speeds? Analysis Based on Safety Performance
table(kdx$Year)
##
## 1996-2000 2001-2009 2010-2014
## 6 7 7
# tapply() returns groups in sorted level order, so sort the Year labels
# to keep them aligned with the collapsed titles
kdx1 <- data.frame(Year = sort(unique(kdx$Year)),
                   Title = tapply(kdx$Paper, kdx$Year, paste, collapse = ' '))
names(kdx1)
## [1] "Year" "Title"
library(tm)
## Loading required package: NLP
mydata.corpus <- Corpus(VectorSource(kdx1$Title))
# normalise the encoding ('UTF-8-MAC' is the OS X iconv name; plain 'UTF-8'
# works elsewhere), substituting bytes that cannot be converted
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
# custom stopword list; note that stopwords('german') looks like a slip for
# stopwords('english'), which the long manual list below compensates for
my_stopwords <- c(stopwords('german'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "i-i-", "left", "right",
  "west", "when", "levels", "remaining", "based", "issues", "still", "off",
  "over", "only", "north", "past", "twin", "while", "i-w", "general",
  "harvey", "i-e", "i-i-", "i-us", "must", "more", "work", "read",
  "reached", "morrison", "mph", "three", "info", "canal", "camp", "la-",
  "approximately", "amp", "access", "approaching", "forest", "friday",
  "its", "affect", "after", "within", "what", "various", "under", "toward",
  "san", "other", "city", "into", "by", "for", "is", "are", "their", "he",
  "she", "research", "through", "between", "under", "below", "over",
  "with", "an", "affect", "nowadays", "present", "important",
  "significant", "then", "using", "having", "via", "vermont", "some",
  "rap", "how", "can")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1)
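# The same cleaning steps recur for each corpus below; a small helper
# (hypothetical, not part of the original script) could wrap them:
clean_corpus <- function(corp, extra_stopwords) {
  corp <- tm_map(corp, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
  corp <- tm_map(corp, content_transformer(tolower), mc.cores=1)
  corp <- tm_map(corp, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
  corp <- tm_map(corp, removeWords, extra_stopwords, mc.cores=1)
  tm_map(corp, removeNumbers, mc.cores=1)
}
# e.g. mydata.corpus <- clean_corpus(Corpus(VectorSource(kdx1$Title)), my_stopwords)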
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm
## <<TermDocumentMatrix (terms: 119, documents: 3)>>
## Non-/sparse entries: 142/215
## Sparsity : 60%
## Maximal term length: 24
## Weighting : term frequency (tf)
dim(mydata.dtm)
## [1] 119 3
inspect(mydata.dtm[1:3,1:3])
## <<TermDocumentMatrix (terms: 3, documents: 3)>>
## Non-/sparse entries: 3/6
## Sparsity : 67%
## Maximal term length: 8
## Weighting : term frequency (tf)
##
## Docs
## Terms 1 2 3
## -lane 0 0 1
## -turn 1 0 0
## -vehicle 0 2 0
DTM <- DocumentTermMatrix(mydata.corpus)
inspect(DTM[1:3,1:3])
## <<DocumentTermMatrix (documents: 3, terms: 3)>>
## Non-/sparse entries: 3/6
## Sparsity : 67%
## Maximal term length: 8
## Weighting : term frequency (tf)
##
## Terms
## Docs -lane -turn -vehicle
## 1 0 1 0
## 2 0 0 2
## 3 1 0 0
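# A DocumentTermMatrix is simply the transpose of a TermDocumentMatrix;
# tm's t() converts between the two, so the two inspections above agree:
all(as.matrix(DTM) == t(as.matrix(mydata.dtm))) # TRUE: same counts, transposed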
findFreqTerms(mydata.dtm, lowfreq=2)
## [1] "-vehicle" "acceleration" "analysis" "behavior"
## [5] "crashes" "data" "deceleration" "driveway"
## [9] "fatal" "georgia" "global" "highway"
## [13] "highways" "intersections" "models" "normal"
## [17] "passenger" "performance" "positioning" "road"
## [21] "rural" "safety" "speeds" "system"
## [25] "urban" "vehicles" "zones"
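# findFreqTerms() returns only the term names; the counts behind them come
# from summing each term's row over the three period-documents:
freqs <- sort(rowSums(as.matrix(mydata.dtm)), decreasing = TRUE)
head(freqs) # top title terms with their total frequencies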
# as with kdx1, sort the Year labels to match tapply's sorted output
kdx2 <- data.frame(Year = sort(unique(kdx$Year)),
                   Abs = tapply(kdx$Abstract, kdx$Year, paste, collapse = ' '))
names(kdx2)
## [1] "Year" "Abs"
mydata.corpus <- Corpus(VectorSource(kdx2$Abs))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
my_stopwords <- c(stopwords('german'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "end",
  "have", "avenue", "before", "just", "mac", "being", "when", "levels",
  "remaining", "based", "still", "off", "over", "only", "north", "past",
  "twin", "while", "i-w", "general", "harvey", "must", "more", "work",
  "read", "reached", "morrison", "mph", "three", "info", "canal", "camp",
  "la-", "approximately", "amp", "access", "approaching", "forest",
  "friday", "its", "affect", "after", "within", "what", "various", "under",
  "toward", "san", "other", "city", "into", "by", "for", "is", "are",
  "their", "he", "she", "research", "through", "between", "under", "below",
  "over", "with", "an", "affect", "nowadays", "present", "important",
  "significant", "then", "-lane", "can", "authors", "each", "finally",
  "included", "may", "most", "overall", "provides", "rates", "set", "such",
  "type", "uses", "used", "well", "were", "which")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1)
# build a term-document matrix
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm
## <<TermDocumentMatrix (terms: 1022, documents: 3)>>
## Non-/sparse entries: 1335/1731
## Sparsity : 56%
## Maximal term length: 24
## Weighting : term frequency (tf)
dim(mydata.dtm)
## [1] 1022 3
DTM <- DocumentTermMatrix(mydata.corpus)
findFreqTerms(mydata.dtm, lowfreq=20)
## [1] "crashes" "data" "fatal" "safety" "speed" "speeds" "study"
## [8] "urban"
findAssocs(mydata.dtm, 'speed', 0.99)
## speed
## effect 1.00
## evaluate 1.00
## facilities 1.00
## implementation 1.00
## model 1.00
## models 1.00
## often 1.00
## researchers 1.00
## urban 1.00
## locations 0.99
## roadside 0.99
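# findAssocs() scores terms by the Pearson correlation of their per-document
# counts with those of 'speed'; with only three documents, perfect
# correlations are common, which the table above reflects:
tdm_m <- as.matrix(mydata.dtm)
cor(tdm_m["speed", ], tdm_m["urban", ]) # ~1.00, as reported above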
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.005) # drop terms with sparsity above 0.5%, i.e. keep only terms occurring in all three periods
dim(mydata.dtm2)
## [1] 58 3
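# sanity check: with sparse=0.005 a term survives only if it appears in
# every document, i.e. in all three publication periods
all(rowSums(as.matrix(mydata.dtm2) > 0) == 3) # should be TRUE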
library(slam)
TDM.dense <- as.matrix(mydata.dtm2)
head(TDM.dense)
## Docs
## Terms 1 2 3
## analysis 2 3 7
## associated 3 2 3
## behavior 3 3 3
## characteristics 3 5 4
## collection 3 1 1
## compared 2 1 3
# write.csv(TDM.dense, "tdm_1.csv")
# (tdm_1.csv was then edited outside R: the document columns were renamed to
# the TRR period labels and a row-total Count column was added before it was
# read back in below)
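# A sketch of building the same table without leaving R (the TRR_* period
# labels are taken from the document order used throughout):
# tdm_1 <- data.frame(Term = rownames(TDM.dense), TRR96_00 = TDM.dense[, 1],
#                     TRR01_09 = TDM.dense[, 2], TRR10_14 = TDM.dense[, 3],
#                     Count = rowSums(TDM.dense))
# tdm_1 <- tdm_1[order(-tdm_1$Count), ]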
#### PLOT 1
tdm_1 <- read.csv("tdm_1.csv")
head(tdm_1)
## Term TRR96_00 TRR01_09 TRR10_14 Count
## 1 crashes 14 2 13 29
## 2 speed 6 14 9 29
## 3 safety 2 3 20 25
## 4 study 8 10 7 25
## 5 data 8 12 4 24
## 6 urban 3 13 7 23
tdm_BT_1 <- tdm_1[c(1,5)]
head(tdm_BT_1)
## Term Count
## 1 crashes 29
## 2 speed 29
## 3 safety 25
## 4 study 25
## 5 data 24
## 6 urban 23
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
p <- ggplot(tdm_BT_1, aes(x = reorder(Term, -Count), y = Count))
p <- p + geom_bar(stat = "identity")+theme_bw()
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p + labs(title = "Most Frequent Terms")+xlab("Terms")
### PLOT 2
library(reshape)
tdm_BT_2 = melt(tdm_1[1:4]) # reshape::melt has no value.name argument, so the measure column is simply named "value"
## Using Term as id variables
head(tdm_BT_2)
## Term variable value
## 1 crashes TRR96_00 14
## 2 speed TRR96_00 6
## 3 safety TRR96_00 2
## 4 study TRR96_00 8
## 5 data TRR96_00 8
## 6 urban TRR96_00 3
ggplot(tdm_BT_2, aes(x = variable, y = Term, fill = value)) +
geom_tile(colour = "white") +
scale_fill_gradient(high="#FF0000" , low="#FFFFFF")+
ylab("")+ xlab("TRR Papers")+
theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+ theme(axis.text.y = element_text(size = 12))
### PLOT 3
library(wordcloud)
## Loading required package: RColorBrewer
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
## Docs
## Terms 1 2 3
## -kmh 1 0 0
## -phase 0 1 0
## -ramp 1 0 0
## -to-one 1 0 0
## -vehicle 0 2 0
## -way 0 1 0
colnames(term.matrix) <- c("TRR96_00","TRR01_09", "TRR10_14")
comparison.cloud(term.matrix,max.words=100, random.order=FALSE)
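# comparison.cloud() sizes each word by how far its frequency in one period
# deviates from its average across all three periods. Word-cloud layouts are
# randomised, so fixing a seed (any value will do) makes the clouds below
# reproducible between knits:
set.seed(1234)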
m <- as.matrix(mydata.dtm2)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
wordcloud(d$word,d$freq)
pal <- brewer.pal(9,"BuGn")
pal <- pal[-(1:4)]
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=FALSE, rot.per=.15, colors=pal)
pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=TRUE, rot.per=.15, colors=pal)
#random colors
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=TRUE, random.color=TRUE, rot.per=.15, colors=pal)
##### with font #####
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=TRUE,
rot.per=.15, colors=pal, vfont=c("gothic english","plain"))
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, max.words=100,
random.order=TRUE, rot.per=.15, colors=pal, vfont=c("script","plain"))
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, max.words=100,
random.order=TRUE, rot.per=.15, colors=pal, vfont=c("serif","plain"))
mydata.corpus <- Corpus(VectorSource(kdx$Paper))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
my_stopwords <- c(stopwords('german'),"the", "due", "are", "not", "for", "this", "and", "that", "there", "new", "near", "beyond", "time", "from", "been", "both", "than", "has","now", "until", "all", "use", "two", "ave", "blvd", "east", "between", "end", "have", "avenue", "before", "just", "mac", "being", "when","levels","remaining","based", "still", "off", "over", "only", "north", "past", "twin", "while", "i-w" , "general" , "harvey", "must", "more", "work","read", "reached", "morrison", "mph", "three","info", "canal", "camp", "la-", "approximately", "amp", "access", "approaching", "forest", "friday", "its", "affect", "after", "within", "what", "various", "under", "toward", "san", "other" , "city", "into", "by", "for", "is", "are", "their", "he", "she", "research", "through", "between", "under", "below", "over", "with", "an", "affect", "nowadays", "present", "important", "significant", "then")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1)
# build a term-document matrix
mydata.dtm3 <- TermDocumentMatrix(mydata.corpus)
mydata.dtm3
## <<TermDocumentMatrix (terms: 120, documents: 20)>>
## Non-/sparse entries: 165/2235
## Sparsity : 93%
## Maximal term length: 24
## Weighting : term frequency (tf)
dim(mydata.dtm3)
## [1] 120 20
library(topicmodels)
dtm <- as.DocumentTermMatrix(mydata.dtm3) # coerce (transpose) to documents-in-rows for LDA
lda <- LDA(dtm, k = 4) # fit 4 topics
term <- terms(lda, 3) # top 3 terms of every topic
term
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "analysis" "data" "highways" "zones"
## [2,] "safety" "global" "performance" "georgia"
## [3,] "how" "positioning" "rural" "acceleration"
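# topics() and terms() report only the most likely assignments; the full
# per-paper topic distribution is available from posterior():
round(posterior(lda)$topics[1:3, ], 2) # topic probabilities for papers 1-3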
term <- apply(term, MARGIN = 2, paste, collapse = ", ")
# first topic identified for every document (paper)
require(data.table) # for IDate
## Loading required package: data.table
topic <- topics(lda, 1)
topics <- data.frame(Year=kdx$Year)
library(ggplot2)
qplot(Year, ..count.., data=topics, geom="density",
fill=term[topic], position="stack")+theme_bw()+
theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+
theme(axis.text.y = element_text(size = 12), axis.text.x = element_text(size = 12))
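# qplot() is deprecated in current ggplot2 releases, and Year is categorical
# here; a stacked bar chart is one sketch of the same topic-by-period view:
ggplot(topics, aes(Year, fill = term[topic])) + geom_bar() + theme_bw()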
tdm2 = as.matrix(mydata.dtm3)
head(tdm2)
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## -lane 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## -vehicle 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
## -way 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## acceleration 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
## activity 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## advisory 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
popu = sort(colSums(tdm2), decreasing=TRUE) # total term count per paper; the names are the paper (document) numbers
head(popu)
## 11 15 16 12 17 19
## 14 11 11 10 10 10
popu1 = data.frame(Paper = names(popu), Term_Count = popu) # paper number and its total term count
library(ggplot2)
ggplot(popu1[1:15,], aes(x=Term_Count, y=Paper)) + geom_point(size=5, colour="red") + coord_flip() +
ggtitle("Popularity") + theme_bw() + # theme_bw() must come before theme(), or it resets the text settings below
theme(axis.text.x=element_text(size=13,face="bold", colour="black"), axis.text.y=element_text(size=13,colour="black", face="bold"), axis.title.x=element_text(size=14, face="bold"), axis.title.y=element_text(size=14,face="bold"),
plot.title=element_text(size=24,face="bold"))
k <- 15
SEED <- 2010
library(topicmodels)
# note: tdm2 is terms-by-papers; LDA() expects documents in rows, so
# t(tdm2) (or the dtm object above) is probably what was intended here
jss_TM <- list(
VEM = LDA(tdm2, k = k, control = list(seed = SEED)),
VEM_fixed = LDA(tdm2, k = k, control = list(estimate.alpha = FALSE,
seed = SEED)),
Gibbs = LDA(tdm2, k = k, method = "Gibbs", control = list(
seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
CTM = CTM(tdm2, k = k, control = list(seed = SEED,
var = list(tol = 10^-4), em = list(tol = 10^-3))))
sapply(jss_TM[1:3], slot, "alpha")
## VEM VEM_fixed Gibbs
## 11.058651 3.333333 3.333333
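# Following the topicmodels vignette, the mean entropy of the fitted topic
# distributions indicates how evenly topics are spread over the documents
# (higher means more evenly spread):
sapply(jss_TM, function(x)
  mean(apply(posterior(x)$topics, 1, function(z) -sum(z * log(z)))))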
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following object is masked from 'package:reshape':
##
## rename
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# colnames(tdm2) are the paper numbers, not terms, so "words" below ends up
# holding document IDs; rownames(tdm2) would give the actual vocabulary
words <- tdm2 %>%
  as.matrix %>%
  colnames %>%
  (function(x) x[nchar(x) < 5])
length(words)
## [1] 20
head(words, 15)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15"
summary(nchar(words))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 2.00 1.55 2.00 2.00
table(nchar(words))
##
## 1 2
## 9 11
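# If the goal was the length distribution of the vocabulary rather than of
# the paper IDs, the terms sit in the rownames of tdm2:
vocab <- rownames(tdm2)
summary(nchar(vocab)) # word-length summary of the actual terms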
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
##
## The following objects are masked from 'package:dplyr':
##
## escape, explain
##
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
##
## The following object is masked from 'package:dplyr':
##
## id
##
##
## Attaching package: 'qdap'
##
## The following object is masked from 'package:dplyr':
##
## %>%
##
## The following object is masked from 'package:reshape':
##
## condense
##
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
##
## The following object is masked from 'package:base':
##
## Filter
dist_tab(nchar(words))
## interval freq cum.freq percent cum.percent
## 1 1 9 9 45 45
## 2 2 11 20 55 100
Conducted by: Subasish Das