setwd("/Users/subasishdas1/Copy/Rpubs/rpubs/kdix")
kdx <- read.csv("kdx.csv")
names(kdx)
## [1] "Year" "Paper" "Abstract"
options(warn=-1) # suppress warnings in the knitted output
head(kdx[-c(3)])
## Year
## 1 2001-2009
## 2 2010-2014
## 3 2001-2009
## 4 1996-2000
## 5 1996-2000
## 6 2010-2014
## Paper
## 1 Effect of Wireless Communication and Entertainment Devices on Simulated Driving Performance
## 2 \nAssessing How Drivers of Through Vehicles React to Driveway Activity
## 3 Urban Roadside Safety Cluster-Crash Evaluation
## 4 Capacity for North Carolina Freeway Work Zones
## 5 Estimating Free-Flow Speeds for Rural Multilane Highways
## 6 How Far from Optimal Are Current Advisory Speeds? Analysis Based on Safety Performance
table(kdx$Year)
##
## 1996-2000 2001-2009 2010-2014
## 6 7 7
# tapply() returns groups in sorted level order, so sort the Year labels
# to keep them aligned with the collapsed titles
kdx1 <- data.frame(Year = sort(unique(kdx$Year)),
                   Title = tapply(kdx$Paper, kdx$Year, paste, collapse = ' '))
names(kdx1)
## [1] "Year" "Title"
library(tm)
## Loading required package: NLP
mydata.corpus <- Corpus(VectorSource(kdx1$Title))
# normalise the encoding ('UTF-8-MAC' is the OS X iconv name; plain 'UTF-8'
# works elsewhere), substituting bytes that cannot be converted
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
# custom stopword list; note that stopwords('german') looks like a slip for
# stopwords('english'), which the long manual list below compensates for
my_stopwords <- c(stopwords('german'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "i-i-", "left", "right",
  "west", "when", "levels", "remaining", "based", "issues", "still", "off",
  "over", "only", "north", "past", "twin", "while", "i-w", "general",
  "harvey", "i-e", "i-i-", "i-us", "must", "more", "work", "read",
  "reached", "morrison", "mph", "three", "info", "canal", "camp", "la-",
  "approximately", "amp", "access", "approaching", "forest", "friday",
  "its", "affect", "after", "within", "what", "various", "under", "toward",
  "san", "other", "city", "into", "by", "for", "is", "are", "their", "he",
  "she", "research", "through", "between", "under", "below", "over",
  "with", "an", "affect", "nowadays", "present", "important",
  "significant", "then", "using", "having", "via", "vermont", "some",
  "rap", "how", "can")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1)
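# The same cleaning steps recur for each corpus below; a small helper
# (hypothetical, not part of the original script) could wrap them:
clean_corpus <- function(corp, extra_stopwords) {
  corp <- tm_map(corp, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
  corp <- tm_map(corp, content_transformer(tolower), mc.cores=1)
  corp <- tm_map(corp, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
  corp <- tm_map(corp, removeWords, extra_stopwords, mc.cores=1)
  tm_map(corp, removeNumbers, mc.cores=1)
}
# e.g. mydata.corpus <- clean_corpus(Corpus(VectorSource(kdx1$Title)), my_stopwords)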
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm
## <<TermDocumentMatrix (terms: 119, documents: 3)>>
## Non-/sparse entries: 142/215
## Sparsity : 60%
## Maximal term length: 24
## Weighting : term frequency (tf)
dim(mydata.dtm)
## [1] 119 3
inspect(mydata.dtm[1:3,1:3])
## <<TermDocumentMatrix (terms: 3, documents: 3)>>
## Non-/sparse entries: 3/6
## Sparsity : 67%
## Maximal term length: 8
## Weighting : term frequency (tf)
##
## Docs
## Terms 1 2 3
## -lane 0 0 1
## -turn 1 0 0
## -vehicle 0 2 0
DTM <- DocumentTermMatrix(mydata.corpus)
inspect(DTM[1:3,1:3])
## <<DocumentTermMatrix (documents: 3, terms: 3)>>
## Non-/sparse entries: 3/6
## Sparsity : 67%
## Maximal term length: 8
## Weighting : term frequency (tf)
##
## Terms
## Docs -lane -turn -vehicle
## 1 0 1 0
## 2 0 0 2
## 3 1 0 0
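# A DocumentTermMatrix is simply the transpose of a TermDocumentMatrix;
# tm's t() converts between the two, so the two inspections above agree:
all(as.matrix(DTM) == t(as.matrix(mydata.dtm))) # TRUE: same counts, transposed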
findFreqTerms(mydata.dtm, lowfreq=2)
## [1] "-vehicle" "acceleration" "analysis" "behavior"
## [5] "crashes" "data" "deceleration" "driveway"
## [9] "fatal" "georgia" "global" "highway"
## [13] "highways" "intersections" "models" "normal"
## [17] "passenger" "performance" "positioning" "road"
## [21] "rural" "safety" "speeds" "system"
## [25] "urban" "vehicles" "zones"
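# findFreqTerms() returns only the term names; the counts behind them come
# from summing each term's row over the three period-documents:
freqs <- sort(rowSums(as.matrix(mydata.dtm)), decreasing = TRUE)
head(freqs) # top title terms with their total frequencies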
# as with kdx1, sort the Year labels to match tapply's sorted output
kdx2 <- data.frame(Year = sort(unique(kdx$Year)),
                   Abs = tapply(kdx$Abstract, kdx$Year, paste, collapse = ' '))
names(kdx2)
## [1] "Year" "Abs"
mydata.corpus <- Corpus(VectorSource(kdx2$Abs))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
my_stopwords <- c(stopwords('german'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "end",
  "have", "avenue", "before", "just", "mac", "being", "when", "levels",
  "remaining", "based", "still", "off", "over", "only", "north", "past",
  "twin", "while", "i-w", "general", "harvey", "must", "more", "work",
  "read", "reached", "morrison", "mph", "three", "info", "canal", "camp",
  "la-", "approximately", "amp", "access", "approaching", "forest",
  "friday", "its", "affect", "after", "within", "what", "various", "under",
  "toward", "san", "other", "city", "into", "by", "for", "is", "are",
  "their", "he", "she", "research", "through", "between", "under", "below",
  "over", "with", "an", "affect", "nowadays", "present", "important",
  "significant", "then", "-lane", "can", "authors", "each", "finally",
  "included", "may", "most", "overall", "provides", "rates", "set", "such",
  "type", "uses", "used", "well", "were", "which")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1)
# build a term-document matrix
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm
## <<TermDocumentMatrix (terms: 1022, documents: 3)>>
## Non-/sparse entries: 1335/1731
## Sparsity : 56%
## Maximal term length: 24
## Weighting : term frequency (tf)
dim(mydata.dtm)
## [1] 1022 3
DTM <- DocumentTermMatrix(mydata.corpus)
findFreqTerms(mydata.dtm, lowfreq=20)
## [1] "crashes" "data" "fatal" "safety" "speed" "speeds" "study"
## [8] "urban"
findAssocs(mydata.dtm, 'speed', 0.99)
## speed
## effect 1.00
## evaluate 1.00
## facilities 1.00
## implementation 1.00
## model 1.00
## models 1.00
## often 1.00
## researchers 1.00
## urban 1.00
## locations 0.99
## roadside 0.99
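# findAssocs() scores terms by the Pearson correlation of their per-document
# counts with those of 'speed'; with only three documents, perfect
# correlations are common, which the table above reflects:
tdm_m <- as.matrix(mydata.dtm)
cor(tdm_m["speed", ], tdm_m["urban", ]) # ~1.00, as reported above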
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.005) # drop terms with sparsity above 0.5%, i.e. keep only terms occurring in all three periods
dim(mydata.dtm2)
## [1] 58 3
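# sanity check: with sparse=0.005 a term survives only if it appears in
# every document, i.e. in all three publication periods
all(rowSums(as.matrix(mydata.dtm2) > 0) == 3) # should be TRUE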
library(slam)
TDM.dense <- as.matrix(mydata.dtm2)
head(TDM.dense)
## Docs
## Terms 1 2 3
## analysis 2 3 7
## associated 3 2 3
## behavior 3 3 3
## characteristics 3 5 4
## collection 3 1 1
## compared 2 1 3
# write.csv(TDM.dense, "tdm_1.csv")
# (tdm_1.csv was then edited outside R: the document columns were renamed to
# the TRR period labels and a row-total Count column was added before it was
# read back in below)
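# A sketch of building the same table without leaving R (the TRR_* period
# labels are taken from the document order used throughout):
# tdm_1 <- data.frame(Term = rownames(TDM.dense), TRR96_00 = TDM.dense[, 1],
#                     TRR01_09 = TDM.dense[, 2], TRR10_14 = TDM.dense[, 3],
#                     Count = rowSums(TDM.dense))
# tdm_1 <- tdm_1[order(-tdm_1$Count), ]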
#### PLOT 1
tdm_1 <- read.csv("tdm_1.csv")
head(tdm_1)
## Term TRR96_00 TRR01_09 TRR10_14 Count
## 1 crashes 14 2 13 29
## 2 speed 6 14 9 29
## 3 safety 2 3 20 25
## 4 study 8 10 7 25
## 5 data 8 12 4 24
## 6 urban 3 13 7 23
tdm_BT_1 <- tdm_1[c(1,5)]
head(tdm_BT_1)
## Term Count
## 1 crashes 29
## 2 speed 29
## 3 safety 25
## 4 study 25
## 5 data 24
## 6 urban 23
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
p <- ggplot(tdm_BT_1, aes(x = reorder(Term, -Count), y = Count))
p <- p + geom_bar(stat = "identity")+theme_bw()
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p + labs(title = "Most Frequent Terms")+xlab("Terms")
### PLOT 2
library(reshape)
tdm_BT_2 = melt(tdm_1[1:4]) # reshape::melt has no value.name argument, so the measure column is simply named "value"
## Using Term as id variables
head(tdm_BT_2)
## Term variable value
## 1 crashes TRR96_00 14
## 2 speed TRR96_00 6
## 3 safety TRR96_00 2
## 4 study TRR96_00 8
## 5 data TRR96_00 8
## 6 urban TRR96_00 3
ggplot(tdm_BT_2, aes(x = variable, y = Term, fill = value)) +
geom_tile(colour = "white") +
scale_fill_gradient(high="#FF0000" , low="#FFFFFF")+
ylab("")+ xlab("TRR Papers")+
theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+ theme(axis.text.y = element_text(size = 12))
### PLOT 3
library(wordcloud)
## Loading required package: RColorBrewer
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
## Docs
## Terms 1 2 3
## -kmh 1 0 0
## -phase 0 1 0
## -ramp 1 0 0
## -to-one 1 0 0
## -vehicle 0 2 0
## -way 0 1 0
colnames(term.matrix) <- c("TRR96_00","TRR01_09", "TRR10_14")
comparison.cloud(term.matrix,max.words=100, random.order=FALSE)
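# comparison.cloud() sizes each word by how far its frequency in one period
# deviates from its average across all three periods. Word-cloud layouts are
# randomised, so fixing a seed (any value will do) makes the clouds below
# reproducible between knits:
set.seed(1234)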
m <- as.matrix(mydata.dtm2)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
wordcloud(d$word,d$freq)
pal <- brewer.pal(9,"BuGn")
pal <- pal[-(1:4)]
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=FALSE, rot.per=.15, colors=pal)
pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=TRUE, rot.per=.15, colors=pal)
#random colors
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=TRUE, random.color=TRUE, rot.per=.15, colors=pal)
##### with font #####
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, random.order=TRUE,
rot.per=.15, colors=pal, vfont=c("gothic english","plain"))
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, max.words=100,
random.order=TRUE, rot.per=.15, colors=pal, vfont=c("script","plain"))
wordcloud(d$word, d$freq, scale=c(8,.3), min.freq=2, max.words=100,
random.order=TRUE, rot.per=.15, colors=pal, vfont=c("serif","plain"))
mydata.corpus <- Corpus(VectorSource(kdx$Paper))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1)
my_stopwords <- c(stopwords('german'),"the", "due", "are", "not", "for", "this", "and", "that", "there", "new", "near", "beyond", "time", "from", "been", "both", "than", "has","now", "until", "all", "use", "two", "ave", "blvd", "east", "between", "end", "have", "avenue", "before", "just", "mac", "being", "when","levels","remaining","based", "still", "off", "over", "only", "north", "past", "twin", "while", "i-w" , "general" , "harvey", "must", "more", "work","read", "reached", "morrison", "mph", "three","info", "canal", "camp", "la-", "approximately", "amp", "access", "approaching", "forest", "friday", "its", "affect", "after", "within", "what", "various", "under", "toward", "san", "other" , "city", "into", "by", "for", "is", "are", "their", "he", "she", "research", "through", "between", "under", "below", "over", "with", "an", "affect", "nowadays", "present", "important", "significant", "then")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1)
# build a term-document matrix
mydata.dtm3 <- TermDocumentMatrix(mydata.corpus)
mydata.dtm3
## <<TermDocumentMatrix (terms: 120, documents: 20)>>
## Non-/sparse entries: 165/2235
## Sparsity : 93%
## Maximal term length: 24
## Weighting : term frequency (tf)
dim(mydata.dtm3)
## [1] 120 20
library(topicmodels)
dtm <- as.DocumentTermMatrix(mydata.dtm3) # coerce (transpose) to documents-in-rows for LDA
lda <- LDA(dtm, k = 4) # fit 4 topics
term <- terms(lda, 3) # top 3 terms of every topic
term
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "analysis" "data" "highways" "zones"
## [2,] "safety" "global" "performance" "georgia"
## [3,] "how" "positioning" "rural" "acceleration"
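# topics() and terms() report only the most likely assignments; the full
# per-paper topic distribution is available from posterior():
round(posterior(lda)$topics[1:3, ], 2) # topic probabilities for papers 1-3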
term <- apply(term, MARGIN = 2, paste, collapse = ", ")
# first topic identified for every document (paper)
require(data.table) # for IDate
## Loading required package: data.table
topic <- topics(lda, 1)
topics <- data.frame(Year=kdx$Year)
library(ggplot2)
qplot(Year, ..count.., data=topics, geom="density",
fill=term[topic], position="stack")+theme_bw()+
theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+
theme(axis.text.y = element_text(size = 12), axis.text.x = element_text(size = 12))
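# qplot() is deprecated in current ggplot2 releases, and Year is categorical
# here; a stacked bar chart is one sketch of the same topic-by-period view:
ggplot(topics, aes(Year, fill = term[topic])) + geom_bar() + theme_bw()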
tdm2 = as.matrix(mydata.dtm3)
head(tdm2)
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## -lane 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## -vehicle 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
## -way 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## acceleration 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
## activity 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## advisory 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
popu = sort(colSums(tdm2), decreasing=TRUE) # total term count per paper; the names are the paper (document) numbers
head(popu)
## 11 15 16 12 17 19
## 14 11 11 10 10 10
popu1 = data.frame(Paper = names(popu), Term_Count = popu) # paper number and its total term count
library(ggplot2)
ggplot(popu1[1:15,], aes(x=Term_Count, y=Paper)) + geom_point(size=5, colour="red") + coord_flip() +
ggtitle("Popularity") + theme_bw() + # theme_bw() must come before theme(), or it resets the text settings below
theme(axis.text.x=element_text(size=13,face="bold", colour="black"), axis.text.y=element_text(size=13,colour="black", face="bold"), axis.title.x=element_text(size=14, face="bold"), axis.title.y=element_text(size=14,face="bold"),
plot.title=element_text(size=24,face="bold"))
k <- 15
SEED <- 2010
library(topicmodels)
# note: tdm2 is terms-by-papers; LDA() expects documents in rows, so
# t(tdm2) (or the dtm object above) is probably what was intended here
jss_TM <- list(
VEM = LDA(tdm2, k = k, control = list(seed = SEED)),
VEM_fixed = LDA(tdm2, k = k, control = list(estimate.alpha = FALSE,
seed = SEED)),
Gibbs = LDA(tdm2, k = k, method = "Gibbs", control = list(
seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
CTM = CTM(tdm2, k = k, control = list(seed = SEED,
var = list(tol = 10^-4), em = list(tol = 10^-3))))
sapply(jss_TM[1:3], slot, "alpha")
## VEM VEM_fixed Gibbs
## 11.058651 3.333333 3.333333
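# Following the topicmodels vignette, the mean entropy of the fitted topic
# distributions indicates how evenly topics are spread over the documents
# (higher means more evenly spread):
sapply(jss_TM, function(x)
  mean(apply(posterior(x)$topics, 1, function(z) -sum(z * log(z)))))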
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following object is masked from 'package:reshape':
##
## rename
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# colnames(tdm2) are the paper numbers, not terms, so "words" below ends up
# holding document IDs; rownames(tdm2) would give the actual vocabulary
words <- tdm2 %>%
  as.matrix %>%
  colnames %>%
  (function(x) x[nchar(x) < 5])
length(words)
## [1] 20
head(words, 15)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15"
summary(nchar(words))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 2.00 1.55 2.00 2.00
table(nchar(words))
##
## 1 2
## 9 11
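# If the goal was the length distribution of the vocabulary rather than of
# the paper IDs, the terms sit in the rownames of tdm2:
vocab <- rownames(tdm2)
summary(nchar(vocab)) # word-length summary of the actual terms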
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
##
## The following objects are masked from 'package:dplyr':
##
## escape, explain
##
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
##
## The following object is masked from 'package:dplyr':
##
## id
##
##
## Attaching package: 'qdap'
##
## The following object is masked from 'package:dplyr':
##
## %>%
##
## The following object is masked from 'package:reshape':
##
## condense
##
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
##
## The following object is masked from 'package:base':
##
## Filter
dist_tab(nchar(words))
## interval freq cum.freq percent cum.percent
## 1 1 9 9 45 45
## 2 2 11 20 55 100
Conducted by: Subasish Das