Paper Title

Papers per year

# Load the paper records from TRB_AM.csv.
# The data directory is held in one variable and joined with file.path(), so
# the script no longer changes the session's working directory with setwd().
data_dir <- "/Users/subasishdas1/Copy/Rpubs/rpubs/tm_rpubs"
data_path <- file.path(data_dir, "TRB_AM.csv")
# Fail early with the full path in the message if the input is missing.
if (!file.exists(data_path)) {
  stop("Input file not found: ", data_path, call. = FALSE)
}
data <- read.csv(data_path)
dim(data)
## [1] 45933     7
# Keep the first row for each distinct value found in the second column.
# NOTE(review): the column is addressed by position (2) -- confirm which field
# this is before relying on it as the de-duplication key.
bt_tw2 <- data[!duplicated(data[[2]]), ]
dim(bt_tw2)
## [1] 15361     7
# Retain only rows presented after 2006; which() discards rows whose year
# comparison is NA, matching what subset() does.
bt_tw3 <- bt_tw2[which(bt_tw2$Year.the.Paper.was.Presented > 2006), ]
dim(bt_tw3)
## [1] 15357     7
### stm for all documents (the text modelled below is bt_tw3$Paper.Abstract)
### stm
library(stm)
## stm v1.1.3 (2016-01-14) successfully loaded. See ?stm for help.
# Extra stop words removed in addition to textProcessor()'s built-in list.
# The vector is kept exactly as originally written, including its repeated
# entries and the misspelling "reveiw".
trb_stopwords <- c(
  "the", "due", "traffic", "model", "next", "are", "not", "for", "this", "and",
  "papers", "reveiw", "that", "there", "new", "near", "beyond", "time", "from",
  "been", "both", "than", "review", "subcommittee", "has", "now", "until",
  "all", "use", "two", "ave", "blvd", "east", "between", "ccc", "end", "have",
  "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle", "gen", "okeefe",
  "one", "just", "mac", "being", "i-i-", "left", "right", "west", "when",
  "levels", "remaining", "based", "issues", "still", "off", "over", "only",
  "north", "past", "twin", "while", "i-w", "general", "harvey", "i-e", "i-i-",
  "i-us", "must", "more", "work", "read", "reached", "morrison", "mph",
  "three", "info", "canal", "camp", "la-", "approximately", "amp",
  "access", "approaching", "forest", "friday", "its", "affect", "after",
  "within", "what", "various", "under", "toward", "san", "other", "city",
  "into", "by", "for", "is", "are", "their", "he", "she", "research",
  "through", "between", "under", "below", "over", "with", "an", "affect",
  "nowadays", "present", "important", "significant",
  "then", "using", "having", "via", "vermont", "some", "rap", "how", "can",
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial", "inc",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological",
  "technology", "toronto", "transit", "transport", "transportation", "united",
  "universidad", "universitat", "university", "ahd", "paper", "ahn", "special",
  "call", "ahd",
  "activities", "cfp", "toledo", "geroliminis", "study", "case", "assessment",
  "analysis", "approach"
)

# Pre-process the abstracts without stemming; the filtered data frame is
# passed as metadata so it travels with the processed documents.
processed <- textProcessor(
  documents = bt_tw3$Paper.Abstract,
  metadata = bt_tw3,
  stem = FALSE,
  customstopwords = trb_stopwords
)
## Building corpus... 
## Converting to Lower Case... 
## Removing stopwords... 
## Remove Custom Stopwords...
## Removing numbers... 
## Removing punctuation... 
## Creating Output...
# Build the modelling corpus with prepDocuments() defaults, then keep handles
# to its three components (documents, vocabulary, metadata).
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)
## Removing 19270 of 40044 terms (19270 of 1221726 tokens) due to frequency 
## Removing 1 Documents with No Words 
## Your corpus now has 15356 documents, 20774 terms and 1202456 tokens.
meta <- out[["meta"]]
vocab <- out[["vocab"]]
docs <- out[["documents"]]

#### 10 topics
# Rebuild the corpus with lower.thresh = 200, which removes infrequent terms
# (see the counts reported below), replacing the earlier `out`.
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 200
)
## Removing 38654 of 40044 terms (398447 of 1221726 tokens) due to frequency 
## Removing 8 Documents with No Words 
## Your corpus now has 15349 documents, 1390 terms and 823279 tokens.
# Refresh all three cached components. Previously only `docs` was updated, so
# `vocab` and `meta` kept describing the earlier, unthresholded corpus.
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

# Fit a 10-topic structural topic model; topic prevalence is modelled as a
# function of presentation year and reviewing committee, with spectral
# initialisation.
poliblogPrevFit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 10,
  prevalence = ~ Year.the.Paper.was.Presented + Reviewing.Committee.s.Name,
  data = out$meta,
  init.type = "Spectral"
)
## Beginning Initialization.
##   Calculating the gram matrix...
##   Finding anchor words...
##      ..........
##   Recovering initialization...
##      .............
## Initialization complete.
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (2 seconds). 
## Completing Iteration 1 (approx. per word bound = -6.705) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (2 seconds). 
## Completing Iteration 2 (approx. per word bound = -6.576, relative change = 1.914e-02) 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (2 seconds). 
## Completing Iteration 3 (approx. per word bound = -6.539, relative change = 5.737e-03) 
## ....................................................................................................
## Completed E-Step (7 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -6.523, relative change = 2.411e-03) 
## ....................................................................................................
## Completed E-Step (7 seconds). 
## Completed M-Step (2 seconds). 
## Completing Iteration 5 (approx. per word bound = -6.515, relative change = 1.243e-03) 
## Topic 1: travel, survey, choice, results, mode 
##  Topic 2: test, results, mixtures, tests, temperature 
##  Topic 3: pavement, design, data, performance, pavements 
##  Topic 4: drivers, crash, speed, crashes, driving 
##  Topic 5: bridge, project, system, design, projects 
##  Topic 6: emissions, vehicle, fuel, vehicles, energy 
##  Topic 7: data, network, travel, models, proposed 
##  Topic 8: lane, control, signal, vehicles, capacity 
##  Topic 9: service, bus, rail, system, passenger 
##  Topic 10: concrete, aggregate, materials, strength, surface 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step (2 seconds). 
## Completing Iteration 6 (approx. per word bound = -6.510, relative change = 7.038e-04) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -6.507, relative change = 4.302e-04) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -6.506, relative change = 2.580e-04) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -6.505, relative change = 1.569e-04) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step (2 seconds). 
## Completing Iteration 10 (approx. per word bound = -6.504, relative change = 9.343e-05) 
## Topic 1: travel, survey, choice, activity, results 
##  Topic 2: test, mixtures, tests, results, temperature 
##  Topic 3: pavement, design, data, performance, pavements 
##  Topic 4: drivers, speed, crash, crashes, driving 
##  Topic 5: system, bridge, project, management, agencies 
##  Topic 6: vehicle, emissions, fuel, vehicles, costs 
##  Topic 7: data, network, models, travel, proposed 
##  Topic 8: lane, control, pedestrian, vehicles, signal 
##  Topic 9: service, bus, rail, system, passenger 
##  Topic 10: concrete, materials, aggregate, strength, surface 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Completing Iteration 11 (approx. per word bound = -6.504, relative change = 5.246e-05) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Completing Iteration 12 (approx. per word bound = -6.504, relative change = 2.857e-05) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Completing Iteration 13 (approx. per word bound = -6.503, relative change = 1.475e-05) 
## ....................................................................................................
## Completed E-Step (6 seconds). 
## Completed M-Step. 
## Model Converged
# Print the top words for each of topics 1 through 10 of the fitted model.
labelTopics(poliblogPrevFit, topics = 1:10)
## Topic 1 Top Words:
##       Highest Prob: travel, choice, survey, activity, results, mode, behavior 
##       FREX: parking, household, bicycle, land, choice, activity, survey 
##       Lift: parking, household, respondents, households, individuals, home, residents 
##       Score: parking, travel, household, choice, trip, bicycle, households 
## Topic 2 Top Words:
##       Highest Prob: test, mixtures, tests, temperature, results, modulus, binder 
##       FREX: binder, modulus, temperature, mixtures, mixture, test, fatigue 
##       Lift: binders, binder, temperatures, stress, strain, deformation, permanent 
##       Score: binders, mixtures, binder, modulus, hma, moisture, temperature 
## Topic 3 Top Words:
##       Highest Prob: pavement, design, data, performance, pavements, used, condition 
##       FREX: pavement, pavements, condition, sections, mepdg, rehabilitation, distress 
##       Lift: mepdg, mechanisticempirical, distress, pavement, rehabilitation, deterioration, sections 
##       Score: mepdg, pavement, pavements, distress, cracking, thickness, mechanisticempirical 
## Topic 4 Top Words:
##       Highest Prob: drivers, speed, crash, crashes, driving, data, driver 
##       FREX: crash, crashes, drivers, driver, driving, weather, speed 
##       Lift: injury, fatal, crashes, crash, collisions, accident, collision 
##       Score: injury, crashes, crash, drivers, driver, driving, speed 
## Topic 5 Top Words:
##       Highest Prob: system, bridge, project, management, agencies, performance, process 
##       FREX: bridge, projects, agencies, management, project, practices, challenges 
##       Lift: bridge, funding, stakeholders, lessons, practices, departments, projects 
##       Score: bridge, projects, project, management, bridges, public, funding 
## Topic 6 Top Words:
##       Highest Prob: vehicle, emissions, fuel, vehicles, costs, cost, truck 
##       FREX: emissions, fuel, emission, consumption, noise, truck, energy 
##       Lift: noise, emissions, emission, fuel, greenhouse, consumption, gas 
##       Score: noise, emissions, fuel, emission, vehicle, vehicles, energy 
## Topic 7 Top Words:
##       Highest Prob: data, network, models, proposed, method, travel, results 
##       FREX: algorithm, estimation, problem, network, link, algorithms, assignment 
##       Lift: incident, assignment, algorithms, algorithm, equilibrium, stochastic, computational 
##       Score: incident, travel, algorithm, network, data, models, assignment 
## Topic 8 Top Words:
##       Highest Prob: lane, control, pedestrian, vehicles, signal, lanes, flow 
##       FREX: signal, lanes, pedestrian, intersection, lane, intersections, turn 
##       Lift: turn, signal, signalized, vissim, queue, lanes, intersection 
##       Score: turn, pedestrian, lane, intersections, intersection, signal, signalized 
## Topic 9 Top Words:
##       Highest Prob: service, bus, rail, system, passenger, services, freight 
##       FREX: bus, rail, services, stations, passengers, passenger, station 
##       Lift: bus, rail, train, buses, station, passengers, ridership 
##       Score: bus, rail, passengers, service, passenger, buses, freight 
## Topic 10 Top Words:
##       Highest Prob: concrete, materials, aggregate, strength, material, used, results 
##       FREX: concrete, materials, water, strength, aggregate, steel, cement 
##       Lift: aggregates, cement, reinforced, steel, durability, water, concrete 
##       Score: aggregates, concrete, cement, materials, steel, soil, strength
# Summary plot of the fitted model. Go through the plot() generic so S3
# dispatch selects the STM method; calling plot.STM() by name only works when
# the installed stm version exports that method.
plot(poliblogPrevFit, type = "summary")

# Per-topic quality diagnostics for the fitted model, computed against the
# documents in `docs`; the call prints two numeric vectors, one value per topic.
topicQuality(model = poliblogPrevFit, documents = docs)
##  [1]  -93.72068  -75.22895  -87.26924  -74.95874  -94.00450  -95.56113
##  [7]  -79.04181  -90.05502 -105.19811  -99.33002
##  [1] 9.568776 9.792392 9.718656 9.460848 9.309246 9.867268 9.228019
##  [8] 9.884891 9.868550 9.675392

#### 20 topics
# The heading previously said "50 topics", but the model is fitted with K = 20
# and the console output below lists 20 topics.
docs <- out$documents
# Same prevalence specification and spectral initialisation as the 10-topic
# fit. Note that this assignment overwrites the 10-topic model previously held
# in poliblogPrevFit.
poliblogPrevFit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 20,
  prevalence = ~ Year.the.Paper.was.Presented + Reviewing.Committee.s.Name,
  data = out$meta,
  init.type = "Spectral"
)
## Beginning Initialization.
##   Calculating the gram matrix...
##   Finding anchor words...
##      ....................
##   Recovering initialization...
##      .............
## Initialization complete.
## ....................................................................................................
## Completed E-Step (12 seconds). 
## Completed M-Step (5 seconds). 
## Completing Iteration 1 (approx. per word bound = -6.580) 
## ....................................................................................................
## Completed E-Step (11 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 2 (approx. per word bound = -6.479, relative change = 1.546e-02) 
## ....................................................................................................
## Completed E-Step (11 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 3 (approx. per word bound = -6.450, relative change = 4.491e-03) 
## ....................................................................................................
## Completed E-Step (10 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 4 (approx. per word bound = -6.437, relative change = 1.971e-03) 
## ....................................................................................................
## Completed E-Step (10 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 5 (approx. per word bound = -6.430, relative change = 1.096e-03) 
## Topic 1: parking, urban, car, areas, land 
##  Topic 2: mixtures, test, binder, performance, mixture 
##  Topic 3: pavement, design, data, performance, models 
##  Topic 4: crash, crashes, factors, risk, data 
##  Topic 5: bridge, design, bridges, steel, system 
##  Topic 6: air, quality, noise, airport, method 
##  Topic 7: data, method, travel, information, models 
##  Topic 8: lane, capacity, control, signal, lanes 
##  Topic 9: service, bus, passenger, rail, system 
##  Topic 10: concrete, aggregate, materials, strength, material 
##  Topic 11: pedestrian, pedestrians, crossing, street, walking 
##  Topic 12: freight, truck, models, demand, data 
##  Topic 13: vehicle, emissions, fuel, vehicles, energy 
##  Topic 14: network, problem, route, proposed, travel 
##  Topic 15: travel, choice, survey, activity, mode 
##  Topic 16: pavement, surface, sections, friction, treatment 
##  Topic 17: speed, drivers, driving, driver, vehicle 
##  Topic 18: project, management, agencies, system, projects 
##  Topic 19: test, results, soil, load, tests 
##  Topic 20: fatigue, damage, cracking, loading, test 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 6 (approx. per word bound = -6.426, relative change = 6.438e-04) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 7 (approx. per word bound = -6.423, relative change = 4.166e-04) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 8 (approx. per word bound = -6.421, relative change = 2.654e-04) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 9 (approx. per word bound = -6.420, relative change = 1.701e-04) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 10 (approx. per word bound = -6.420, relative change = 1.110e-04) 
## Topic 1: urban, parking, areas, car, congestion 
##  Topic 2: mixtures, test, binder, mix, mixture 
##  Topic 3: design, pavement, performance, data, models 
##  Topic 4: crash, crashes, risk, factors, data 
##  Topic 5: bridge, design, bridges, steel, system 
##  Topic 6: quality, air, noise, airport, method 
##  Topic 7: data, method, travel, information, used 
##  Topic 8: lane, control, capacity, flow, lanes 
##  Topic 9: service, bus, rail, passenger, system 
##  Topic 10: concrete, materials, aggregate, strength, material 
##  Topic 11: pedestrian, pedestrians, street, crossing, walking 
##  Topic 12: demand, truck, models, freight, data 
##  Topic 13: vehicle, emissions, vehicles, fuel, energy 
##  Topic 14: network, problem, proposed, route, algorithm 
##  Topic 15: travel, choice, survey, activity, mode 
##  Topic 16: pavement, surface, sections, maintenance, pavements 
##  Topic 17: speed, drivers, driving, driver, vehicle 
##  Topic 18: project, management, agencies, system, projects 
##  Topic 19: test, results, tests, load, soil 
##  Topic 20: fatigue, damage, cracking, loading, failure 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (3 seconds). 
## Completing Iteration 11 (approx. per word bound = -6.419, relative change = 7.557e-05) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (3 seconds). 
## Completing Iteration 12 (approx. per word bound = -6.419, relative change = 5.888e-05) 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (4 seconds). 
## Completing Iteration 13 (approx. per word bound = -6.418, relative change = 4.245e-05) 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (3 seconds). 
## Completing Iteration 14 (approx. per word bound = -6.418, relative change = 3.428e-05) 
## ....................................................................................................
## Completed E-Step (9 seconds). 
## Completed M-Step (3 seconds). 
## Completing Iteration 15 (approx. per word bound = -6.418, relative change = 2.772e-05) 
## Topic 1: urban, parking, areas, congestion, car 
##  Topic 2: mixtures, test, binder, mix, mixture 
##  Topic 3: design, pavement, models, performance, data 
##  Topic 4: crash, crashes, risk, factors, weather 
##  Topic 5: bridge, bridges, design, steel, structures 
##  Topic 6: quality, air, value, noise, airport 
##  Topic 7: data, method, information, travel, used 
##  Topic 8: lane, control, flow, capacity, lanes 
##  Topic 9: service, bus, rail, passenger, system 
##  Topic 10: concrete, materials, aggregate, strength, material 
##  Topic 11: pedestrian, pedestrians, street, crossing, walking 
##  Topic 12: demand, models, truck, freight, spatial 
##  Topic 13: vehicle, emissions, vehicles, fuel, cost 
##  Topic 14: network, problem, proposed, route, algorithm 
##  Topic 15: travel, choice, survey, behavior, mode 
##  Topic 16: pavement, surface, maintenance, pavements, sections 
##  Topic 17: speed, drivers, driving, driver, vehicle 
##  Topic 18: project, management, agencies, system, projects 
##  Topic 19: test, results, tests, parameters, soil 
##  Topic 20: fatigue, cracking, damage, loading, failure 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (3 seconds). 
## Completing Iteration 16 (approx. per word bound = -6.418, relative change = 2.198e-05) 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (3 seconds). 
## Completing Iteration 17 (approx. per word bound = -6.418, relative change = 1.230e-05) 
## ....................................................................................................
## Completed E-Step (8 seconds). 
## Completed M-Step (3 seconds). 
## Model Converged
# Summary plot of the model currently held in poliblogPrevFit. Use the plot()
# generic so S3 dispatch selects the STM method; calling plot.STM() by name
# only works when the installed stm version exports that method.
plot(poliblogPrevFit, type = "summary")

# Per-topic quality diagnostics for the same model, computed against `docs`;
# the call prints two numeric vectors, one value per topic.
topicQuality(model = poliblogPrevFit, documents = docs)
##  [1] -101.92734  -54.30599  -95.54238  -94.09291  -90.69681 -128.42601
##  [7]  -82.60370  -84.98504 -105.41771  -87.19903 -105.52033 -105.89442
## [13]  -93.13068  -85.99347  -96.53880 -105.07810  -95.03942  -85.47494
## [19] -101.76570  -89.95134
##  [1] 9.917536 9.874393 9.707715 9.737327 9.688569 9.916309 9.562054
##  [8] 9.918612 9.934172 9.962553 9.864089 9.806485 9.924639 9.891028
## [15] 9.739729 9.932881 9.717994 9.368588 9.664888 9.902868

# Estimate the topic correlation structure of the fitted model and draw it as
# a graph. plot() dispatches to the topicCorr method, so this does not depend
# on plot.topicCorr() being exported by the installed stm version.
mod.out.corr <- topicCorr(poliblogPrevFit)
plot(
  mod.out.corr,
  vertex.color = "white",
  vertex.label.cex = 0.95,
  vertex.label.color = "black"
)