# Papers per year
# Load the paper records from TRB_AM.csv and reduce them to the analysis set.
# The file is read through an explicit path rather than via setwd(), so running
# this script no longer changes the session's working directory.
# NOTE(review): the directory below is machine-specific; point `data_dir` at the
# folder that holds TRB_AM.csv before running this elsewhere.
data_dir <- "/Users/subasishdas1/Copy/Rpubs/rpubs/tm_rpubs"
data <- read.csv(file.path(data_dir, "TRB_AM.csv"))
dim(data)
## [1] 45933 7
# Keep only the first row for each distinct value of column 2.
# NOTE(review): the column is addressed by position; presumably it identifies
# the paper (id or title) -- confirm and switch to the column name.
bt_tw2 <- data[!duplicated(data[, 2]), ]
dim(bt_tw2)
## [1] 15361 7
# Keep papers presented after 2006; subset() also drops rows whose year is NA.
bt_tw3 <- subset(bt_tw2, Year.the.Paper.was.Presented > 2006)
dim(bt_tw3)
## [1] 15357 7
### stm for all documents (text taken from the Paper.Abstract column)
### stm
library(stm)
## stm v1.1.3 (2016-01-14) successfully loaded. See ?stm for help.
# Extra stopwords removed in addition to textProcessor's built-in handling.
# The entries are kept exactly as originally listed, including the ones that
# appear more than once (e.g. "i-i-", "inc", "ahd").
custom_stopwords <- c(
  "the", "due", "traffic", "model", "next", "are", "not", "for", "this", "and",
  "papers", "reveiw", "that", "there", "new", "near", "beyond", "time", "from",
  "been", "both", "than", "review", "subcommittee", "has", "now", "until",
  "all", "use", "two", "ave", "blvd", "east", "between", "ccc", "end", "have",
  "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle", "gen", "okeefe",
  "one", "just", "mac", "being", "i-i-", "left", "right", "west", "when",
  "levels", "remaining", "based", "issues", "still", "off", "over", "only",
  "north", "past", "twin", "while", "i-w", "general", "harvey", "i-e", "i-i-",
  "i-us", "must", "more", "work", "read", "reached", "morrison", "mph",
  "three", "info", "canal", "camp", "la-", "approximately", "amp",
  "access", "approaching", "forest", "friday", "its", "affect", "after",
  "within", "what", "various", "under", "toward", "san", "other", "city",
  "into", "by", "for", "is", "are", "their", "he", "she", "research",
  "through", "between", "under", "below", "over", "with", "an", "affect",
  "nowadays", "present", "important", "significant",
  "then", "using", "having", "via", "vermont", "some", "rap", "how", "can",
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial", "inc",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological",
  "technology", "toronto", "transit", "transport", "transportation", "united",
  "universidad", "universitat", "university", "ahd", "paper", "ahn", "special",
  "call", "ahd",
  "activities", "cfp", "toledo", "geroliminis", "study", "case", "assessment",
  "analysis", "approach"
)

# Pre-process the abstracts without stemming; the full filtered data frame is
# passed along as metadata.
processed <- textProcessor(
  documents = bt_tw3$Paper.Abstract,
  metadata = bt_tw3,
  stem = FALSE,
  customstopwords = custom_stopwords
)
## Building corpus...
## Converting to Lower Case...
## Removing stopwords...
## Remove Custom Stopwords...
## Removing numbers...
## Removing punctuation...
## Creating Output...
# Prepare the processed corpus with prepDocuments' default frequency threshold.
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)
## Removing 19270 of 40044 terms (19270 of 1221726 tokens) due to frequency
## Removing 1 Documents with No Words
## Your corpus now has 15356 documents, 20774 terms and 1202456 tokens.
# Convenience aliases for the three prepared components.
meta <- out$meta
vocab <- out$vocab
docs <- out$documents
#### 10 topics
# Re-prepare the corpus with a stricter frequency cut-off (lower.thresh = 200);
# this overwrites the `out` produced by the earlier default-threshold call.
out <- prepDocuments(processed$documents, processed$vocab, processed$meta,
                     lower.thresh = 200)
## Removing 38654 of 40044 terms (398447 of 1221726 tokens) due to frequency
## Removing 8 Documents with No Words
## Your corpus now has 15349 documents, 1390 terms and 823279 tokens.
# Refresh every alias, not only `docs`: `vocab` and `meta` left over from the
# earlier call describe a different vocabulary and document set, so they would
# not line up with these documents.
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
# Fit a 10-topic structural topic model; topic prevalence is modelled on the
# presentation year and the reviewing committee, with spectral initialisation.
poliblogPrevFit <- stm(out$documents, out$vocab, K = 10,
                       prevalence = ~ Year.the.Paper.was.Presented +
                         Reviewing.Committee.s.Name,
                       data = out$meta, init.type = "Spectral")
## Beginning Initialization.
## Calculating the gram matrix...
## Finding anchor words...
## ..........
## Recovering initialization...
## .............
## Initialization complete.
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (2 seconds).
## Completing Iteration 1 (approx. per word bound = -6.705)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (2 seconds).
## Completing Iteration 2 (approx. per word bound = -6.576, relative change = 1.914e-02)
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (2 seconds).
## Completing Iteration 3 (approx. per word bound = -6.539, relative change = 5.737e-03)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 4 (approx. per word bound = -6.523, relative change = 2.411e-03)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step (2 seconds).
## Completing Iteration 5 (approx. per word bound = -6.515, relative change = 1.243e-03)
## Topic 1: travel, survey, choice, results, mode
## Topic 2: test, results, mixtures, tests, temperature
## Topic 3: pavement, design, data, performance, pavements
## Topic 4: drivers, crash, speed, crashes, driving
## Topic 5: bridge, project, system, design, projects
## Topic 6: emissions, vehicle, fuel, vehicles, energy
## Topic 7: data, network, travel, models, proposed
## Topic 8: lane, control, signal, vehicles, capacity
## Topic 9: service, bus, rail, system, passenger
## Topic 10: concrete, aggregate, materials, strength, surface
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step (2 seconds).
## Completing Iteration 6 (approx. per word bound = -6.510, relative change = 7.038e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 7 (approx. per word bound = -6.507, relative change = 4.302e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 8 (approx. per word bound = -6.506, relative change = 2.580e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 9 (approx. per word bound = -6.505, relative change = 1.569e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step (2 seconds).
## Completing Iteration 10 (approx. per word bound = -6.504, relative change = 9.343e-05)
## Topic 1: travel, survey, choice, activity, results
## Topic 2: test, mixtures, tests, results, temperature
## Topic 3: pavement, design, data, performance, pavements
## Topic 4: drivers, speed, crash, crashes, driving
## Topic 5: system, bridge, project, management, agencies
## Topic 6: vehicle, emissions, fuel, vehicles, costs
## Topic 7: data, network, models, travel, proposed
## Topic 8: lane, control, pedestrian, vehicles, signal
## Topic 9: service, bus, rail, system, passenger
## Topic 10: concrete, materials, aggregate, strength, surface
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 11 (approx. per word bound = -6.504, relative change = 5.246e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 12 (approx. per word bound = -6.504, relative change = 2.857e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 13 (approx. per word bound = -6.503, relative change = 1.475e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Model Converged
# Print the top words for each of the ten topics of the fitted model.
labelTopics(poliblogPrevFit, topics = seq_len(10))
## Topic 1 Top Words:
## Highest Prob: travel, choice, survey, activity, results, mode, behavior
## FREX: parking, household, bicycle, land, choice, activity, survey
## Lift: parking, household, respondents, households, individuals, home, residents
## Score: parking, travel, household, choice, trip, bicycle, households
## Topic 2 Top Words:
## Highest Prob: test, mixtures, tests, temperature, results, modulus, binder
## FREX: binder, modulus, temperature, mixtures, mixture, test, fatigue
## Lift: binders, binder, temperatures, stress, strain, deformation, permanent
## Score: binders, mixtures, binder, modulus, hma, moisture, temperature
## Topic 3 Top Words:
## Highest Prob: pavement, design, data, performance, pavements, used, condition
## FREX: pavement, pavements, condition, sections, mepdg, rehabilitation, distress
## Lift: mepdg, mechanisticempirical, distress, pavement, rehabilitation, deterioration, sections
## Score: mepdg, pavement, pavements, distress, cracking, thickness, mechanisticempirical
## Topic 4 Top Words:
## Highest Prob: drivers, speed, crash, crashes, driving, data, driver
## FREX: crash, crashes, drivers, driver, driving, weather, speed
## Lift: injury, fatal, crashes, crash, collisions, accident, collision
## Score: injury, crashes, crash, drivers, driver, driving, speed
## Topic 5 Top Words:
## Highest Prob: system, bridge, project, management, agencies, performance, process
## FREX: bridge, projects, agencies, management, project, practices, challenges
## Lift: bridge, funding, stakeholders, lessons, practices, departments, projects
## Score: bridge, projects, project, management, bridges, public, funding
## Topic 6 Top Words:
## Highest Prob: vehicle, emissions, fuel, vehicles, costs, cost, truck
## FREX: emissions, fuel, emission, consumption, noise, truck, energy
## Lift: noise, emissions, emission, fuel, greenhouse, consumption, gas
## Score: noise, emissions, fuel, emission, vehicle, vehicles, energy
## Topic 7 Top Words:
## Highest Prob: data, network, models, proposed, method, travel, results
## FREX: algorithm, estimation, problem, network, link, algorithms, assignment
## Lift: incident, assignment, algorithms, algorithm, equilibrium, stochastic, computational
## Score: incident, travel, algorithm, network, data, models, assignment
## Topic 8 Top Words:
## Highest Prob: lane, control, pedestrian, vehicles, signal, lanes, flow
## FREX: signal, lanes, pedestrian, intersection, lane, intersections, turn
## Lift: turn, signal, signalized, vissim, queue, lanes, intersection
## Score: turn, pedestrian, lane, intersections, intersection, signal, signalized
## Topic 9 Top Words:
## Highest Prob: service, bus, rail, system, passenger, services, freight
## FREX: bus, rail, services, stations, passengers, passenger, station
## Lift: bus, rail, train, buses, station, passengers, ridership
## Score: bus, rail, passengers, service, passenger, buses, freight
## Topic 10 Top Words:
## Highest Prob: concrete, materials, aggregate, strength, material, used, results
## FREX: concrete, materials, water, strength, aggregate, steel, cement
## Lift: aggregates, cement, reinforced, steel, durability, water, concrete
## Score: aggregates, concrete, cement, materials, steel, soil, strength
# Draw the model's summary plot through the plot() generic. Calling the S3
# method plot.STM() by name bypasses dispatch and fails when the package only
# registers the method without exporting it.
plot(poliblogPrevFit, type = "summary")

# Topic quality for the fitted model, evaluated on the prepared documents.
topicQuality(model = poliblogPrevFit, documents = docs)
## [1] -93.72068 -75.22895 -87.26924 -74.95874 -94.00450 -95.56113
## [7] -79.04181 -90.05502 -105.19811 -99.33002
## [1] 9.568776 9.792392 9.718656 9.460848 9.309246 9.867268 9.228019
## [8] 9.884891 9.868550 9.675392

#### 20 topics
# Same prepared documents and prevalence covariates as the 10-topic fit, now
# with K = 20. The result replaces the model stored in poliblogPrevFit.
docs <- out$documents
poliblogPrevFit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 20,
  prevalence = ~ Year.the.Paper.was.Presented + Reviewing.Committee.s.Name,
  data = out$meta,
  init.type = "Spectral"
)
## Beginning Initialization.
## Calculating the gram matrix...
## Finding anchor words...
## ....................
## Recovering initialization...
## .............
## Initialization complete.
## ....................................................................................................
## Completed E-Step (12 seconds).
## Completed M-Step (5 seconds).
## Completing Iteration 1 (approx. per word bound = -6.580)
## ....................................................................................................
## Completed E-Step (11 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 2 (approx. per word bound = -6.479, relative change = 1.546e-02)
## ....................................................................................................
## Completed E-Step (11 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 3 (approx. per word bound = -6.450, relative change = 4.491e-03)
## ....................................................................................................
## Completed E-Step (10 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 4 (approx. per word bound = -6.437, relative change = 1.971e-03)
## ....................................................................................................
## Completed E-Step (10 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 5 (approx. per word bound = -6.430, relative change = 1.096e-03)
## Topic 1: parking, urban, car, areas, land
## Topic 2: mixtures, test, binder, performance, mixture
## Topic 3: pavement, design, data, performance, models
## Topic 4: crash, crashes, factors, risk, data
## Topic 5: bridge, design, bridges, steel, system
## Topic 6: air, quality, noise, airport, method
## Topic 7: data, method, travel, information, models
## Topic 8: lane, capacity, control, signal, lanes
## Topic 9: service, bus, passenger, rail, system
## Topic 10: concrete, aggregate, materials, strength, material
## Topic 11: pedestrian, pedestrians, crossing, street, walking
## Topic 12: freight, truck, models, demand, data
## Topic 13: vehicle, emissions, fuel, vehicles, energy
## Topic 14: network, problem, route, proposed, travel
## Topic 15: travel, choice, survey, activity, mode
## Topic 16: pavement, surface, sections, friction, treatment
## Topic 17: speed, drivers, driving, driver, vehicle
## Topic 18: project, management, agencies, system, projects
## Topic 19: test, results, soil, load, tests
## Topic 20: fatigue, damage, cracking, loading, test
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 6 (approx. per word bound = -6.426, relative change = 6.438e-04)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 7 (approx. per word bound = -6.423, relative change = 4.166e-04)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 8 (approx. per word bound = -6.421, relative change = 2.654e-04)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 9 (approx. per word bound = -6.420, relative change = 1.701e-04)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 10 (approx. per word bound = -6.420, relative change = 1.110e-04)
## Topic 1: urban, parking, areas, car, congestion
## Topic 2: mixtures, test, binder, mix, mixture
## Topic 3: design, pavement, performance, data, models
## Topic 4: crash, crashes, risk, factors, data
## Topic 5: bridge, design, bridges, steel, system
## Topic 6: quality, air, noise, airport, method
## Topic 7: data, method, travel, information, used
## Topic 8: lane, control, capacity, flow, lanes
## Topic 9: service, bus, rail, passenger, system
## Topic 10: concrete, materials, aggregate, strength, material
## Topic 11: pedestrian, pedestrians, street, crossing, walking
## Topic 12: demand, truck, models, freight, data
## Topic 13: vehicle, emissions, vehicles, fuel, energy
## Topic 14: network, problem, proposed, route, algorithm
## Topic 15: travel, choice, survey, activity, mode
## Topic 16: pavement, surface, sections, maintenance, pavements
## Topic 17: speed, drivers, driving, driver, vehicle
## Topic 18: project, management, agencies, system, projects
## Topic 19: test, results, tests, load, soil
## Topic 20: fatigue, damage, cracking, loading, failure
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (3 seconds).
## Completing Iteration 11 (approx. per word bound = -6.419, relative change = 7.557e-05)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (3 seconds).
## Completing Iteration 12 (approx. per word bound = -6.419, relative change = 5.888e-05)
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (4 seconds).
## Completing Iteration 13 (approx. per word bound = -6.418, relative change = 4.245e-05)
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (3 seconds).
## Completing Iteration 14 (approx. per word bound = -6.418, relative change = 3.428e-05)
## ....................................................................................................
## Completed E-Step (9 seconds).
## Completed M-Step (3 seconds).
## Completing Iteration 15 (approx. per word bound = -6.418, relative change = 2.772e-05)
## Topic 1: urban, parking, areas, congestion, car
## Topic 2: mixtures, test, binder, mix, mixture
## Topic 3: design, pavement, models, performance, data
## Topic 4: crash, crashes, risk, factors, weather
## Topic 5: bridge, bridges, design, steel, structures
## Topic 6: quality, air, value, noise, airport
## Topic 7: data, method, information, travel, used
## Topic 8: lane, control, flow, capacity, lanes
## Topic 9: service, bus, rail, passenger, system
## Topic 10: concrete, materials, aggregate, strength, material
## Topic 11: pedestrian, pedestrians, street, crossing, walking
## Topic 12: demand, models, truck, freight, spatial
## Topic 13: vehicle, emissions, vehicles, fuel, cost
## Topic 14: network, problem, proposed, route, algorithm
## Topic 15: travel, choice, survey, behavior, mode
## Topic 16: pavement, surface, maintenance, pavements, sections
## Topic 17: speed, drivers, driving, driver, vehicle
## Topic 18: project, management, agencies, system, projects
## Topic 19: test, results, tests, parameters, soil
## Topic 20: fatigue, cracking, damage, loading, failure
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (3 seconds).
## Completing Iteration 16 (approx. per word bound = -6.418, relative change = 2.198e-05)
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (3 seconds).
## Completing Iteration 17 (approx. per word bound = -6.418, relative change = 1.230e-05)
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step (3 seconds).
## Model Converged
# Draw the summary plot for the current model through the plot() generic.
# Calling the S3 method plot.STM() by name bypasses dispatch and fails when the
# package only registers the method without exporting it.
plot(poliblogPrevFit, type = "summary")

# Topic quality for the current model, evaluated on the prepared documents.
topicQuality(model = poliblogPrevFit, documents = docs)
## [1] -101.92734 -54.30599 -95.54238 -94.09291 -90.69681 -128.42601
## [7] -82.60370 -84.98504 -105.41771 -87.19903 -105.52033 -105.89442
## [13] -93.13068 -85.99347 -96.53880 -105.07810 -95.03942 -85.47494
## [19] -101.76570 -89.95134
## [1] 9.917536 9.874393 9.707715 9.737327 9.688569 9.916309 9.562054
## [8] 9.918612 9.934172 9.962553 9.864089 9.806485 9.924639 9.891028
## [15] 9.739729 9.932881 9.717994 9.368588 9.664888 9.902868

# Compute topic correlations for the current model and plot them. The plot goes
# through the plot() generic rather than calling plot.topicCorr() by name, which
# fails when the package only registers the method without exporting it.
mod.out.corr <- topicCorr(poliblogPrevFit)
plot(mod.out.corr, vertex.color = "white", vertex.label.cex = 0.95,
     vertex.label.color = "black")
