Load the relevant libraries.
# rm(list = ls())
library("tidyverse") # data manipulation
library("magrittr") # data manipulation (pipeing data)
library("stringr") # string manipulation
library("lubridate") # date manipulation
library("tidytext") # text manipulation
library("topicmodels") # topic modeling
library("ggplot2") # viz
library("doParallel") # parallel processing
library("ldatuning") # estimating the proper number of topics
# library("scales")
# library("cowplot") # grid plotting
Session Info.
sessionInfo()
R version 3.3.3 (2017-03-06)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Sierra 10.12.6
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] ldatuning_0.2.0 doParallel_1.0.11 iterators_1.0.8 foreach_1.4.3
[5] topicmodels_0.2-7 tidytext_0.1.4 lubridate_1.7.1 stringr_1.2.0
[9] magrittr_1.5 dplyr_0.7.4 purrr_0.2.4 readr_1.1.1
[13] tidyr_0.7.2 tibble_1.3.4 ggplot2_2.2.1 tidyverse_1.1.1
loaded via a namespace (and not attached):
[1] modeltools_0.2-21 slam_0.1-40 NLP_0.1-11 reshape2_1.4.2
[5] haven_1.1.0 lattice_0.20-35 colorspace_1.3-2 SnowballC_0.5.1
[9] stats4_3.3.3 yaml_2.1.14 rlang_0.1.4 foreign_0.8-69
[13] glue_1.2.0 modelr_0.1.1 readxl_1.0.0 bindrcpp_0.2
[17] bindr_0.1 plyr_1.8.4 munsell_0.4.3 gtable_0.2.0
[21] cellranger_1.1.0 rvest_0.3.2 codetools_0.2-15 psych_1.7.8
[25] knitr_1.17 forcats_0.2.0 tm_0.7-1 broom_0.4.2
[29] tokenizers_0.1.4 Rcpp_0.12.13 scales_0.5.0 jsonlite_1.5
[33] mnormt_1.5-5 hms_0.3 stringi_1.1.5 grid_3.3.3
[37] tools_3.3.3 lazyeval_0.2.1 janeaustenr_0.1.5 pkgconfig_2.0.1
[41] Matrix_1.2-11 xml2_1.1.1 assertthat_0.2.0 httr_1.3.1
[45] R6_2.2.2 nlme_3.1-131
Setup the root directory.
Setting `wd` as the working directory.
wd <- getwd()
wd
[1] "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon"
Get the raw data. Because of trouble maintaining a connection to Dropbox via R, I first downloaded the raw data from [https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0](https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0) and saved the file locally.
Raw311Data <- read_csv(paste0(wd,
"/Data_Raw/dc_311-2017-01-16.csv"
# "/Data_Raw/dc_311-2017-10-07.csv"
),
progress = FALSE
)
Parsed with column specification:
cols(
.default = col_character(),
SERVICEORDERDATE = col_datetime(format = ""),
SERVICECALLCOUNT = col_integer(),
INSPECTIONDATE = col_datetime(format = ""),
RESOLUTIONDATE = col_datetime(format = ""),
SERVICEDUEDATE = col_datetime(format = ""),
ADDDATE = col_datetime(format = ""),
LASTMODIFIEDDATE = col_datetime(format = ""),
LATITUDE = col_double(),
LONGITUDE = col_double(),
ZIPCODE = col_integer(),
MARADDRESSREPOSITORYID = col_integer(),
DCSTATADDRESSKEY = col_integer(),
DCSTATLOCATIONKEY = col_integer(),
WARD = col_integer(),
PSA = col_integer()
)
See spec(...) for full column specifications.
# saving is done to avoid having to download all the data again
saveRDS(Raw311Data,
paste0(wd,
"/Data_Processed/",
"Raw311Data.Rds"
)
)
str(Raw311Data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 5093887 obs. of 36 variables:
$ SERVICEREQUESTID : chr "10-00020033" "10-00020034" "10-00020035" "10-00020036" ...
$ SERVICEPRIORITY : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ SERVICECODE : chr "S0276" "S0276" "S0276" "S0276" ...
$ SERVICECODEDESCRIPTION : chr "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" ...
$ SERVICETYPECODE : chr "TRAOP001" "TRAOP001" "TRAOP001" "TRAOP001" ...
$ SERVICETYPECODEDESCRIPTION: chr "TOA" "TOA" "TOA" "TOA" ...
$ SERVICEORDERDATE : POSIXct, format: "2010-01-21 14:58:07" "2010-01-21 14:58:08" ...
$ SERVICEORDERSTATUS : chr "OVERDUE CLOSED" "CLOSED" "CLOSED" "CLOSED" ...
$ SERVICECALLCOUNT : int 1 1 1 1 1 1 1 1 1 1 ...
$ AGENCYABBREVIATION : chr "DDOT" "DDOT" "DDOT" "DDOT" ...
$ INSPECTIONFLAG : chr "N" "N" "N" "N" ...
$ INSPECTIONDATE : POSIXct, format: NA NA ...
$ RESOLUTION : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ RESOLUTIONDATE : POSIXct, format: "2010-01-26 19:31:32" "2010-01-21 15:43:45" ...
$ SERVICEDUEDATE : POSIXct, format: "2010-01-26 14:58:07" "2010-01-26 14:58:08" ...
$ SERVICENOTES : chr NA NA NA NA ...
$ PARENTSERVICEREQUESTID : chr NA NA NA NA ...
$ ADDDATE : POSIXct, format: "2010-01-21 14:58:07" "2010-01-21 14:58:08" ...
$ LASTMODIFIEDDATE : POSIXct, format: "2011-09-01 11:18:17" "2012-08-06 21:19:19" ...
$ SITEADDRESS : chr "17TH STREET NW & RHODE ISLAND AVENUE NW" "20TH STREET NW & K STREET NW" "800 I STREET NW" "901 15TH STREET NW" ...
$ LATITUDE : num 38.9 38.9 38.9 38.9 38.9 ...
$ LONGITUDE : num -77 -77 -77 -77 -77 ...
$ ZIPCODE : int NA NA 20001 20005 NA NA NA 20002 20018 NA ...
$ MARADDRESSREPOSITORYID : int 0 0 285559 218779 0 -100 -100 76069 50068 -100 ...
$ DCSTATADDRESSKEY : int 207502 207430 124416 62196 455845 -100 -100 49582 20159 -100 ...
$ DCSTATLOCATIONKEY : int 207502 207430 124416 62196 391873 348291 348291 49582 20159 348291 ...
$ WARD : int 2 2 2 2 2 NA NA 6 5 NA ...
$ ANC : chr "2B" "2A" "2C" "2F" ...
$ SMD : chr "2B05" "2A06" "2C03" "2F03" ...
$ DISTRICT : chr "SECOND" "SECOND" "FIRST" "SECOND" ...
$ PSA : int 208 207 102 207 208 NA NA 108 503 NA ...
$ NEIGHBORHOODCLUSTER : chr "6" "6" "8" "8" ...
$ HOTSPOT2006NAME : chr "NONE" "NONE" "NONE" "NONE" ...
$ HOTSPOT2005NAME : chr "NONE" "NONE" "NONE" "NONE" ...
$ HOTSPOT2004NAME : chr "NONE" "NONE" "NONE" "NONE" ...
$ SERVICESOURCECODE : chr "PHONE" "PHONE" "PHONE" "PHONE" ...
- attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3874 obs. of 5 variables:
..$ row : int 3938387 3938538 3939027 3939187 3939240 3939316 3939514 3939869 3940110 3940469 ...
..$ col : chr "WARD" "WARD" "WARD" "WARD" ...
..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
..$ actual : chr "Null" "Null" "Null" "Null" ...
..$ file : chr "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" ...
- attr(*, "spec")=List of 2
..$ cols :List of 36
.. ..$ SERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEPRIORITY : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODEDESCRIPTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODEDESCRIPTION: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEORDERDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEORDERSTATUS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECALLCOUNT : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ AGENCYABBREVIATION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONFLAG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ RESOLUTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ RESOLUTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEDUEDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICENOTES : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PARENTSERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ADDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ LASTMODIFIEDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SITEADDRESS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LATITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ LONGITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ ZIPCODE : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ MARADDRESSREPOSITORYID : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATADDRESSKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATLOCATIONKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ WARD : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ ANC : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SMD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DISTRICT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PSA : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ NEIGHBORHOODCLUSTER : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2006NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2005NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2004NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICESOURCECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
tail(Raw311Data, 500)
Un-comment the chunk below to load the saved raw data (to avoid having to download the raw data again).
# Raw311Data <- readRDS(paste0(wd,
# "/Data_Processed/",
# "Raw311Data.Rds"
# )
# )
#
# str(Raw311Data)
# tail(Raw311Data, 500)
# View(tail(Raw311Data, 1000))
Selecting those variables that may be useful to test breakdowns of topic modeling. For example, running a topic model separately for the different levels of `servicecode`.
SelectedVars <- select(Raw311Data,
SERVICEREQUESTID,
SERVICEPRIORITY,
SERVICECODE,
SERVICECODEDESCRIPTION,
SERVICETYPECODE,
SERVICETYPECODEDESCRIPTION,
SERVICEORDERDATE,
SERVICENOTES
)
names(SelectedVars) <- tolower(names(SelectedVars))
rm(Raw311Data)
str(SelectedVars)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 5093887 obs. of 8 variables:
$ servicerequestid : chr "10-00020033" "10-00020034" "10-00020035" "10-00020036" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0276" "S0276" "S0276" "S0276" ...
$ servicecodedescription : chr "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" ...
$ servicetypecode : chr "TRAOP001" "TRAOP001" "TRAOP001" "TRAOP001" ...
$ servicetypecodedescription: chr "TOA" "TOA" "TOA" "TOA" ...
$ serviceorderdate : POSIXct, format: "2010-01-21 14:58:07" "2010-01-21 14:58:08" ...
$ servicenotes : chr NA NA NA NA ...
- attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3874 obs. of 5 variables:
..$ row : int 3938387 3938538 3939027 3939187 3939240 3939316 3939514 3939869 3940110 3940469 ...
..$ col : chr "WARD" "WARD" "WARD" "WARD" ...
..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
..$ actual : chr "Null" "Null" "Null" "Null" ...
..$ file : chr "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" ...
- attr(*, "spec")=List of 2
..$ cols :List of 36
.. ..$ SERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEPRIORITY : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODEDESCRIPTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODEDESCRIPTION: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEORDERDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEORDERSTATUS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECALLCOUNT : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ AGENCYABBREVIATION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONFLAG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ RESOLUTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ RESOLUTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEDUEDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICENOTES : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PARENTSERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ADDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ LASTMODIFIEDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SITEADDRESS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LATITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ LONGITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ ZIPCODE : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ MARADDRESSREPOSITORYID : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATADDRESSKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATLOCATIONKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ WARD : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ ANC : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SMD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DISTRICT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PSA : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ NEIGHBORHOODCLUSTER : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2006NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2005NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2004NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICESOURCECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
Quick visual inspection of filtering the data to only service calls with notes (i.e., removing NA values), and only those that are rat-related (`servicecode == "S0311`).
Removing NA values takes us from 5,093,887 rows to 3,018,411 rows.
Looking at only rat-related service calls takes us from 3,018,411 rows to 15,581 rows.
NoNAServiceNotes <- filter(SelectedVars,
!is.na(servicenotes)
)
# message("SelectedVars")
nrow(SelectedVars)
[1] 5093887
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
[1] 3018411
rm(SelectedVars)
str(NoNAServiceNotes)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3018411 obs. of 8 variables:
$ servicerequestid : chr "10-00020037" "10-00020041" "10-00020044" "10-00020050" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "DMV19" "S0003" "S0276" "S0276" ...
$ servicecodedescription : chr "DMV - Vehicle Registration Issues" "Abandoned Vehicle - On Private Property" "Parking Meter Repair" "Parking Meter Repair" ...
$ servicetypecode : chr "DRIVEHSE" "PRSVAVOP" "TRAOP001" "TRAOP001" ...
$ servicetypecodedescription: chr "Driver Vehicle Services" "Abandoned Vehicle Operations - Private" "TOA" "TOA" ...
$ serviceorderdate : POSIXct, format: "2010-01-21 14:58:22" "2010-01-21 14:58:56" ...
$ servicenotes : chr "customer was called informed a trace will be placed on the request" "VEHICLE IN QUESTION WAS GONE ON ARRIVAL PER INVESTIGATOR AUSTIN......TJ" "Comments update: 10-00020044-DEAD" "Comments update: 10-00020050-METER DISPLAYED OUT OF ORDER." ...
- attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3874 obs. of 5 variables:
..$ row : int 3938387 3938538 3939027 3939187 3939240 3939316 3939514 3939869 3940110 3940469 ...
..$ col : chr "WARD" "WARD" "WARD" "WARD" ...
..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
..$ actual : chr "Null" "Null" "Null" "Null" ...
..$ file : chr "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" ...
- attr(*, "spec")=List of 2
..$ cols :List of 36
.. ..$ SERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEPRIORITY : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODEDESCRIPTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODEDESCRIPTION: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEORDERDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEORDERSTATUS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECALLCOUNT : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ AGENCYABBREVIATION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONFLAG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ RESOLUTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ RESOLUTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEDUEDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICENOTES : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PARENTSERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ADDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ LASTMODIFIEDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SITEADDRESS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LATITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ LONGITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ ZIPCODE : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ MARADDRESSREPOSITORYID : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATADDRESSKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATLOCATIONKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ WARD : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ ANC : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SMD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DISTRICT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PSA : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ NEIGHBORHOODCLUSTER : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2006NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2005NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2004NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICESOURCECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
View(head(NoNAServiceNotes, 1000))
RatCalls <- filter(NoNAServiceNotes,
servicecode == "S0311"
)
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
[1] 3018411
# message("RatCalls")
nrow(RatCalls)
[1] 15581
rm(NoNAServiceNotes)
View(RatCalls)
Add in time-related variables.
RatCalls_Time <- RatCalls %>%
mutate(serviceorder_date = as_date(serviceorderdate),
serviceorder_yr = year(serviceorderdate),
serviceorder_yr_posix = floor_date(serviceorderdate, "year"),
serviceorder_mth = month(serviceorderdate, label = TRUE),
serviceorder_yrmth = as.character(serviceorder_date) %>%
substr(1, 7),
serviceorder_yrmth_posix = floor_date(serviceorderdate, "month"),
serviceorder_day = day(serviceorderdate),
serviceorder_wkday = wday(serviceorderdate, label = TRUE)
)
rm(RatCalls)
str(RatCalls_Time)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 15581 obs. of 16 variables:
$ servicerequestid : chr "10-00020177" "10-00020346" "10-00020514" "10-00020754" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "2010-01-21 15:46:23" "2010-01-21 16:32:51" ...
$ servicenotes : chr "On 1/25/10 L Rogers baited 3 rat burrows in the rear yd." "On 1/26/10 G Cornes found no rat burrows in the area." "On 1/28/10 L Rogers baited 4 rat burrows in the rear yd." "On 1/26/10 V Purvis baited 9 rat burrows in the park." ...
$ serviceorder_date : Date, format: "2010-01-21" "2010-01-21" ...
$ serviceorder_yr : num 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
$ serviceorder_yr_posix : POSIXct, format: "2010-01-01" "2010-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ serviceorder_yrmth : chr "2010-01" "2010-01" "2010-01" "2010-01" ...
$ serviceorder_yrmth_posix : POSIXct, format: "2010-01-01" "2010-01-01" ...
$ serviceorder_day : int 21 21 21 21 21 22 22 22 22 22 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 5 5 5 5 5 6 6 6 6 6 ...
tail(RatCalls_Time, 500)
View(tail(RatCalls_Time, 1000))
Next we need to clean up the text of the `servicenotes` variable - this will be done in multiple steps.
As the first step, we'll remove common "stopwords" (e.g., is, the, and, etc.) as they won't be very useful in finding topics in the `servicenotes` text. Although it's a stopwords, we specifically do not remove the word "no" as it is often used to distinguish between "rats found" and "no rats found".
NoStopWords_Unnest <-
RatCalls_Time %>%
select(servicerequestid,
servicenotes
) %>%
unnest_tokens(word,
servicenotes
) %>%
anti_join(filter(stop_words,
word != "no" # we don't remove the word "no" as it is often used to distinguish between "rats found" and "no rats found"
),
by = "word"
)
Servicenotes_NoStopWrds <- NoStopWords_Unnest %>%
nest(word) %>%
mutate(servicenotes_nostop = map(data,
unlist
),
servicenotes_nostop = map_chr(servicenotes_nostop,
paste,
collapse = " "
)
) %>%
select(-data)
Remove_StopWrds <- RatCalls_Time %>%
left_join(Servicenotes_NoStopWrds,
by = "servicerequestid"
)
dim(RatCalls_Time)
[1] 15581 16
dim(Remove_StopWrds)
[1] 15581 17
rm(NoStopWords_Unnest, Servicenotes_NoStopWrds)
head(Remove_StopWrds, 100)
View(head(Remove_StopWrds, 100))
# str(Remove_StopWrds)
# View(Remove_StopWrds %>%
# filter(str_detect(servicenotes_nostop,
# "washington dc"
# )
# ) %>%
# select(servicenotes,
# servicenotes_nostop
# )
# )
Then, we'll remove any numeric characters from 'servicenotes' to avoid distinctions not needed at this level (e.g., "baited 3 rat borrows" vs. "baited 6 rat burrows"). We'll also remove punctuation.
ServiceNotesCleaned <- Remove_StopWrds %>%
mutate(servicenotes_nonums_nopunc = str_replace_all(servicenotes_nostop,
"[[:digit:]]",
""
) %>%
str_replace_all("[[:punct:]]",
""
)
) %>%
select(-servicenotes_nostop)
dim(RatCalls_Time)
[1] 15581 16
dim(Remove_StopWrds)
[1] 15581 17
dim(ServiceNotesCleaned)
[1] 15581 17
# View(select(ServiceNotesCleaned,
# servicerequestid,
# servicenotes,
# servicenotes_nonums_nopunc
# ) %>%
# filter(servicerequestid %in% nomatch$servicerequestid)
# )
rm(RatCalls_Time, Remove_StopWrds)
head(ServiceNotesCleaned, 100)
View(head(ServiceNotesCleaned, 100))
Now, we can inspect the frequencies of rat-related service requests.
summary(ServiceNotesCleaned)
servicerequestid servicepriority servicecode servicecodedescription
Length:15581 Length:15581 Length:15581 Length:15581
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
servicetypecode servicetypecodedescription serviceorderdate
Length:15581 Length:15581 Min. :1999-04-27 12:59:00
Class :character Class :character 1st Qu.:2009-07-22 12:00:00
Mode :character Mode :character Median :2011-04-07 15:24:33
Mean :2010-01-30 02:23:50
3rd Qu.:2012-10-05 12:31:54
Max. :2016-12-14 10:45:49
servicenotes serviceorder_date serviceorder_yr serviceorder_yr_posix
Length:15581 Min. :1999-04-27 Min. :1999 Min. :1999-01-01 00:00:00
Class :character 1st Qu.:2009-07-22 1st Qu.:2009 1st Qu.:2009-01-01 00:00:00
Mode :character Median :2011-04-07 Median :2011 Median :2011-01-01 00:00:00
Mean :2010-01-29 Mean :2010 Mean :2009-07-25 06:25:01
3rd Qu.:2012-10-05 3rd Qu.:2012 3rd Qu.:2012-01-01 00:00:00
Max. :2016-12-14 Max. :2016 Max. :2016-01-01 00:00:00
serviceorder_mth serviceorder_yrmth serviceorder_yrmth_posix serviceorder_day
Aug :1923 Length:15581 Min. :1999-04-01 00:00:00 Min. : 1.00
Jun :1697 Class :character 1st Qu.:2009-07-01 00:00:00 1st Qu.: 8.00
Sep :1666 Mode :character Median :2011-04-01 00:00:00 Median :16.00
May :1549 Mean :2010-01-14 17:37:56 Mean :15.71
Oct :1523 3rd Qu.:2012-10-01 00:00:00 3rd Qu.:23.00
Jul :1507 Max. :2016-12-01 00:00:00 Max. :31.00
(Other):5716
serviceorder_wkday servicenotes_nonums_nopunc
Sun: 489 Length:15581
Mon:3140 Class :character
Tue:3132 Mode :character
Wed:3025
Thu:2654
Fri:2450
Sat: 691
# summary(RatCalls_Time$serviceorderdate)
# library("psych")
# describe(RatCalls_Time$serviceorderdate)
ggplot_theme_basic <-
theme(panel.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
# axis.text.x = element_blank(),
axis.ticks = element_blank(),
axis.line = element_line(size = 1, colour = "black")
)
# ggplot(data = RatCalls_Time,
# aes(x = serviceorder_date)
# ) +
# geom_histogram() +
# ggplot_theme_basic
yr_counts <- ServiceNotesCleaned %>%
group_by(serviceorder_yr_posix) %>%
summarise(Cnt = n()
) %>%
arrange(serviceorder_yr_posix)
ggplot(data = yr_counts,
aes(x = serviceorder_yr_posix,
y = Cnt
)
) +
geom_col(fill = "light blue") + #, alpha = 0.1) +
geom_text(aes(label = Cnt),
nudge_y = 50,
size = 3
) +
labs(title = "Counts of non-NA ServiceNotes",
# subtitle = "by year",
x = "Year",
y = "Count"
) +
ggplot_theme_basic +
theme(axis.text.x = element_text(angle = 90)
) +
scale_x_datetime(date_breaks = "1 year")

yrmth_counts <- ServiceNotesCleaned %>%
group_by(serviceorder_yrmth_posix) %>%
summarise(Cnt = n()
) %>%
arrange(serviceorder_yrmth_posix)
ggplot(data = yrmth_counts,
aes(x = serviceorder_yrmth_posix,
y = Cnt
)
) +
geom_col(fill = "light blue") + #, alpha = 0.1) +
# geom_text(aes(label = Cnt),
# nudge_y = 50,
# size = 3
# ) +
labs(title = "Counts of non-NA ServiceNotes",
x = "Year-Month",
y = "Count"
) +
ggplot_theme_basic +
theme(axis.text.x = element_text(angle = 90)
) +
coord_cartesian(xlim = c(as.POSIXct("1998-12-01"),
as.POSIXct("2017-12-01")
),
expand = TRUE
) +
scale_x_datetime(date_breaks = "6 months")

Based on the frequencies of when we actually have 'servicenotes' data, let's try limiting the dataset to service calls from 2010 or later. This reduces the dataset further, from 15,581 rows to 11,037 rows.
rm(yr_counts, yrmth_counts)
ServiceNotesCleanedAfter2010 <- ServiceNotesCleaned %>%
filter(serviceorderdate >= as_date("2010-01-01")
)
nrow(ServiceNotesCleaned)
[1] 15581
nrow(ServiceNotesCleanedAfter2010)
[1] 11037
summary(ServiceNotesCleanedAfter2010)
servicerequestid servicepriority servicecode servicecodedescription
Length:11037 Length:11037 Length:11037 Length:11037
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
servicetypecode servicetypecodedescription serviceorderdate
Length:11037 Length:11037 Min. :2010-01-02 12:51:20
Class :character Class :character 1st Qu.:2010-12-08 16:46:26
Mode :character Mode :character Median :2012-02-06 19:55:15
Mean :2012-05-08 12:20:41
3rd Qu.:2013-05-07 18:58:17
Max. :2016-12-14 10:45:49
servicenotes serviceorder_date serviceorder_yr serviceorder_yr_posix
Length:11037 Min. :2010-01-02 Min. :2010 Min. :2010-01-01 00:00:00
Class :character 1st Qu.:2010-12-08 1st Qu.:2010 1st Qu.:2010-01-01 00:00:00
Mode :character Median :2012-02-06 Median :2012 Median :2012-01-01 00:00:00
Mean :2012-05-07 Mean :2012 Mean :2011-11-01 14:15:37
3rd Qu.:2013-05-07 3rd Qu.:2013 3rd Qu.:2013-01-01 00:00:00
Max. :2016-12-14 Max. :2016 Max. :2016-01-01 00:00:00
serviceorder_mth serviceorder_yrmth serviceorder_yrmth_posix serviceorder_day
Aug :1429 Length:11037 Min. :2010-01-01 00:00:00 Min. : 1.00
Jun :1268 Class :character 1st Qu.:2010-12-01 00:00:00 1st Qu.: 8.00
Sep :1260 Mode :character Median :2012-02-01 00:00:00 Median :16.00
Jul :1075 Mean :2012-04-23 02:01:28 Mean :15.78
Oct :1056 3rd Qu.:2013-05-01 00:00:00 3rd Qu.:23.00
May :1030 Max. :2016-12-01 00:00:00 Max. :31.00
(Other):3919
serviceorder_wkday servicenotes_nonums_nopunc
Sun: 427 Length:11037
Mon:2211 Class :character
Tue:2166 Mode :character
Wed:2085
Thu:1846
Fri:1736
Sat: 566
Then, transform the `servicenotes` field into one row per n-gram. Because we don't know what level of 'n' to use, we'll cycle through the possibilities from n = 1 to n = 5.
ngram_list <- 1:5
Rat_Ngram_list <- list()
Rat_Ngram_list <- lapply(ngram_list,
function(i) {
# x <- paste0("0", i, "_gram")
ServiceNotesCleaned %>%
unnest_tokens(n_gram,
servicenotes_nonums_nopunc,
token = "ngrams",
n = i
)
}
)
# rm(ngram_list)
Rat_Ngram_list
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
# str(Rat_Ngram_list[[1]])
Counting the n-grams in each `servicerequestid`.
word_counts_list <- list()
word_counts_list <- lapply(ngram_list,
function(i) {
Rat_Ngram_list[[i]] %>%
count(servicerequestid,
n_gram,
sort = TRUE
)
}
)
word_counts_list
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
NA
Transforming the dataframe into a document term matrix - i.e., documents (`servicerequestid`s) are the rows and n-grams are the columns.
dtm_list <- list()
dtm_list <- lapply(ngram_list,
function(i) {
word_counts_list[[i]] %>%
cast_dtm(document = servicerequestid,
term = n_gram,
value = n,
# weighting = tm::weightTfIdf,
# using term frequency inverse document frequency (TfIdf) weighting is another, possibly more accurate measure, but topicmodels::LDA (used below) only accepts document term matrices with term-frequency weighting
weighting = tm::weightTf
)
}
)
dtm_list
[[1]]
<<DocumentTermMatrix (documents: 15580, terms: 3177)>>
Non-/sparse entries: 116407/49381253
Sparsity : 100%
Maximal term length: 20
Weighting : term frequency (tf)
[[2]]
<<DocumentTermMatrix (documents: 15268, terms: 12341)>>
Non-/sparse entries: 102353/188320035
Sparsity : 100%
Maximal term length: 32
Weighting : term frequency (tf)
[[3]]
<<DocumentTermMatrix (documents: 14543, terms: 18256)>>
Non-/sparse entries: 87424/265409584
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
[[4]]
<<DocumentTermMatrix (documents: 13157, terms: 20491)>>
Non-/sparse entries: 73020/269527067
Sparsity : 100%
Maximal term length: 47
Weighting : term frequency (tf)
[[5]]
<<DocumentTermMatrix (documents: 11846, terms: 21176)>>
Non-/sparse entries: 59946/250790950
Sparsity : 100%
Maximal term length: 58
Weighting : term frequency (tf)
To determine the "proper" number of topics, here I try using the `ldatuning::FindTopicsNumber` function. The code chunk is based on the vignette here: [https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html](https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html).
This analyses was done separately for each n-gram level (n = 1:5), and the overall results were inconclusive - the "proper" number of topics fluctuated between the highest level tried (12 topics) and one of the lowest levels tried (2, 3, or 4 topics).
detectCores(logical = TRUE) - 1
[1] 3
tunes_list <- dtm_list %>%
map(~ FindTopicsNumber(.x,
topics = c(2:12),
metrics = c("Griffiths2004",
"CaoJuan2009",
"Arun2010",
"Deveaud2014"
),
method = "Gibbs",
control = list(seed = 123456789),
mc.cores = 3L,
verbose = TRUE
)
)
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
# str(tunes_list[[5]])
topic_plots <-
tunes_list %>%
map(~ FindTopicsNumber_plot(.x)
)






topic_plots
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
As an alternative, here I try to determine the "proper" number of topics using `topicmodels::perplexity`. Perplexity measures how well a probability model predicts a sample, and I use it here via 10-folder cross validation. For computational purposes, I'm only trying this for the 5-gram model.
This is based on the method used here:
[http://ellisp.github.io/blog/2017/01/05/topic-model-cv](http://ellisp.github.io/blog/2017/01/05/topic-model-cv)
As with the `ldatuning::FindTopicsNumber` function used previously, `topicmodels::perplexity` is also inconclusive as there is no clear "elbow" in the perplexity plot.
full_data <- dtm_list[[5]]
n <- nrow(full_data)
seed <- 123456789
topic_guess <- 12
folds <- 10
burnin <- 1000
iter <-1000
keep <-50
#----------------10-fold cross-validation, different numbers of topics----------------
cluster <- makeCluster(detectCores(logical = TRUE) - 1
) # leave one CPU spare...
registerDoParallel(cluster)
clusterEvalQ(cluster, {
library(topicmodels)
})
[[1]]
[1] "topicmodels" "stats" "graphics" "grDevices" "utils" "datasets"
[7] "methods" "base"
[[2]]
[1] "topicmodels" "stats" "graphics" "grDevices" "utils" "datasets"
[7] "methods" "base"
[[3]]
[1] "topicmodels" "stats" "graphics" "grDevices" "utils" "datasets"
[7] "methods" "base"
folds <- folds
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2:topic_guess) # candidates for how many topics
clusterExport(cluster,
c("full_data", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k")
)
# we parallelize by the different number of topics. A processor is allocated a value
# of k, and does the cross-validation serially. This is because it is assumed there
# are more candidate values of k than there are cross-validation folds, hence it
# will be more efficient to parallelise
system.time({
results <- foreach(j = 1:length(candidate_k),
.combine = rbind
) %dopar%{
k <- candidate_k[j]
results_1k <- matrix(0,
nrow = folds,
ncol = 2
)
colnames(results_1k) <- c("k", "perplexity")
for(i in 1:folds){
train_set <- full_data[splitfolds != i , ]
valid_set <- full_data[splitfolds == i, ]
fitted <- LDA(train_set,
k = k,
method = "Gibbs",
control = list(seed = seed,
verbose = 1,
burnin = burnin,
iter = iter,
keep = keep
)
)
results_1k[i, ] <- c(k, perplexity(fitted, newdata = valid_set)
)
}
return(results_1k)
}
})
user system elapsed
1.684 2.252 1080.057
stopCluster(cluster)
results_df <- as.data.frame(results)
# ggplot(data = results_df,
# aes(x = k,
# y = perplexity)
# ) +
# geom_point() +
# geom_smooth(se = FALSE) +
# coord_cartesian(xlim = c(0, 12)
# ) +
# scale_x_continuous(breaks = seq(0, 12, 2)
# ) +
# ggplot_theme_basic +
# ggtitle(label = "10-fold cross-validation of topic modelling",
# subtitle = "(i.e., 10 different models fit for each potential number of topics)"
# ) +
# labs(x = "Potential Number of Topics",
# y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
# )
ggplot(data = results_df,
aes(x = k,
y = perplexity)
) +
geom_point() +
geom_smooth(se = TRUE) +
coord_cartesian(xlim = c(0, 12),
ylim = c(0, 6000)
) +
scale_x_continuous(breaks = seq(0, 12, 2)
) +
# ggplot_theme_basic +
ggtitle(label = "10-fold cross-validation of topic modelling",
subtitle = "(i.e., 10 different models fit for each potential number of topics)"
) +
labs(x = "Potential Number of Topics",
y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
)

Remove the no-longer-needed files.
rm(cluster, folds, full_data, results, topic_plots, tunes_list, burnin, candidate_k, folds, iter, k, keep, n, seed, splitfolds, topic_guess)
object 'folds' not foundobject 'k' not found
Here I use Latent Dirichlet allocation for topic modeling. As determining the "proper" number of topics was inconclusive, I'm cycling through every combination of ngrams (1:5) and topics (2:12).
topic_guess <- 2:12
lda_list <- list()
cluster <- makeCluster(detectCores(logical = TRUE) - 1
) # leave one CPU spare...
registerDoParallel(cluster)
for(i in ngram_list) {
for(j in topic_guess) {
x <- LDA(dtm_list[[i]],
k = j,
control = list(seed = 123456789,
verbose = 1
)
)
ifelse((i == min(ngram_list) &
j == min(topic_guess)
),
countx <- 1,
countx <- countx + 1
)
lda_list[[countx]] <- list(ngram = i,
topic = j,
lda_model = x
)
}
}
stopCluster(cluster)
rm(ngram_list, topic_guess, i, j, x, countx, cluster)
lda_list
Creating a dataframe with `beta` - the per-topic-per-ngram probability (i.e., the probability that each ngram is in each topic).
PerTopicPerNgram <- list()
for(i in 1:length(lda_list)
) {
x <- tidy(lda_list[[i]]$lda_model,
matrix = "beta"
) %>%
arrange(term,
desc(beta)
)
PerTopicPerNgram[[i]] <- list(ngram = lda_list[[i]]$ngram,
topic = lda_list[[i]]$topic,
PerTopicPerNgram = x
)
}
rm(i, x)
str(PerTopicPerNgram[[55]]$PerTopicPerNgram)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 254112 obs. of 3 variables:
$ topic: int 5 6 8 2 9 3 1 4 11 7 ...
$ term : chr "a covnes parker bated rat" "a covnes parker bated rat" "a covnes parker bated rat" "a covnes parker bated rat" ...
$ beta : num 2.33e-04 3.29e-13 9.58e-153 8.26e-153 5.04e-153 ...
# rm(serv_req_id_lda)
head(PerTopicPerNgram[[55]]$PerTopicPerNgram, 500)
Creating a dataframe with just the top ten terms (ranked by beta) in each topic.
top_terms <- list()
for(i in 1:length(PerTopicPerNgram)
) {
x <- PerTopicPerNgram[[i]]$PerTopicPerNgram %>%
group_by(topic) %>%
top_n(10,
beta
) %>%
ungroup() %>%
arrange(topic,
-beta
)
top_terms[[i]] <- list(ngram = PerTopicPerNgram[[i]]$ngram,
topic = PerTopicPerNgram[[i]]$topic,
top_terms = x
)
}
rm(i, x)
top_terms[[55]]$top_terms
View(top_terms[[55]]$top_terms)
Now we can plot the top 10 n-grams in each topic to visually inspect if the topic classifications "make sense" based on the n-gram text.
Here, we're just creating and saving the plots themselves.
TopNgrams_ByTopic_BarGraphs <-
top_terms %>%
# to_graph %>%
map(function(x)
x$top_terms %>%
mutate(term = reorder(term,
beta
),
topic = paste0("Topic ",
str_pad(as.character(topic),
width = 2,
side = "left",
pad = "0"
)
)
) %>%
ggplot(aes(x = term,
y = beta,
fill = factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic,
scales = "free",
ncol = 2
) +
ggplot_theme_basic +
theme(plot.title = element_text(size = 11),
axis.title = element_text(size = 10),
axis.text = element_text(size = 9)
) +
labs(title = "Most Common Terms Per Topic",
subtitle = paste0("(",
str_pad(x$ngram,
width = 2,
side = "left",
pad = "0"
),
"gram",
str_pad(x$topic,
width = 2,
side = "left",
pad = "0"
),
"topic)"
),
x = paste0(str_pad(x$ngram,
width = 2,
side = "left",
pad = "0"
),
"gram"
),
y = paste0("probability of the ",
str_pad(x$ngram,
width = 2,
side = "left",
pad = "0"
),
"gram in the topic"
)
) +
coord_flip()
)
# TopNgrams_ByTopic_BarGraphs
TopNgrams_ByTopic_BarGraphs[[24]] # ngram = 3 & topics = 3

TopNgrams_ByTopic_BarGraphs[[25]] # ngram = 3 & topics = 4

TopNgrams_ByTopic_BarGraphs[[35]] # ngram = 4 & topics = 3

TopNgrams_ByTopic_BarGraphs[[36]] # ngram = 4 & topics = 4

TopNgrams_ByTopic_BarGraphs[[46]] # ngram = 5 & topics = 3

TopNgrams_ByTopic_BarGraphs[[47]] # ngram = 5 & topics = 4
TopNgrams_ByTopic_BarGraphs %>%
map(function(x)
ggsave(paste0(wd,
"/Viz/",
# "New_",
substr(x$labels$subtitle,
2,
(nchar(x$labels$subtitle) - 1)
),
"_Top10Terms_facet.png"
),
x,
scale = 4,
width = 6,
height = 6,
units = "cm"
)
)
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
[[31]]
NULL
[[32]]
NULL
[[33]]
NULL
[[34]]
NULL
[[35]]
NULL
[[36]]
NULL
[[37]]
NULL
[[38]]
NULL
[[39]]
NULL
[[40]]
NULL
[[41]]
NULL
[[42]]
NULL
[[43]]
NULL
[[44]]
NULL
[[45]]
NULL
[[46]]
NULL
[[47]]
NULL
[[48]]
NULL
[[49]]
NULL
[[50]]
NULL
[[51]]
NULL
[[52]]
NULL
[[53]]
NULL
[[54]]
NULL
[[55]]
NULL

Creating a dataframe with `gamma` - the per-document-per-topic probability (i.e., the probability that each document (serv_req_id) is in each topic).
I chose to do this for six different combinations of ngrams and topics (ngram = 3 & topic = 3, 3 & 4, 4 & 3, 4 & 4, 5 & 3, 5 & 4). This was chosen in part becasue after looking at the graphs produced above, these models seemed (by visual inspection) to perform better. It was also done in part becasue a portion of the analyses below requries defining the topics as `unknown`, `no_rats_found`, or `rats_found` by visual inspection of the graphs produced above. Six also seemed to be a good medium between too few and too many visual inspections to do.
rm(TopNgrams_ByTopic_BarGraphs)
top_terms[[55]] #lda model with ngram = 5 & topics = 12
$ngram
[1] 5
$topic
[1] 12
$top_terms
top_terms[[2]] #lda model with ngram = 1 & topics = 3
$ngram
[1] 1
$topic
[1] 3
$top_terms
top_terms[[3]] #lda model with ngram = 1 & topics = 4
$ngram
[1] 1
$topic
[1] 4
$top_terms
top_terms[[24]] #lda model with ngram = 3 & topics = 3
$ngram
[1] 3
$topic
[1] 3
$top_terms
top_terms[[25]] #lda model with ngram = 3 & topics = 4
$ngram
[1] 3
$topic
[1] 4
$top_terms
top_terms[[35]] #lda model with ngram = 4 & topics = 3
$ngram
[1] 4
$topic
[1] 3
$top_terms
top_terms[[36]] #lda model with ngram = 4 & topics = 4
$ngram
[1] 4
$topic
[1] 4
$top_terms
top_terms[[46]] #lda model with ngram = 5 & topics = 3
$ngram
[1] 5
$topic
[1] 3
$top_terms
top_terms[[47]] #lda model with ngram = 5 & topics = 4
$ngram
[1] 5
$topic
[1] 4
$top_terms
ProbDocInTopic_ngram03_topic03 <-
list(ngram = lda_list[[24]]$ngram,
topic = lda_list[[24]]$topic,
data = tidy(lda_list[[24]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
topic %in% c(2) ~ "no_rats_found",
topic %in% c(3) ~ "rats_found"
),
model_ngram = lda_list[[24]]$ngram,
model_topic = lda_list[[24]]$topic
)
)
ProbDocInTopic_ngram03_topic04 <-
list(ngram = lda_list[[25]]$ngram,
topic = lda_list[[25]]$topic,
data = tidy(lda_list[[25]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
topic %in% c(2) ~ "no_rats_found",
topic %in% c(3, 4) ~ "rats_found"
),
model_ngram = lda_list[[25]]$ngram,
model_topic = lda_list[[25]]$topic
)
)
ProbDocInTopic_ngram04_topic03 <-
list(ngram = lda_list[[35]]$ngram,
topic = lda_list[[35]]$topic,
data = tidy(lda_list[[35]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(#topic %in% c() ~ "unknown",
topic %in% c(1) ~ "no_rats_found",
topic %in% c(2, 3) ~ "rats_found"
),
model_ngram = lda_list[[35]]$ngram,
model_topic = lda_list[[35]]$topic
)
)
ProbDocInTopic_ngram04_topic04 <-
list(ngram = lda_list[[36]]$ngram,
topic = lda_list[[36]]$topic,
data = tidy(lda_list[[36]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
topic %in% c(1) ~ "no_rats_found",
topic %in% c(2, 3) ~ "rats_found"
),
model_ngram = lda_list[[36]]$ngram,
model_topic = lda_list[[36]]$topic
)
)
ProbDocInTopic_ngram05_topic03 <-
list(ngram = lda_list[[46]]$ngram,
topic = lda_list[[46]]$topic,
data = tidy(lda_list[[46]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(2) ~ "unknown",
topic %in% c(3) ~ "no_rats_found",
topic %in% c(1) ~ "rats_found"
),
model_ngram = lda_list[[46]]$ngram,
model_topic = lda_list[[46]]$topic
)
)
ProbDocInTopic_ngram05_topic04 <-
list(ngram = lda_list[[47]]$ngram,
topic = lda_list[[47]]$topic,
data = tidy(lda_list[[47]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
topic %in% c(3) ~ "no_rats_found",
topic %in% c(1, 2) ~ "rats_found"
),
model_ngram = lda_list[[47]]$ngram,
model_topic = lda_list[[47]]$topic
)
)
Here, we put the six individual ProbDocInTopic models together, and add in some of the original information (e.g., the original `servicenotes`) for context.
ProbDocInTopic_AllModels <-
bind_rows(ProbDocInTopic_ngram03_topic03[[3]],
ProbDocInTopic_ngram03_topic04[[3]],
ProbDocInTopic_ngram04_topic03[[3]],
ProbDocInTopic_ngram04_topic04[[3]],
ProbDocInTopic_ngram05_topic03[[3]],
ProbDocInTopic_ngram05_topic04[[3]]
) %>%
arrange(document,
model_ngram,
model_topic,
gamma
) %>%
rename("servicerequestid" = "document") %>%
left_join(select(ServiceNotesCleaned,
servicerequestid,
servicenotes,
serviceorderdate
),
by = c("servicerequestid" = "servicerequestid")
)
rm(list = ls(pattern = "ProbDocInTopic_ngram"))
str(ProbDocInTopic_AllModels)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 276822 obs. of 8 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
$ topic : int 2 3 1 2 3 4 1 2 3 1 ...
$ gamma : num 0.0142 0.0142 0.9717 0.018 0.018 ...
$ topic_name : chr "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
$ model_ngram : int 3 3 3 3 3 3 3 4 4 4 ...
$ model_topic : int 3 3 3 4 4 4 4 3 3 3 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_AllModels, 100)
View(head(ProbDocInTopic_AllModels, 1000))
Remove the datafiles that are no longer needed.
rm(list = ls(pattern = "_list"))
rm(PerTopicPerNgram, top_terms)
Next, for each model (e.g., 3-gram 4-topic), we sum the probabilities given for each numeric topic, into the "rats topics" (e.g., `rats_found`) which were defined above via visual inspection of the graphs on the Top 10 ngrams in each numeric topic.
I also pull out the 5-gram 4-topic model because it appeared (visually) to be the most accurate individual model.
ProbDocInTopic_ProbsSummed_ByModel <-
ProbDocInTopic_AllModels %>%
group_by(servicerequestid,
model_ngram,
model_topic,
topic_name
) %>%
summarise(prob_ = sum(gamma)
) %>%
ungroup() %>%
left_join(select(ServiceNotesCleaned,
servicerequestid,
servicenotes,
serviceorderdate
),
by = c("servicerequestid" = "servicerequestid")
)
str(ProbDocInTopic_ProbsSummed_ByModel)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 224119 obs. of 7 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
$ model_ngram : int 3 3 3 3 3 3 4 4 4 4 ...
$ model_topic : int 3 3 3 4 4 4 3 3 4 4 ...
$ topic_name : chr "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
$ prob_ : num 0.0142 0.0142 0.9717 0.018 0.0359 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_ProbsSummed_ByModel, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel, 1000))
ProbDocInTopic_ProbsSummed_05gram04topic <-
ProbDocInTopic_ProbsSummed_ByModel %>%
filter(model_ngram == 5 &
model_topic == 4)
str(ProbDocInTopic_ProbsSummed_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 35538 obs. of 7 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001410" ...
$ model_ngram : int 5 5 5 5 5 5 5 5 5 5 ...
$ model_topic : int 4 4 4 4 4 4 4 4 4 4 ...
$ topic_name : chr "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
$ prob_ : num 0.0208 0.0415 0.9377 0.0063 0.0126 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_ProbsSummed_05gram04topic, 100)
View(head(ProbDocInTopic_ProbsSummed_05gram04topic, 1000))
Next, for each ngram-topic combination, I create histograms of the probabilities assigned to each topic. This is done to help visualy determine if the topic assignments are clearly separating documents (serv_request_id values).
A log10 transformation of the probability is done to help more clearly see any differences.
# str(ProbDocInTopic_ProbsSummed_ByModel)
ProbDocInTopic_ProbsSummed_ByModel_Details <-
ProbDocInTopic_ProbsSummed_ByModel %>%
mutate(model = paste0("0",
model_ngram,
"gram_",
"0",
model_topic,
"topic"
),
serviceorder_yr = year(serviceorderdate),
model_and_yr = paste0(model,
"_",
as.character(serviceorder_yr)
)
)
head(ProbDocInTopic_ProbsSummed_ByModel_Details, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel_Details, 1000))
TopicDistro_MainModels_Histogram_ByModel <-
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
ggtitle(label = paste0(.x$model,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "log10 of counts"
) +
facet_wrap(~topic_name)
)
TopicDistro_MainModels_Histogram_ByModelYr <-
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model_and_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
ggtitle(label = paste0(.x$model_and_yr,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "log10 of counts"
) +
facet_wrap(~topic_name)
)
Saving the histograms.
Removing unneeded files.
rm(list = ls(pattern = "TopicDistro_"))
Comparing how the histograms look with a regular y-scale vs a log10 y-scale.
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
scale_y_log10() +
ggtitle(label = paste0(.x$model,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "log10 of counts"
) +
facet_wrap(~topic_name,
scales = "fixed"
)
)
$`03gram_03topic`
$`03gram_04topic`
$`04gram_03topic`
$`04gram_04topic`
$`05gram_03topic`
$`05gram_04topic`






ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
# scale_y_log10() +
ggtitle(label = paste0(.x$model,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "counts"
) +
coord_cartesian(ylim = c(0, 5000)
) +
facet_wrap(~topic_name,
scales = "fixed"
)
)
$`03gram_03topic`
$`03gram_04topic`
$`04gram_03topic`
$`04gram_04topic`
$`05gram_03topic`
$`05gram_04topic`






Then, for each `servicerequestid` and `topic_name` we calculate the mean topic probability across all the models. NOTE: This step could be modified more as my instinct is that more weight should probably be given to the models with larger ngrams and topics (e.g., the 5-gram & 4-topic model). However, using larger values of n-grams will not be able to analyze those records that do not have at least `n` words. Meaning that smaller values of n-grams can analyze more documents, but possibly less accurately.
ProbDocInTopic_MeanProb_BySrvcRqstId <-
ProbDocInTopic_ProbsSummed_ByModel %>%
group_by(servicerequestid,
topic_name
) %>%
summarise(MeanProb = mean(prob_, na.rm = TRUE)
) %>%
left_join(select(ServiceNotesCleaned,
servicerequestid,
servicenotes,
serviceorderdate
),
by = c("servicerequestid" = "servicerequestid")
) %>%
arrange(servicerequestid,
desc(MeanProb)
)
str(ProbDocInTopic_MeanProb_BySrvcRqstId)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 43629 obs. of 5 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001323" ...
$ topic_name : chr "unknown" "no_rats_found" "rats_found" "unknown" ...
$ MeanProb : num 0.581 0.333 0.183 0.479 0.466 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
- attr(*, "vars")= chr "servicerequestid"
- attr(*, "indices")=List of 14543
..$ : int 0 1 2
..$ : int 3 4 5
..$ : int 6 7 8
..$ : int 9 10 11
..$ : int 12 13 14
..$ : int 15 16 17
..$ : int 18 19 20
..$ : int 21 22 23
..$ : int 24 25 26
..$ : int 27 28 29
..$ : int 30 31 32
..$ : int 33 34 35
..$ : int 36 37 38
..$ : int 39 40 41
..$ : int 42 43 44
..$ : int 45 46 47
..$ : int 48 49 50
..$ : int 51 52 53
..$ : int 54 55 56
..$ : int 57 58 59
..$ : int 60 61 62
..$ : int 63 64 65
..$ : int 66 67 68
..$ : int 69 70 71
..$ : int 72 73 74
..$ : int 75 76 77
..$ : int 78 79 80
..$ : int 81 82 83
..$ : int 84 85 86
..$ : int 87 88 89
..$ : int 90 91 92
..$ : int 93 94 95
..$ : int 96 97 98
..$ : int 99 100 101
..$ : int 102 103 104
..$ : int 105 106 107
..$ : int 108 109 110
..$ : int 111 112 113
..$ : int 114 115 116
..$ : int 117 118 119
..$ : int 120 121 122
..$ : int 123 124 125
..$ : int 126 127 128
..$ : int 129 130 131
..$ : int 132 133 134
..$ : int 135 136 137
..$ : int 138 139 140
..$ : int 141 142 143
..$ : int 144 145 146
..$ : int 147 148 149
..$ : int 150 151 152
..$ : int 153 154 155
..$ : int 156 157 158
..$ : int 159 160 161
..$ : int 162 163 164
..$ : int 165 166 167
..$ : int 168 169 170
..$ : int 171 172 173
..$ : int 174 175 176
..$ : int 177 178 179
..$ : int 180 181 182
..$ : int 183 184 185
..$ : int 186 187 188
..$ : int 189 190 191
..$ : int 192 193 194
..$ : int 195 196 197
..$ : int 198 199 200
..$ : int 201 202 203
..$ : int 204 205 206
..$ : int 207 208 209
..$ : int 210 211 212
..$ : int 213 214 215
..$ : int 216 217 218
..$ : int 219 220 221
..$ : int 222 223 224
..$ : int 225 226 227
..$ : int 228 229 230
..$ : int 231 232 233
..$ : int 234 235 236
..$ : int 237 238 239
..$ : int 240 241 242
..$ : int 243 244 245
..$ : int 246 247 248
..$ : int 249 250 251
..$ : int 252 253 254
..$ : int 255 256 257
..$ : int 258 259 260
..$ : int 261 262 263
..$ : int 264 265 266
..$ : int 267 268 269
..$ : int 270 271 272
..$ : int 273 274 275
..$ : int 276 277 278
..$ : int 279 280 281
..$ : int 282 283 284
..$ : int 285 286 287
..$ : int 288 289 290
..$ : int 291 292 293
..$ : int 294 295 296
.. [list output truncated]
- attr(*, "group_sizes")= int 3 3 3 3 3 3 3 3 3 3 ...
- attr(*, "biggest_group_size")= int 3
- attr(*, "labels")='data.frame': 14543 obs. of 1 variable:
..$ servicerequestid: chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
..- attr(*, "vars")= chr "servicerequestid"
tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 100)
View(head(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
View(tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
Here, we create a dataset suitable for graphing and analyzing, by simply selecting the highest topic probability (the `MeanProb` value) assigned to each topic. We also do this for the single 5-gram 4-topic model.
# ProbDocInTopic_AllModels
# ProbDocInTopic_ProbsSummed_ByModel
# ProbDocInTopic_MeanProb_BySrvcRqstId
#
#
# ProbDocInTopic_AllModels %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_ProbsSummed_ByModel %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_MeanProb_BySrvcRqstId %>% select(servicerequestid) %>% distinct() %>% nrow
TopProb_BySrvcRqstId <-
ProbDocInTopic_MeanProb_BySrvcRqstId %>%
mutate(serviceorder_yr = year(serviceorderdate),
# serviceorder_yr2 = as.factor(serviceorder_yr),
yr_group = paste0(as.character(serviceorder_yr),
"_",
as.character(topic_name)
),
model = "AllModels_MeanProb"
) %>%
rename("prob" = "MeanProb") %>%
group_by(servicerequestid) %>%
top_n(1,
prob
) %>%
ungroup() %>%
arrange(prob)
str(TopProb_BySrvcRqstId)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 14543 obs. of 8 variables:
$ servicerequestid: chr "09-00119541" "09-00029707" "09-00053342" "09-00040528" ...
$ topic_name : chr "no_rats_found" "unknown" "rats_found" "rats_found" ...
$ prob : num 0.36 0.365 0.365 0.366 0.366 ...
$ servicenotes : chr "3-8-01 baited 2 burrows front yard 3-30-01 baited 2 rat burrows RH/GB 4-6-01 baited 1 burrow in front gb rh" "3350 C ST SE, NO RAT BURROWS WERE FOUND ON THE PROPERTY. AT 3346 & 3348 C ST SE BAITED REAR GARAGE. RICKS/DD" "THE VECTOR CONTROL BRANCH BAITED 2 BURROWS, RIGHT FRONT STEPS. FOLLOW UP BAITING WILL TAKE PLACE SOON; RICKS/PRICE; DD" "THE VECTOR CONTROL BRANCH BAITED THE REAR YARD; 10 RAT BURROWS WERE FOUND. FOLLOW-UP BAITING WILL TAKE PLACE SHORTLY.HERRINGTON"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "2001-02-22 12:25:00" "1999-10-28 16:11:00" ...
$ serviceorder_yr : num 2001 1999 2000 2000 2000 ...
$ yr_group : chr "2001_no_rats_found" "1999_unknown" "2000_rats_found" "2000_rats_found" ...
$ model : chr "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" ...
TopProb_BySrvcRqstId
View(TopProb_BySrvcRqstId)
TopProb_BySrvcRqstId_05gram04topic <-
ProbDocInTopic_ProbsSummed_05gram04topic %>%
mutate(serviceorder_yr = year(serviceorderdate),
# serviceorder_yr2 = as.factor(serviceorder_yr),
yr_group = paste0(as.character(serviceorder_yr),
"_",
as.character(topic_name)
),
model = "5gram4topic"
) %>%
rename("prob" = "prob_") %>%
group_by(servicerequestid) %>%
top_n(1,
prob
) %>%
ungroup() %>%
arrange(prob)
str(TopProb_BySrvcRqstId_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 11846 obs. of 10 variables:
$ servicerequestid: chr "10-00147588" "11-00106490" "10-00165933" "11-00138104" ...
$ model_ngram : int 5 5 5 5 5 5 5 5 5 5 ...
$ model_topic : int 4 4 4 4 4 4 4 4 4 4 ...
$ topic_name : chr "rats_found" "unknown" "unknown" "unknown" ...
$ prob : num 0.343 0.383 0.391 0.414 0.442 ...
$ servicenotes : chr "On 5/24/10 G Cornes baited 10 rat burrows in front of bldg around sidewalk." "On 4/27/11@9:11am L rogers baited 4 rat burrows in the rear yd by shed. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 4oz, "| __truncated__ "On 5/18/10 10:25AM G Curtis baited 2 rat burrows in the alley and tree box" "On 5/18/11@10:45am L Rogers baited 1 rat burrow at tree box. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 1oz, Gloves & Sh"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "2010-04-29 14:45:58" "2011-04-15 03:57:15" ...
$ serviceorder_yr : num 2010 2011 2010 2011 2010 ...
$ yr_group : chr "2010_rats_found" "2011_unknown" "2010_unknown" "2011_unknown" ...
$ model : chr "5gram4topic" "5gram4topic" "5gram4topic" "5gram4topic" ...
TopProb_BySrvcRqstId_05gram04topic
View(TopProb_BySrvcRqstId_05gram04topic)
Now, we can simply create the freqpoly plots and density plots for each year-topic combination to investigate how the topic assignment did over time, and then save them.
# str(TopProb_BySrvcRqstId)
# str(TopProb_BySrvcRqstId_05gram04topic)
TopicDistro_AllModelsMean_Freqpoly <-
TopProb_BySrvcRqstId %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_freqpoly(binwidth = 0.05,
alpha = 0.6
) +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Freqpoly_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
TopicDistro_AllModelsMean_Density <-
TopProb_BySrvcRqstId %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_density() +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Density_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
TopicDistro_05gram04topic_Freqpoly <-
TopProb_BySrvcRqstId_05gram04topic %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_freqpoly(binwidth = 0.05,
alpha = 0.6
) +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Freqpoly_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
TopicDistro_05gram04topic_Density <-
TopProb_BySrvcRqstId_05gram04topic %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_density() +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Density_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
# str(TopicDistro_AllModelsMean_Freqpoly[[18]])
# str(TopicDistro_AllModelsMean_Density[[18]])
# str(TopicDistro_05gram04topic_Freqpoly[[18]])
# str(TopicDistro_05gram04topic_Density[[18]])
TopicDistro_AllModelsMean_Freqpoly[[18]]

TopicDistro_AllModelsMean_Density[[18]]

TopicDistro_05gram04topic_Freqpoly[[18]]

TopicDistro_05gram04topic_Density[[18]]

Saving the visuals created above.
Removing unneeded files.
rm(list = ls(pattern = "TopicDistro_"))
As another method to determine the "correct" topic assignment, here I simply count the number of times a topic assignment was given to each document (servicerequestid), and assign the "correct" topic to the topic with the most assignments. In the case of ties (e.g., all three topics, each assigned twice), I use the `MeanProb` calculated above in the `ProbDocInTopic_MeanProb_BySrvcRqstId` dataframe previously.
str(ProbDocInTopic_ProbsSummed_ByModel)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 224119 obs. of 7 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
$ model_ngram : int 3 3 3 3 3 3 4 4 4 4 ...
$ model_topic : int 3 3 3 4 4 4 3 3 4 4 ...
$ topic_name : chr "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
$ prob_ : num 0.0142 0.0142 0.9717 0.018 0.0359 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
View(ProbDocInTopic_ProbsSummed_ByModel)
# ProbDocInTopic_ProbsSummed_ByModel %>% select(model_ngram, model_topic) %>% distinct()
str(ProbDocInTopic_MeanProb_BySrvcRqstId)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 43629 obs. of 5 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001323" ...
$ topic_name : chr "unknown" "no_rats_found" "rats_found" "unknown" ...
$ MeanProb : num 0.581 0.333 0.183 0.479 0.466 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
- attr(*, "vars")= chr "servicerequestid"
- attr(*, "indices")=List of 14543
..$ : int 0 1 2
..$ : int 3 4 5
..$ : int 6 7 8
..$ : int 9 10 11
..$ : int 12 13 14
..$ : int 15 16 17
..$ : int 18 19 20
..$ : int 21 22 23
..$ : int 24 25 26
..$ : int 27 28 29
..$ : int 30 31 32
..$ : int 33 34 35
..$ : int 36 37 38
..$ : int 39 40 41
..$ : int 42 43 44
..$ : int 45 46 47
..$ : int 48 49 50
..$ : int 51 52 53
..$ : int 54 55 56
..$ : int 57 58 59
..$ : int 60 61 62
..$ : int 63 64 65
..$ : int 66 67 68
..$ : int 69 70 71
..$ : int 72 73 74
..$ : int 75 76 77
..$ : int 78 79 80
..$ : int 81 82 83
..$ : int 84 85 86
..$ : int 87 88 89
..$ : int 90 91 92
..$ : int 93 94 95
..$ : int 96 97 98
..$ : int 99 100 101
..$ : int 102 103 104
..$ : int 105 106 107
..$ : int 108 109 110
..$ : int 111 112 113
..$ : int 114 115 116
..$ : int 117 118 119
..$ : int 120 121 122
..$ : int 123 124 125
..$ : int 126 127 128
..$ : int 129 130 131
..$ : int 132 133 134
..$ : int 135 136 137
..$ : int 138 139 140
..$ : int 141 142 143
..$ : int 144 145 146
..$ : int 147 148 149
..$ : int 150 151 152
..$ : int 153 154 155
..$ : int 156 157 158
..$ : int 159 160 161
..$ : int 162 163 164
..$ : int 165 166 167
..$ : int 168 169 170
..$ : int 171 172 173
..$ : int 174 175 176
..$ : int 177 178 179
..$ : int 180 181 182
..$ : int 183 184 185
..$ : int 186 187 188
..$ : int 189 190 191
..$ : int 192 193 194
..$ : int 195 196 197
..$ : int 198 199 200
..$ : int 201 202 203
..$ : int 204 205 206
..$ : int 207 208 209
..$ : int 210 211 212
..$ : int 213 214 215
..$ : int 216 217 218
..$ : int 219 220 221
..$ : int 222 223 224
..$ : int 225 226 227
..$ : int 228 229 230
..$ : int 231 232 233
..$ : int 234 235 236
..$ : int 237 238 239
..$ : int 240 241 242
..$ : int 243 244 245
..$ : int 246 247 248
..$ : int 249 250 251
..$ : int 252 253 254
..$ : int 255 256 257
..$ : int 258 259 260
..$ : int 261 262 263
..$ : int 264 265 266
..$ : int 267 268 269
..$ : int 270 271 272
..$ : int 273 274 275
..$ : int 276 277 278
..$ : int 279 280 281
..$ : int 282 283 284
..$ : int 285 286 287
..$ : int 288 289 290
..$ : int 291 292 293
..$ : int 294 295 296
.. [list output truncated]
- attr(*, "group_sizes")= int 3 3 3 3 3 3 3 3 3 3 ...
- attr(*, "biggest_group_size")= int 3
- attr(*, "labels")='data.frame': 14543 obs. of 1 variable:
..$ servicerequestid: chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
..- attr(*, "vars")= chr "servicerequestid"
View(ProbDocInTopic_MeanProb_BySrvcRqstId)
TopicAssigned_ByCounts_ByMeanProb <-
ProbDocInTopic_ProbsSummed_ByModel %>%
group_by(servicerequestid,
model_ngram,
model_topic
) %>%
top_n(1,
prob_
) %>%
ungroup() %>%
count(servicerequestid,
topic_name
) %>%
left_join(ProbDocInTopic_MeanProb_BySrvcRqstId,
by = c("servicerequestid" = "servicerequestid",
"topic_name" = "topic_name"
)
) %>%
select(servicerequestid,
topic_name,
n,
MeanProb
) %>%
group_by(servicerequestid) %>%
arrange(servicerequestid,
desc(n),
desc(MeanProb)
) %>%
ungroup() %>%
group_by(servicerequestid) %>%
mutate(RowNum = row_number()
) %>%
ungroup() %>%
filter(RowNum == 1) %>%
rename(times_topic_assigned = n) %>%
left_join(ServiceNotesCleaned,
by = "servicerequestid"
) %>%
select(servicerequestid,
topic_name,
times_topic_assigned,
MeanProb,
servicenotes,
servicenotes_nonums_nopunc,
serviceorderdate,
serviceorder_date,
serviceorder_yr,
serviceorder_yr_posix,
serviceorder_mth,
serviceorder_yrmth,
serviceorder_yrmth_posix,
serviceorder_day,
serviceorder_wkday
)
str(ServiceNotesCleaned)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 15581 obs. of 17 variables:
$ servicerequestid : chr "10-00020177" "10-00020346" "10-00020514" "10-00020754" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "2010-01-21 15:46:23" "2010-01-21 16:32:51" ...
$ servicenotes : chr "On 1/25/10 L Rogers baited 3 rat burrows in the rear yd." "On 1/26/10 G Cornes found no rat burrows in the area." "On 1/28/10 L Rogers baited 4 rat burrows in the rear yd." "On 1/26/10 V Purvis baited 9 rat burrows in the park." ...
$ serviceorder_date : Date, format: "2010-01-21" "2010-01-21" ...
$ serviceorder_yr : num 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
$ serviceorder_yr_posix : POSIXct, format: "2010-01-01" "2010-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ serviceorder_yrmth : chr "2010-01" "2010-01" "2010-01" "2010-01" ...
$ serviceorder_yrmth_posix : POSIXct, format: "2010-01-01" "2010-01-01" ...
$ serviceorder_day : int 21 21 21 21 21 22 22 22 22 22 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 5 5 5 5 5 6 6 6 6 6 ...
$ servicenotes_nonums_nopunc: chr " rogers baited rat burrows rear yd" " cornes found no rat burrows" " rogers baited rat burrows rear yd" " purvis baited rat burrows park" ...
str(TopicAssigned_ByCounts_ByMeanProb)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 14543 obs. of 15 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ topic_name : chr "unknown" "unknown" "rats_found" "no_rats_found" ...
$ times_topic_assigned : int 3 1 4 4 3 3 3 4 2 2 ...
$ MeanProb : num 0.581 0.479 0.662 0.651 0.593 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
head(TopicAssigned_ByCounts_ByMeanProb, 1000)
View(TopicAssigned_ByCounts_ByMeanProb)
So now, we can compare two different models, each being slight variations of six LDA models.
1) `TopProb_BySrvcRqstId` assigns the topic by taking the mean topic probability score across all six models.
2) `TopicAssigned_ByCounts_ByMeanProb` assigns the topic by taking the most frequently assigned topic across all six models.
str(TopicAssigned_ByCounts_ByMeanProb)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 14543 obs. of 15 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ topic_name : chr "unknown" "unknown" "rats_found" "no_rats_found" ...
$ times_topic_assigned : int 3 1 4 4 3 3 3 4 2 2 ...
$ MeanProb : num 0.581 0.479 0.662 0.651 0.593 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
str(TopProb_BySrvcRqstId)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 14543 obs. of 8 variables:
$ servicerequestid: chr "09-00119541" "09-00029707" "09-00053342" "09-00040528" ...
$ topic_name : chr "no_rats_found" "unknown" "rats_found" "rats_found" ...
$ prob : num 0.36 0.365 0.365 0.366 0.366 ...
$ servicenotes : chr "3-8-01 baited 2 burrows front yard 3-30-01 baited 2 rat burrows RH/GB 4-6-01 baited 1 burrow in front gb rh" "3350 C ST SE, NO RAT BURROWS WERE FOUND ON THE PROPERTY. AT 3346 & 3348 C ST SE BAITED REAR GARAGE. RICKS/DD" "THE VECTOR CONTROL BRANCH BAITED 2 BURROWS, RIGHT FRONT STEPS. FOLLOW UP BAITING WILL TAKE PLACE SOON; RICKS/PRICE; DD" "THE VECTOR CONTROL BRANCH BAITED THE REAR YARD; 10 RAT BURROWS WERE FOUND. FOLLOW-UP BAITING WILL TAKE PLACE SHORTLY.HERRINGTON"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "2001-02-22 12:25:00" "1999-10-28 16:11:00" ...
$ serviceorder_yr : num 2001 1999 2000 2000 2000 ...
$ yr_group : chr "2001_no_rats_found" "1999_unknown" "2000_rats_found" "2000_rats_found" ...
$ model : chr "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" ...
str(TopProb_BySrvcRqstId_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 11846 obs. of 10 variables:
$ servicerequestid: chr "10-00147588" "11-00106490" "10-00165933" "11-00138104" ...
$ model_ngram : int 5 5 5 5 5 5 5 5 5 5 ...
$ model_topic : int 4 4 4 4 4 4 4 4 4 4 ...
$ topic_name : chr "rats_found" "unknown" "unknown" "unknown" ...
$ prob : num 0.343 0.383 0.391 0.414 0.442 ...
$ servicenotes : chr "On 5/24/10 G Cornes baited 10 rat burrows in front of bldg around sidewalk." "On 4/27/11@9:11am L rogers baited 4 rat burrows in the rear yd by shed. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 4oz, "| __truncated__ "On 5/18/10 10:25AM G Curtis baited 2 rat burrows in the alley and tree box" "On 5/18/11@10:45am L Rogers baited 1 rat burrow at tree box. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 1oz, Gloves & Sh"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "2010-04-29 14:45:58" "2011-04-15 03:57:15" ...
$ serviceorder_yr : num 2010 2011 2010 2011 2010 ...
$ yr_group : chr "2010_rats_found" "2011_unknown" "2010_unknown" "2011_unknown" ...
$ model : chr "5gram4topic" "5gram4topic" "5gram4topic" "5gram4topic" ...
dim(TopicAssigned_ByCounts_ByMeanProb)
[1] 14543 15
dim(TopProb_BySrvcRqstId)
[1] 14543 8
dim(TopProb_BySrvcRqstId_05gram04topic)
[1] 11846 10
a <- TopProb_BySrvcRqstId %>%
select(servicerequestid,
topic_name,
prob
) %>%
rename(topic_name_meanprob = topic_name,
prob_meanprob = prob
)
b <- TopicAssigned_ByCounts_ByMeanProb %>%
rename(topic_name_topcounts = topic_name,
times_topic_assigned_topcounts = times_topic_assigned,
prob_topcounts = MeanProb
)
ModelsCompare <- a %>%
left_join(b,
by = "servicerequestid"
)
rm(a, b)
str(ModelsCompare)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 14543 obs. of 17 variables:
$ servicerequestid : chr "09-00119541" "09-00029707" "09-00053342" "09-00040528" ...
$ topic_name_meanprob : chr "no_rats_found" "unknown" "rats_found" "rats_found" ...
$ prob_meanprob : num 0.36 0.365 0.365 0.366 0.366 ...
$ topic_name_topcounts : chr "no_rats_found" "unknown" "rats_found" "rats_found" ...
$ times_topic_assigned_topcounts: int 3 2 2 2 2 2 2 2 2 2 ...
$ prob_topcounts : num 0.36 0.365 0.365 0.366 0.366 ...
$ servicenotes : chr "3-8-01 baited 2 burrows front yard 3-30-01 baited 2 rat burrows RH/GB 4-6-01 baited 1 burrow in front gb rh" "3350 C ST SE, NO RAT BURROWS WERE FOUND ON THE PROPERTY. AT 3346 & 3348 C ST SE BAITED REAR GARAGE. RICKS/DD" "THE VECTOR CONTROL BRANCH BAITED 2 BURROWS, RIGHT FRONT STEPS. FOLLOW UP BAITING WILL TAKE PLACE SOON; RICKS/PRICE; DD" "THE VECTOR CONTROL BRANCH BAITED THE REAR YARD; 10 RAT BURROWS WERE FOUND. FOLLOW-UP BAITING WILL TAKE PLACE SHORTLY.HERRINGTON"| __truncated__ ...
$ servicenotes_nonums_nopunc : chr " baited burrows front yard baited rat burrows rh gb baited burrow front gb rh" " st se no rat burrows found property st se baited rear garage ricks dd" "vector control branch baited burrows front steps follow baiting ricks price dd" "vector control branch baited rear yard rat burrows found follow baiting shortlyherrington brown dd" ...
$ serviceorderdate : POSIXct, format: "2001-02-22 12:25:00" "1999-10-28 16:11:00" ...
$ serviceorder_date : Date, format: "2001-02-22" "1999-10-28" ...
$ serviceorder_yr : num 2001 1999 2000 2000 2000 ...
$ serviceorder_yr_posix : POSIXct, format: "2001-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 2 10 3 1 1 4 4 5 5 5 ...
$ serviceorder_yrmth : chr "2001-02" "1999-10" "2000-03" "2000-01" ...
$ serviceorder_yrmth_posix : POSIXct, format: "2001-02-01" "1999-10-01" ...
$ serviceorder_day : int 22 28 22 13 21 17 1 13 16 24 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 5 5 4 5 6 2 4 5 1 2 ...
ModelsCompare
View(ModelsCompare)
When comparing the two models, they coincide nearly 99% of the time...but the disagreements still raise questions.
filter(ModelsCompare,
topic_name_meanprob != topic_name_topcounts
) %>%
nrow() / nrow(ModelsCompare) * 100
[1] 1.155195
filter(ModelsCompare,
topic_name_meanprob == topic_name_topcounts
) %>%
nrow() / nrow(ModelsCompare) * 100
[1] 98.84481
# View(filter(ModelsCompare,
# topic_name_meanprob != topic_name_topcounts
# )
# )
View(filter(ModelsCompare,
topic_name_meanprob != topic_name_topcounts
) %>%
select(servicerequestid,
topic_name_meanprob,
topic_name_topcounts,
servicenotes,
servicenotes_nonums_nopunc
)
)
As the models assign topics almost identically, let's use the `topcounts` model to do some quick inspections about how often each of the topics were assigned, when, changes over time, etc.
Counts_AllYears <- ModelsCompare %>%
mutate(topic_name = factor(topic_name_topcounts,
levels = c("unknown",
"no_rats_found",
"rats_found"
)
)
) %>%
group_by(topic_name) %>%
count() %>%
rename(counts = n)
Counts_AcrossYears <- ModelsCompare %>%
mutate(topic_name = factor(topic_name_topcounts,
levels = c("unknown",
"no_rats_found",
"rats_found"
)
)
) %>%
group_by(topic_name,
serviceorder_yr
) %>%
count() %>%
rename(counts = n)
ggplot(data = Counts_AllYears,
aes(x = topic_name,
y = counts,
fill = topic_name
)
) +
geom_col() +
geom_text(aes(label = counts),
nudge_y = -200,
size = 3
) +
# theme_minimal() +
theme(legend.position = "none") +
coord_flip()
ggsave(paste0(wd,
"/Viz/",
# "New_",
"Topics_CountModel_Counts_AllYears.png"
),
scale = 4,
width = 6,
height = 6,
units = "cm"
)

ggplot(data = Counts_AcrossYears,
aes(x = topic_name,
y = counts,
fill = topic_name
)
) +
geom_col() +
geom_text(aes(label = counts),
nudge_y = 100,
size = 2.5
) +
scale_y_continuous(limits = c(0, 1800),
breaks = seq(0, 1800, 300)
) +
facet_wrap(~serviceorder_yr) +
theme(legend.position = "none") +
coord_flip()
ggsave(paste0(wd,
"/Viz/",
# "New_",
"Topics_CountModel_Counts_AcrossYears.png"
),
scale = 4,
width = 8,
height = 6,
units = "cm"
)

---
title: "Example of Using Topic Modeling (via Latent Dirichlet allocation) to Pull Out Features from 311 Service Notes"
output: html_notebook
---
  
    
    Load the relevant libraries.
```{r, message=FALSE, warning=FALSE}

# rm(list = ls())


library("tidyverse")          # data manipulation
library("magrittr")           # data manipulation (pipeing data)
library("stringr")            # string manipulation
library("lubridate")          # date manipulation
library("tidytext")           # text manipulation
library("topicmodels")        # topic modeling
library("ggplot2")            # viz
library("doParallel")         # parallel processing
library("ldatuning")          # estimating the proper number of topics
# library("scales")
# library("cowplot")            # grid plotting

```
  
    
    Session Info.
```{r}

sessionInfo()

```
  
    
    Setup the root directory.
```{r "setup", include = FALSE}

require("knitr")

opts_knit$set(root.dir = "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon")

```
  
    
    Setting `wd` as the working directory.
```{r}

wd <- getwd()

wd

```
  
    
    Get the raw data. Because of trouble maintaining a connection to Dropbox via R, I first downloaded the raw data from [https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0](https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0) and saved the file locally.
```{r, warning=FALSE}

Raw311Data <- read_csv(paste0(wd,
                              "/Data_Raw/dc_311-2017-01-16.csv"
                              # "/Data_Raw/dc_311-2017-10-07.csv"
                              ),
                       progress = FALSE
                       )

# saving is done to avoid having to download all the data again
saveRDS(Raw311Data,
        paste0(wd,
               "/Data_Processed/",
               "Raw311Data.Rds"
               )
        )

str(Raw311Data)
tail(Raw311Data, 500)

```
  
    
    Un-comment the chunk below to load the saved raw data (to avoid having to download the raw data again).
```{r}

# Raw311Data <- readRDS(paste0(wd,
#                              "/Data_Processed/",
#                              "Raw311Data.Rds"
#                              )
#                       )
# 
# str(Raw311Data)
# tail(Raw311Data, 500)
# View(tail(Raw311Data, 1000))

```
  
    
    Selecting those variables that may be useful to test breakdowns of topic modeling.  For example, running a topic model separately for the different levels of `servicecode`.
```{r}

SelectedVars <- select(Raw311Data,
                       SERVICEREQUESTID,
                       SERVICEPRIORITY,
                       SERVICECODE,
                       SERVICECODEDESCRIPTION,
                       SERVICETYPECODE,
                       SERVICETYPECODEDESCRIPTION,
                       SERVICEORDERDATE,
                       SERVICENOTES
                       )

names(SelectedVars) <- tolower(names(SelectedVars))

rm(Raw311Data)
str(SelectedVars)

```
  
    
    Quick visual inspection of filtering the data to only service calls with notes (i.e., removing NA values), and only those that are rat-related (`servicecode == "S0311`).  
      
    Removing NA values takes us from 5,093,887 rows to 3,018,411 rows.  
    
    Looking at only rat-related service calls takes us from 3,018,411 rows to 15,581 rows.
```{r}

NoNAServiceNotes <- filter(SelectedVars,
                           !is.na(servicenotes)
                           )

# message("SelectedVars")
nrow(SelectedVars)
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)

rm(SelectedVars)
str(NoNAServiceNotes)

View(head(NoNAServiceNotes, 1000))



RatCalls <- filter(NoNAServiceNotes,
                   servicecode == "S0311"
                   )

# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
# message("RatCalls")
nrow(RatCalls)

rm(NoNAServiceNotes)
View(RatCalls)

```
  
    
    Add in time-related variables.
```{r}

RatCalls_Time <- RatCalls %>%
  mutate(serviceorder_date = as_date(serviceorderdate),
         serviceorder_yr = year(serviceorderdate),
         serviceorder_yr_posix = floor_date(serviceorderdate, "year"),
         serviceorder_mth = month(serviceorderdate, label = TRUE),
         serviceorder_yrmth = as.character(serviceorder_date) %>% 
           substr(1, 7),
         serviceorder_yrmth_posix = floor_date(serviceorderdate, "month"),
         serviceorder_day = day(serviceorderdate),
         serviceorder_wkday = wday(serviceorderdate, label = TRUE)
         )

rm(RatCalls)
str(RatCalls_Time)
tail(RatCalls_Time, 500)
View(tail(RatCalls_Time, 1000))

```
  
    
    Next we need to clean up the text of the `servicenotes` variable - this will be done in multiple steps.  
      
    As the first step, we'll remove common "stopwords" (e.g., is, the, and, etc.) as they won't be very useful in finding topics in the `servicenotes` text. Although it's a stopwords, we specifically do not remove the word "no" as it is often used to distinguish between "rats found" and "no rats found".
```{r}

NoStopWords_Unnest <- 
  RatCalls_Time %>% 
  select(servicerequestid,
         servicenotes
         ) %>% 
  unnest_tokens(word,
                servicenotes
                ) %>% 
  anti_join(filter(stop_words,
                   word != "no" # we don't remove the word "no" as it is often used to distinguish between "rats found" and "no rats found"
                   ),
            by = "word"
            )

Servicenotes_NoStopWrds <- NoStopWords_Unnest %>% 
  nest(word) %>% 
  mutate(servicenotes_nostop = map(data,
                                   unlist
                                   ),
         servicenotes_nostop = map_chr(servicenotes_nostop,
                                       paste,
                                       collapse = " "
                                       )
         ) %>% 
  select(-data)


Remove_StopWrds <- RatCalls_Time %>% 
  left_join(Servicenotes_NoStopWrds,
            by = "servicerequestid"
            )

dim(RatCalls_Time)
dim(Remove_StopWrds)

rm(NoStopWords_Unnest, Servicenotes_NoStopWrds)

head(Remove_StopWrds, 100)
View(head(Remove_StopWrds, 100))

```
  
    
    
```{r}

# str(Remove_StopWrds)

# View(Remove_StopWrds %>% 
#        filter(str_detect(servicenotes_nostop,
#                          "washington dc"
#                          )
#               ) %>% 
#        select(servicenotes,
#               servicenotes_nostop
#               )
#      )

```
  
    
    Then, we'll remove any numeric characters from 'servicenotes' to avoid distinctions not needed at this level (e.g., "baited 3 rat borrows" vs. "baited 6 rat burrows"). We'll also remove punctuation.
```{r}

ServiceNotesCleaned <- Remove_StopWrds %>% 
  mutate(servicenotes_nonums_nopunc = str_replace_all(servicenotes_nostop,
                                                      "[[:digit:]]",
                                                      ""
                                                      ) %>% 
           str_replace_all("[[:punct:]]",
                           ""
                           )
         ) %>% 
  select(-servicenotes_nostop)

dim(RatCalls_Time)
dim(Remove_StopWrds)
dim(ServiceNotesCleaned)

# View(select(ServiceNotesCleaned,
#             servicerequestid,
#             servicenotes,
#             servicenotes_nonums_nopunc
#             ) %>% 
#        filter(servicerequestid %in% nomatch$servicerequestid)
#      )

rm(RatCalls_Time, Remove_StopWrds)

head(ServiceNotesCleaned, 100)
View(head(ServiceNotesCleaned, 100))

```
  
    
    Now, we can inspect the frequencies of rat-related service requests.
```{r}

summary(ServiceNotesCleaned)
# summary(RatCalls_Time$serviceorderdate)
# library("psych")
# describe(RatCalls_Time$serviceorderdate)


ggplot_theme_basic <-
  theme(panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        # axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_line(size = 1, colour = "black")
        )

# ggplot(data = RatCalls_Time,
#        aes(x = serviceorder_date)
#        ) +
#   geom_histogram() +
#   ggplot_theme_basic

yr_counts <- ServiceNotesCleaned %>% 
  group_by(serviceorder_yr_posix) %>% 
  summarise(Cnt = n()
            ) %>% 
  arrange(serviceorder_yr_posix)

ggplot(data = yr_counts,
       aes(x = serviceorder_yr_posix,
           y = Cnt
           )
       ) +
  geom_col(fill = "light blue") + #, alpha = 0.1) +
  geom_text(aes(label = Cnt),
            nudge_y = 50,
            size = 3
            ) +
  labs(title = "Counts of non-NA ServiceNotes",
       # subtitle = "by year",
       x = "Year",
       y = "Count"
       ) +
  ggplot_theme_basic +
  theme(axis.text.x = element_text(angle = 90)
        ) +
  scale_x_datetime(date_breaks = "1 year")



yrmth_counts <- ServiceNotesCleaned %>% 
  group_by(serviceorder_yrmth_posix) %>% 
  summarise(Cnt = n()
            ) %>% 
  arrange(serviceorder_yrmth_posix)

ggplot(data = yrmth_counts,
       aes(x = serviceorder_yrmth_posix,
           y = Cnt
           )
       ) +
  geom_col(fill = "light blue") + #, alpha = 0.1) +
  # geom_text(aes(label = Cnt),
  #           nudge_y = 50,
  #           size = 3
  #           ) +
  labs(title = "Counts of non-NA ServiceNotes",
       x = "Year-Month",
       y = "Count"
       ) +
  ggplot_theme_basic +
  theme(axis.text.x = element_text(angle = 90)
        ) +
  coord_cartesian(xlim = c(as.POSIXct("1998-12-01"),
                           as.POSIXct("2017-12-01")
                           ),
                  expand = TRUE
                  ) +
  scale_x_datetime(date_breaks = "6 months")

```
  
    
    Based on the frequencies of when we actually have 'servicenotes' data, let's try limiting the dataset to service calls from 2010 or later. This reduces the dataset further, from 15,581 rows to 11,037 rows. 
```{r}

rm(yr_counts, yrmth_counts)


ServiceNotesCleanedAfter2010 <- ServiceNotesCleaned %>%
  filter(serviceorderdate >= as_date("2010-01-01")
         )

nrow(ServiceNotesCleaned)
nrow(ServiceNotesCleanedAfter2010)

summary(ServiceNotesCleanedAfter2010)

```
  
    
    Then, transform the `servicenotes` field into one row per n-gram. Because we don't know what level of 'n' to use, we'll cycle through the possibilities from n = 1 to n = 5.
```{r}

ngram_list <- 1:5

Rat_Ngram_list <- list()

Rat_Ngram_list <- lapply(ngram_list,
                         function(i) {
                           # x <- paste0("0", i, "_gram")
                           ServiceNotesCleaned %>% 
                             unnest_tokens(n_gram,
                                           servicenotes_nonums_nopunc,
                                           token = "ngrams",
                                           n = i
                                           )
                           }
                         )

# rm(ngram_list)
Rat_Ngram_list
# str(Rat_Ngram_list[[1]])

```
  
    
    Counting the n-grams in each `servicerequestid`.
```{r}

word_counts_list <- list()

word_counts_list <- lapply(ngram_list,
                           function(i) {
                             Rat_Ngram_list[[i]] %>% 
                               count(servicerequestid,
                                     n_gram,
                                     sort = TRUE
                                     )
                             }
                           )

word_counts_list

```
  
    
    Transforming the dataframe into a document term matrix - i.e., documents (`servicerequestid`s) are the rows and n-grams are the columns.
```{r}

dtm_list <- list()

dtm_list <- lapply(ngram_list,
                   function(i) {
                     word_counts_list[[i]] %>% 
                       cast_dtm(document = servicerequestid,
                                term = n_gram,
                                value = n,
                                # weighting = tm::weightTfIdf,
                                # using term frequency inverse document frequency (TfIdf) weighting is another, possibly more accurate measure, but topicmodels::LDA (used below) only accepts document term matrices with term-frequency weighting
                                weighting = tm::weightTf
                                )
                     }
                   )

dtm_list

```
  
    
    To determine the "proper" number of topics, here I try using the `ldatuning::FindTopicsNumber` function. The code chunk is based on the vignette here:  [https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html](https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html).  
    
    This analyses was done separately for each n-gram level (n = 1:5), and the overall results were inconclusive - the "proper" number of topics fluctuated between the highest level tried (12 topics) and one of the lowest levels tried (2, 3, or 4 topics).
```{r}

detectCores(logical = TRUE) - 1


tunes_list <- dtm_list %>% 
  map(~ FindTopicsNumber(.x,
                         topics = c(2:12),
                         metrics = c("Griffiths2004",
                                     "CaoJuan2009",
                                     "Arun2010",
                                     "Deveaud2014"
                                     ),
                         method = "Gibbs",
                         control = list(seed = 123456789),
                         mc.cores = 3L,
                         verbose = TRUE
                         )
      )


# str(tunes_list[[5]])

topic_plots <-
  tunes_list %>% 
  map(~ FindTopicsNumber_plot(.x)
      )

topic_plots

```
  
    
    As an alternative, here I try to determine the "proper" number of topics using `topicmodels::perplexity`. Perplexity measures how well a probability model predicts a sample, and I use it here via 10-folder cross validation. For computational purposes, I'm only trying this for the 5-gram model.  
    
    This is based on the method used here:
    [http://ellisp.github.io/blog/2017/01/05/topic-model-cv](http://ellisp.github.io/blog/2017/01/05/topic-model-cv)
    
    As with the `ldatuning::FindTopicsNumber` function used previously, `topicmodels::perplexity` is also inconclusive as there is no clear "elbow" in the perplexity plot.
```{r}

full_data <- dtm_list[[5]]

n <- nrow(full_data)
seed <- 123456789
topic_guess <- 12
folds <- 10
burnin <- 1000
iter <-1000
keep <-50

#----------------10-fold cross-validation, different numbers of topics----------------
cluster <- makeCluster(detectCores(logical = TRUE) - 1
                       ) # leave one CPU spare...
registerDoParallel(cluster)

clusterEvalQ(cluster, {
   library(topicmodels)
})

folds <- folds
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2:topic_guess) # candidates for how many topics
clusterExport(cluster,
              c("full_data", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k")
              )

# we parallelize by the different number of topics.  A processor is allocated a value
# of k, and does the cross-validation serially.  This is because it is assumed there
# are more candidate values of k than there are cross-validation folds, hence it
# will be more efficient to parallelise
system.time({
results <- foreach(j = 1:length(candidate_k),
                   .combine = rbind
                   ) %dopar%{
   k <- candidate_k[j]
   
   results_1k <- matrix(0,
                        nrow = folds,
                        ncol = 2
                        )
   
   colnames(results_1k) <- c("k", "perplexity")
   
   for(i in 1:folds){
      train_set <- full_data[splitfolds != i , ]
      valid_set <- full_data[splitfolds == i, ]
      
      fitted <- LDA(train_set,
                    k = k,
                    method = "Gibbs",
                    control = list(seed = seed,
                                   verbose = 1,
                                   burnin = burnin,
                                   iter = iter,
                                   keep = keep
                                   )
                    )
      
      results_1k[i, ] <- c(k, perplexity(fitted, newdata = valid_set)
                           )
   }
   
   return(results_1k)
}
})

stopCluster(cluster)

results_df <- as.data.frame(results)


# ggplot(data = results_df,
#        aes(x = k,
#            y = perplexity)
#        ) +
#   geom_point() +
#   geom_smooth(se = FALSE) +
#   coord_cartesian(xlim = c(0, 12)
#                   ) +
#   scale_x_continuous(breaks = seq(0, 12, 2)
#                      ) +
#   ggplot_theme_basic +
#   ggtitle(label = "10-fold cross-validation of topic modelling",
#           subtitle = "(i.e., 10 different models fit for each potential number of topics)"
#           ) +
#   labs(x = "Potential Number of Topics",
#        y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
#        )


ggplot(data = results_df,
       aes(x = k,
           y = perplexity)
       ) +
  geom_point() +
  geom_smooth(se = TRUE) +
  coord_cartesian(xlim = c(0, 12),
                  ylim = c(0, 6000)
                  ) +
  scale_x_continuous(breaks = seq(0, 12, 2)
                     ) +
  # ggplot_theme_basic +
  ggtitle(label = "10-fold cross-validation of topic modelling",
          subtitle = "(i.e., 10 different models fit for each potential number of topics)"
          ) +
  labs(x = "Potential Number of Topics",
       y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
       )

```
  
    
    Remove the no-longer-needed files.
```{r}

rm(cluster, full_data, results, topic_plots, tunes_list, burnin, candidate_k, folds, iter, keep, n, seed, splitfolds, topic_guess)

```
  
    
    Here I use Latent Dirichlet allocation for topic modeling. As determining the "proper" number of topics was inconclusive, I'm cycling through every combination of ngrams (1:5) and topics (2:12).
```{r}

topic_guess <- 2:12

lda_list <- list()


cluster <- makeCluster(detectCores(logical = TRUE) - 1
                       ) # leave one CPU spare...
registerDoParallel(cluster)

for(i in ngram_list) {
  for(j in topic_guess) {
    x <- LDA(dtm_list[[i]],
             k = j,
             control = list(seed = 123456789,
                            verbose = 1
                            )
             )
    
    ifelse((i == min(ngram_list) &
              j == min(topic_guess)
            ),
           countx <- 1,
           countx <- countx + 1
           )
    
    lda_list[[countx]] <- list(ngram = i,
                               topic = j,
                               lda_model = x
                               )
    }
  }

stopCluster(cluster)

rm(ngram_list, topic_guess, i, j, x, countx, cluster)

lda_list

```
  
    
    Creating a dataframe with `beta` - the per-topic-per-ngram probability (i.e., the probability that each ngram is in each topic).
```{r}

PerTopicPerNgram <- list()

for(i in 1:length(lda_list)
    ) {
  x <- tidy(lda_list[[i]]$lda_model,
            matrix = "beta"
            ) %>% 
    arrange(term,
            desc(beta)
            )
  
  PerTopicPerNgram[[i]] <- list(ngram = lda_list[[i]]$ngram,
                                topic = lda_list[[i]]$topic,
                                PerTopicPerNgram = x
                                )
  }

rm(i, x)


str(PerTopicPerNgram[[55]]$PerTopicPerNgram)

# rm(serv_req_id_lda)
head(PerTopicPerNgram[[55]]$PerTopicPerNgram, 500)

```
  
    
    Creating a dataframe with just the top ten terms (ranked by beta) in each topic.
```{r}

top_terms <- list()

for(i in 1:length(PerTopicPerNgram)
    ) {
  x <- PerTopicPerNgram[[i]]$PerTopicPerNgram %>% 
    group_by(topic) %>% 
    top_n(10,
          beta
          ) %>% 
    ungroup() %>% 
    arrange(topic,
            -beta
            )
  
  top_terms[[i]] <- list(ngram = PerTopicPerNgram[[i]]$ngram,
                         topic = PerTopicPerNgram[[i]]$topic,
                         top_terms = x
                         )
  }

rm(i, x)


top_terms[[55]]$top_terms
View(top_terms[[55]]$top_terms)

```
  
    
    Now we can plot the top 10 n-grams in each topic to visually inspect if the topic classifications "make sense" based on the n-gram text.  
    
    Here, we're just creating and saving the plots themselves.
```{r}

TopNgrams_ByTopic_BarGraphs <-
  top_terms %>%
  # to_graph %>%
  map(function(x) 
    x$top_terms %>%
      mutate(term = reorder(term,
                            beta
                            ),
             topic = paste0("Topic ",
                            str_pad(as.character(topic),
                                    width = 2,
                                    side = "left",
                                    pad = "0"
                                    )
                            )
             ) %>% 
      ggplot(aes(x = term,
                 y = beta,
                 fill = factor(topic)
                 )
             ) +
      geom_col(show.legend = FALSE) +
      facet_wrap(~ topic,
                 scales = "free",
                 ncol = 2
                 ) +
      ggplot_theme_basic +
      theme(plot.title = element_text(size = 11),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 9)
            ) +
      labs(title = "Most Common Terms Per Topic",
           subtitle = paste0("(",
                             str_pad(x$ngram,
                                     width = 2,
                                     side = "left",
                                     pad = "0"
                                     ),
                             "gram",
                             str_pad(x$topic,
                                     width = 2,
                                     side = "left",
                                     pad = "0"
                             ),
                             "topic)"
                             ),
           x = paste0(str_pad(x$ngram,
                              width = 2,
                              side = "left",
                              pad = "0"
                              ),
                      "gram"
                      ),
           y = paste0("probability of the ",
                      str_pad(x$ngram,
                              width = 2,
                              side = "left",
                              pad = "0"
                              ),
                      "gram in the topic"
                      )
           ) +
      coord_flip()
    )

# TopNgrams_ByTopic_BarGraphs
TopNgrams_ByTopic_BarGraphs[[24]] # ngram = 3 & topics = 3
TopNgrams_ByTopic_BarGraphs[[25]] # ngram = 3 & topics = 4
TopNgrams_ByTopic_BarGraphs[[35]] # ngram = 4 & topics = 3
TopNgrams_ByTopic_BarGraphs[[36]] # ngram = 4 & topics = 4
TopNgrams_ByTopic_BarGraphs[[46]] # ngram = 5 & topics = 3
TopNgrams_ByTopic_BarGraphs[[47]] # ngram = 5 & topics = 4


TopNgrams_ByTopic_BarGraphs %>% 
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  substr(x$labels$subtitle,
                         2,
                         (nchar(x$labels$subtitle) - 1)
                  ),
                  "_Top10Terms_facet.png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 6,
           units = "cm"
           )
    )

```
  
    
    Creating a dataframe with `gamma` - the per-document-per-topic probability (i.e., the probability that each document (serv_req_id) is in each topic).  
    
    I chose to do this for six different combinations of ngrams and topics (ngram = 3 & topic = 3, 3 & 4, 4 & 3, 4 & 4, 5 & 3, 5 & 4). This was chosen in part becasue after looking at the graphs produced above, these models seemed (by visual inspection) to perform better. It was also done in part becasue a portion of the analyses below requries defining the topics as `unknown`, `no_rats_found`, or `rats_found` by visual inspection of the graphs produced above. Six also seemed to be a good medium between too few and too many visual inspections to do.
```{r}

rm(TopNgrams_ByTopic_BarGraphs)


top_terms[[55]] #lda model with ngram = 5 & topics = 12
top_terms[[2]] #lda model with ngram = 1 & topics = 3
top_terms[[3]] #lda model with ngram = 1 & topics = 4

top_terms[[24]] #lda model with ngram = 3 & topics = 3
top_terms[[25]] #lda model with ngram = 3 & topics = 4
top_terms[[35]] #lda model with ngram = 4 & topics = 3
top_terms[[36]] #lda model with ngram = 4 & topics = 4
top_terms[[46]] #lda model with ngram = 5 & topics = 3
top_terms[[47]] #lda model with ngram = 5 & topics = 4


ProbDocInTopic_ngram03_topic03 <-
  list(ngram = lda_list[[24]]$ngram,
       topic = lda_list[[24]]$topic,
       data = tidy(lda_list[[24]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
                                       topic %in% c(2) ~ "no_rats_found",
                                       topic %in% c(3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[24]]$ngram,
                model_topic = lda_list[[24]]$topic
                )
       )


ProbDocInTopic_ngram03_topic04 <-
  list(ngram = lda_list[[25]]$ngram,
       topic = lda_list[[25]]$topic,
       data = tidy(lda_list[[25]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
                                       topic %in% c(2) ~ "no_rats_found",
                                       topic %in% c(3, 4) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[25]]$ngram,
                model_topic = lda_list[[25]]$topic
                )
       )


ProbDocInTopic_ngram04_topic03 <-
  list(ngram = lda_list[[35]]$ngram,
       topic = lda_list[[35]]$topic,
       data = tidy(lda_list[[35]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(#topic %in% c() ~ "unknown",
                                       topic %in% c(1) ~ "no_rats_found",
                                       topic %in% c(2, 3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[35]]$ngram,
                model_topic = lda_list[[35]]$topic
                )
       )


ProbDocInTopic_ngram04_topic04 <-
  list(ngram = lda_list[[36]]$ngram,
       topic = lda_list[[36]]$topic,
       data = tidy(lda_list[[36]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
                                       topic %in% c(1) ~ "no_rats_found",
                                       topic %in% c(2, 3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[36]]$ngram,
                model_topic = lda_list[[36]]$topic
                )
       )


ProbDocInTopic_ngram05_topic03 <-
  list(ngram = lda_list[[46]]$ngram,
       topic = lda_list[[46]]$topic,
       data = tidy(lda_list[[46]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(2) ~ "unknown",
                                       topic %in% c(3) ~ "no_rats_found",
                                       topic %in% c(1) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[46]]$ngram,
                model_topic = lda_list[[46]]$topic
                )
       )


ProbDocInTopic_ngram05_topic04 <-
  list(ngram = lda_list[[47]]$ngram,
       topic = lda_list[[47]]$topic,
       data = tidy(lda_list[[47]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
                                       topic %in% c(3) ~ "no_rats_found",
                                       topic %in% c(1, 2) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[47]]$ngram,
                model_topic = lda_list[[47]]$topic
                )
       )

```
  
    
    Here, we put the six individual ProbDocInTopic models together, and add in some of the original information (e.g., the original `servicenotes`) for context.
```{r}

ProbDocInTopic_AllModels <-
  bind_rows(ProbDocInTopic_ngram03_topic03[[3]],
            ProbDocInTopic_ngram03_topic04[[3]],
            ProbDocInTopic_ngram04_topic03[[3]],
            ProbDocInTopic_ngram04_topic04[[3]],
            ProbDocInTopic_ngram05_topic03[[3]],
            ProbDocInTopic_ngram05_topic04[[3]]
            ) %>% 
  arrange(document,
          model_ngram,
          model_topic,
          gamma
          ) %>% 
  rename("servicerequestid" = "document") %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            )

rm(list = ls(pattern = "ProbDocInTopic_ngram"))

str(ProbDocInTopic_AllModels)
head(ProbDocInTopic_AllModels, 100)
View(head(ProbDocInTopic_AllModels, 1000))

```
  
    
    Remove the datafiles that are no longer needed.
```{r}

rm(list = ls(pattern = "_list"))
rm(PerTopicPerNgram, top_terms)

```
  
    
    Next, for each model (e.g., 3-gram 4-topic), we sum the probabilities given for each numeric topic, into the "rats topics" (e.g., `rats_found`) which were defined above via visual inspection of the graphs on the Top 10 ngrams in each numeric topic.  
    
    I also pull out the 5-gram 4-topic model because it appeared (visually) to be the most accurate individual model.
```{r}

ProbDocInTopic_ProbsSummed_ByModel <-
  ProbDocInTopic_AllModels %>% 
  group_by(servicerequestid,
           model_ngram,
           model_topic,
           topic_name
           ) %>% 
  summarise(prob_ = sum(gamma)
            ) %>% 
  ungroup() %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            )

str(ProbDocInTopic_ProbsSummed_ByModel)
head(ProbDocInTopic_ProbsSummed_ByModel, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel, 1000))



ProbDocInTopic_ProbsSummed_05gram04topic <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  filter(model_ngram == 5 &
           model_topic == 4)

str(ProbDocInTopic_ProbsSummed_05gram04topic)
head(ProbDocInTopic_ProbsSummed_05gram04topic, 100)
View(head(ProbDocInTopic_ProbsSummed_05gram04topic, 1000))

```
  
    
    Next, for each ngram-topic combination, I create histograms of the probabilities assigned to each topic.  This is done to help visualy determine if the topic assignments are clearly separating documents (serv_request_id values).  
    
    A log10 transformation of the probability is done to help more clearly see any differences.
```{r}

# str(ProbDocInTopic_ProbsSummed_ByModel)

ProbDocInTopic_ProbsSummed_ByModel_Details <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  mutate(model = paste0("0",
                        model_ngram,
                        "gram_",
                        "0",
                        model_topic,
                        "topic"
                        ),
         serviceorder_yr = year(serviceorderdate),
         model_and_yr = paste0(model,
                               "_",
                               as.character(serviceorder_yr)
                               )
         )

head(ProbDocInTopic_ProbsSummed_ByModel_Details, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel_Details, 1000))



TopicDistro_MainModels_Histogram_ByModel <-
  ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name)
      )



TopicDistro_MainModels_Histogram_ByModelYr <-
  ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model_and_yr) %>% 
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
        ggtitle(label = paste0(.x$model_and_yr,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name)
      )

```
  
    
    Saving the histograms.
```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}

TopicDistro_MainModels_Histogram_ByModel %>%
  map(~ ggsave(paste0(wd,
                      "/Viz/",
                      # "New_",
                      "TopicDistro_MainModels_ByModel_",
                      .x$labels$title[[1]],
                      ".png"
                      ),
               .x,
               scale = 4,
               width = 6,
               height = 4,
               units = "cm"
               )
      )



TopicDistro_MainModels_Histogram_ByModelYr %>% 
  map(~ ggsave(paste0(wd,
                      "/Viz/",
                      # "New_",
                      "TopicDistro_MainModels_ByModelYr_",
                      .x$labels$title[[1]],
                      ".png"
                      ),
               .x,
               scale = 4,
               width = 6,
               height = 4,
               units = "cm"
               )
      )

```
  
    
    Removing unneeded files.
```{r}

rm(list = ls(pattern = "TopicDistro_"))

```
  
    
    Comparing how the histograms look with a regular y-scale vs a log10 y-scale.
```{r}

ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() +
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name,
                   scales = "fixed"
                   )
      )


ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        # scale_y_log10() +
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "counts"
             ) +
        coord_cartesian(ylim = c(0, 5000)
                        ) +
        facet_wrap(~topic_name,
                   scales = "fixed"
                   )
      )

```
  
    
    Then, for each `servicerequestid` and `topic_name` we calculate the mean topic probability across all the models. NOTE: This step could be modified more as my instinct is that more weight should probably be given to the models with larger ngrams and topics (e.g., the 5-gram & 4-topic model). However, using larger values of n-grams will not be able to analyze those records that do not have at least `n` words. Meaning that smaller values of n-grams can analyze more documents, but possibly less accurately.
```{r}

ProbDocInTopic_MeanProb_BySrvcRqstId <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  group_by(servicerequestid,
           topic_name
           ) %>% 
  summarise(MeanProb = mean(prob_, na.rm = TRUE)
            ) %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            ) %>% 
  arrange(servicerequestid,
          desc(MeanProb)
          )

str(ProbDocInTopic_MeanProb_BySrvcRqstId)
tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 100)
View(head(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
View(tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))

```
  
    
    Here, we create a dataset suitable for graphing and analyzing, by simply selecting the highest topic probability (the `MeanProb` value) assigned to each topic. We also do this for the single 5-gram 4-topic model.
```{r}

# ProbDocInTopic_AllModels
# ProbDocInTopic_ProbsSummed_ByModel
# ProbDocInTopic_MeanProb_BySrvcRqstId
# 
# 
# ProbDocInTopic_AllModels %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_ProbsSummed_ByModel %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_MeanProb_BySrvcRqstId %>% select(servicerequestid) %>% distinct() %>% nrow


TopProb_BySrvcRqstId <-
  ProbDocInTopic_MeanProb_BySrvcRqstId %>% 
  mutate(serviceorder_yr = year(serviceorderdate),
         # serviceorder_yr2 = as.factor(serviceorder_yr),
         yr_group = paste0(as.character(serviceorder_yr),
                           "_",
                           as.character(topic_name)
                           ),
         model = "AllModels_MeanProb"
         ) %>% 
  rename("prob" = "MeanProb") %>% 
  group_by(servicerequestid) %>% 
  top_n(1,
        prob
        ) %>% 
  ungroup() %>% 
  arrange(prob)

str(TopProb_BySrvcRqstId)
TopProb_BySrvcRqstId
View(TopProb_BySrvcRqstId)



TopProb_BySrvcRqstId_05gram04topic <- 
  ProbDocInTopic_ProbsSummed_05gram04topic %>% 
  mutate(serviceorder_yr = year(serviceorderdate),
         # serviceorder_yr2 = as.factor(serviceorder_yr),
         yr_group = paste0(as.character(serviceorder_yr),
                           "_",
                           as.character(topic_name)
                           ),
         model = "5gram4topic"
         ) %>% 
  rename("prob" = "prob_") %>% 
  group_by(servicerequestid) %>% 
  top_n(1,
        prob
        ) %>% 
  ungroup() %>% 
  arrange(prob)

str(TopProb_BySrvcRqstId_05gram04topic)
TopProb_BySrvcRqstId_05gram04topic
View(TopProb_BySrvcRqstId_05gram04topic)
  
```
  
    
    Now, we can simply create the freqpoly plots and density plots for each year-topic combination to investigate how the topic assignment did over time, and then save them.
```{r}

# str(TopProb_BySrvcRqstId)
# str(TopProb_BySrvcRqstId_05gram04topic)


TopicDistro_AllModelsMean_Freqpoly <-
  TopProb_BySrvcRqstId %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_freqpoly(binwidth = 0.05,
                      alpha = 0.6
                      ) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Freqpoly_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )

TopicDistro_AllModelsMean_Density <-
  TopProb_BySrvcRqstId %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_density() +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Density_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )



TopicDistro_05gram04topic_Freqpoly <-
  TopProb_BySrvcRqstId_05gram04topic %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_freqpoly(binwidth = 0.05,
                      alpha = 0.6
                      ) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Freqpoly_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )

TopicDistro_05gram04topic_Density <-
  TopProb_BySrvcRqstId_05gram04topic %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_density() +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Density_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )



# str(TopicDistro_AllModelsMean_Freqpoly[[18]])
# str(TopicDistro_AllModelsMean_Density[[18]])
# str(TopicDistro_05gram04topic_Freqpoly[[18]])
# str(TopicDistro_05gram04topic_Density[[18]])


TopicDistro_AllModelsMean_Freqpoly[[18]]
TopicDistro_AllModelsMean_Density[[18]]
TopicDistro_05gram04topic_Freqpoly[[18]]
TopicDistro_05gram04topic_Density[[18]]

```
  
    
    Saving the visuals created above.
```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}

TopicDistro_AllModelsMean_Freqpoly %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )



TopicDistro_AllModelsMean_Density %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )



TopicDistro_05gram04topic_Freqpoly %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )



TopicDistro_05gram04topic_Density %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )

```
  
    
    Removing unneeded files.
```{r}

rm(list = ls(pattern = "TopicDistro_"))

```
  
    
    As another method to determine the "correct" topic assignment, here I simply count the number of times a topic assignment was given to each document (servicerequestid), and assign the "correct" topic to the topic with the most assignments. In the case of ties (e.g., all three topics, each assigned twice), I use the `MeanProb` calculated above in the `ProbDocInTopic_MeanProb_BySrvcRqstId` dataframe previously.
```{r}

str(ProbDocInTopic_ProbsSummed_ByModel)
View(ProbDocInTopic_ProbsSummed_ByModel)
# ProbDocInTopic_ProbsSummed_ByModel %>% select(model_ngram, model_topic) %>% distinct()

str(ProbDocInTopic_MeanProb_BySrvcRqstId)
View(ProbDocInTopic_MeanProb_BySrvcRqstId)


TopicAssigned_ByCounts_ByMeanProb <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  group_by(servicerequestid,
           model_ngram,
           model_topic
           ) %>% 
  top_n(1,
        prob_
        ) %>% 
  ungroup() %>% 
  count(servicerequestid,
        topic_name
        ) %>% 
  left_join(ProbDocInTopic_MeanProb_BySrvcRqstId,
            by = c("servicerequestid" = "servicerequestid",
                   "topic_name" = "topic_name"
                   )
            ) %>% 
  select(servicerequestid,
         topic_name,
         n,
         MeanProb
         ) %>% 
  group_by(servicerequestid) %>%
  arrange(servicerequestid,
          desc(n),
          desc(MeanProb)
          ) %>% 
  ungroup() %>% 
  group_by(servicerequestid) %>%
  mutate(RowNum = row_number()
         ) %>% 
  ungroup() %>% 
  filter(RowNum == 1) %>% 
  rename(times_topic_assigned = n) %>% 
  left_join(ServiceNotesCleaned,
            by = "servicerequestid"
            ) %>% 
  select(servicerequestid,
         topic_name,
         times_topic_assigned,
         MeanProb,
         servicenotes,
         servicenotes_nonums_nopunc,
         serviceorderdate,
         serviceorder_date,
         serviceorder_yr,
         serviceorder_yr_posix,
         serviceorder_mth,
         serviceorder_yrmth,
         serviceorder_yrmth_posix,
         serviceorder_day,
         serviceorder_wkday
         )

str(ServiceNotesCleaned)
str(TopicAssigned_ByCounts_ByMeanProb)
head(TopicAssigned_ByCounts_ByMeanProb, 1000)
View(TopicAssigned_ByCounts_ByMeanProb)

```
  
    
    So now, we can compare two different models, each being slight variations of six LDA models.  
    
    1) `TopProb_BySrvcRqstId` assigns the topic by taking the mean topic probability score across all six models.
    2) `TopicAssigned_ByCounts_ByMeanProb` assigns the topic by taking the most frequently assigned topic across all six models.
```{r}

str(TopicAssigned_ByCounts_ByMeanProb)
str(TopProb_BySrvcRqstId)
str(TopProb_BySrvcRqstId_05gram04topic)


dim(TopicAssigned_ByCounts_ByMeanProb)
dim(TopProb_BySrvcRqstId)
dim(TopProb_BySrvcRqstId_05gram04topic)


a <- TopProb_BySrvcRqstId %>% 
  select(servicerequestid,
         topic_name,
         prob
         ) %>% 
  rename(topic_name_meanprob = topic_name,
         prob_meanprob = prob
         )


b <- TopicAssigned_ByCounts_ByMeanProb %>% 
  rename(topic_name_topcounts = topic_name,
         times_topic_assigned_topcounts = times_topic_assigned,
         prob_topcounts = MeanProb
         )


ModelsCompare <- a %>% 
  left_join(b,
            by = "servicerequestid"
            )

rm(a, b)
str(ModelsCompare)
ModelsCompare
View(ModelsCompare)

```
  
    
    When comparing the two models, they coincide nearly 99% of the time...but the disagreements still raise questions.
```{r}

filter(ModelsCompare,
       topic_name_meanprob != topic_name_topcounts
       ) %>% 
  nrow() / nrow(ModelsCompare) * 100

filter(ModelsCompare,
       topic_name_meanprob == topic_name_topcounts
       ) %>% 
  nrow() / nrow(ModelsCompare) * 100


# View(filter(ModelsCompare,
#             topic_name_meanprob != topic_name_topcounts
#             )
#      )

View(filter(ModelsCompare,
            topic_name_meanprob != topic_name_topcounts
            ) %>% 
       select(servicerequestid,
              topic_name_meanprob,
              topic_name_topcounts,
              servicenotes,
              servicenotes_nonums_nopunc
              )
     )

```
  
    
    As the models assign topics almost identically, let's use the `topcounts` model to do some quick inspections about how often each of the topics were assigned, when, changes over time, etc.
```{r}


Counts_AllYears <- ModelsCompare %>% 
  mutate(topic_name = factor(topic_name_topcounts,
                             levels = c("unknown",
                                        "no_rats_found",
                                        "rats_found"
                                        )
                             )
         ) %>% 
group_by(topic_name) %>% 
  count() %>% 
  rename(counts = n)



Counts_AcrossYears <- ModelsCompare %>% 
  mutate(topic_name = factor(topic_name_topcounts,
                             levels = c("unknown",
                                        "no_rats_found",
                                        "rats_found"
                                        )
                             )
         ) %>% 
group_by(topic_name,
         serviceorder_yr
         ) %>% 
  count() %>% 
  rename(counts = n)



ggplot(data = Counts_AllYears,
       aes(x = topic_name,
           y = counts,
           fill = topic_name
           )
       ) +
  geom_col() +
  geom_text(aes(label = counts),
            nudge_y = -200,
            size = 3
            ) +
  # theme_minimal() +
  theme(legend.position = "none") +
  coord_flip()

 ggsave(paste0(wd,
               "/Viz/",
               # "New_",
               "Topics_CountModel_Counts_AllYears.png"
                  ),
        scale = 4,
        width = 6,
        height = 6,
        units = "cm"
        )


ggplot(data = Counts_AcrossYears,
       aes(x = topic_name,
           y = counts,
           fill = topic_name
           )
       ) +
  geom_col() +
  geom_text(aes(label = counts),
            nudge_y = 100,
            size = 2.5
            ) +
  scale_y_continuous(limits = c(0, 1800),
                     breaks = seq(0, 1800, 300)
                     ) +
  facet_wrap(~serviceorder_yr) +
  theme(legend.position = "none") +
  coord_flip()

 ggsave(paste0(wd,
               "/Viz/",
               # "New_",
               "Topics_CountModel_Counts_AcrossYears.png"
                  ),
        scale = 4,
        width = 8,
        height = 6,
        units = "cm"
        )

```



