Load the relevant libraries.
# rm(list = ls())
library("tidyverse") # data manipulation
library("magrittr") # data manipulation (pipeing data)
library("stringr") # string manipulation
library("lubridate") # date manipulation
library("tidytext") # text manipulation
library("topicmodels") # topic modeling
library("ggplot2") # viz
library("doParallel") # parallel processing
library("ldatuning") # estimating the proper number of topics
Session Info.
sessionInfo()
R version 3.3.3 (2017-03-06)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS 10.13.1
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 knitr_1.17 ldatuning_0.2.0 doParallel_1.0.11
[5] iterators_1.0.8 foreach_1.4.3 topicmodels_0.2-7 tidytext_0.1.5
[9] lubridate_1.7.1 magrittr_1.5 forcats_0.2.0 stringr_1.2.0
[13] dplyr_0.7.4 purrr_0.2.4 readr_1.1.1 tidyr_0.7.2
[17] tibble_1.3.4 ggplot2_2.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.13 lattice_0.20-35 rprojroot_1.2 digest_0.6.12
[5] gmp_0.5-13.1 assertthat_0.2.0 psych_1.7.8 slam_0.1-40
[9] R6_2.2.2 cellranger_1.1.0 plyr_1.8.4 backports_1.1.1
[13] stats4_3.3.3 evaluate_0.10.1 httr_1.3.1 rlang_0.1.4
[17] lazyeval_0.2.1 readxl_1.0.0 rstudioapi_0.7 Matrix_1.2-12
[21] rmarkdown_1.8 labeling_0.3 foreign_0.8-69 munsell_0.4.3
[25] broom_0.4.3 compiler_3.3.3 janeaustenr_0.1.5 modelr_0.1.1
[29] base64enc_0.1-3 pkgconfig_2.0.1 mnormt_1.5-5 htmltools_0.3.6
[33] tidyselect_0.2.3 codetools_0.2-15 crayon_1.3.4 SnowballC_0.5.1
[37] grid_3.3.3 nlme_3.1-131 jsonlite_1.5 gtable_0.2.0
[41] scales_0.5.0 tokenizers_0.1.4 cli_1.0.0 stringi_1.1.6
[45] Rmpfr_0.6-1 reshape2_1.4.2 NLP_0.1-11 xml2_1.1.1
[49] tools_3.3.3 glue_1.2.0 hms_0.3 rsconnect_0.8.5
[53] yaml_2.1.14 tm_0.7-2 colorspace_1.3-2 rvest_0.3.2
[57] bindr_0.1 haven_1.1.0 modeltools_0.2-21
Setup the root directory.
Setting wd as the working directory.
wd <- getwd()
wd
[1] "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon"
Get the raw data. Because of trouble maintaining a connection to Dropbox via R, I first downloaded the raw data from https://www.dropbox.com/sh/4j7q53lltasez3h/AACt3doRbsVDj8lBwX5YB1Rqa/years_combined/dc_311-2017-10-07.csv?dl=0 and saved the file locally. Note that this is the “new” data, updated on 2017-10-07.
Raw311Data <- read_csv(paste0(wd,
# "/Data_Raw/dc_311-2017-01-16.csv"
"/Data_Raw/dc_311-2017-10-07.csv"
),
progress = FALSE
)
Parsed with column specification:
cols(
.default = col_character(),
SERVICEORDERDATE = col_datetime(format = ""),
SERVICECALLCOUNT = col_integer(),
INSPECTIONDATE = col_datetime(format = ""),
RESOLUTIONDATE = col_datetime(format = ""),
SERVICEDUEDATE = col_datetime(format = ""),
ADDDATE = col_datetime(format = ""),
LASTMODIFIEDDATE = col_datetime(format = ""),
ZIPCODE = col_integer(),
MARADDRESSREPOSITORYID = col_integer(),
DCSTATADDRESSKEY = col_integer(),
DCSTATLOCATIONKEY = col_integer(),
WARD = col_integer(),
PSA = col_integer(),
NEIGHBORHOODCLUSTER = col_integer(),
LATITUDE = col_double(),
LONGITUDE = col_double()
)
See spec(...) for full column specifications.
# saving is done to avoid having to download all the data again
saveRDS(Raw311Data,
paste0(wd,
"/Data_Processed/",
"Raw311Data_NEW.Rds"
)
)
str(Raw311Data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 5339514 obs. of 40 variables:
$ SERVICEREQUESTID : chr "09-00008592" "09-00037102" "09-00031664" "09-00013930" ...
$ SERVICEPRIORITY : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ SERVICECODE : chr "TREEMAIN" "S0361" "S0406" "S0301" ...
$ SERVICECODEDESCRIPTION : chr "xxx_Tree Maintenance LOOK UP ONLY" "Sidewalk Repair" "Street Repair" "Pothole" ...
$ SERVICETYPECODE : chr "URBAFORR" "SISYINOD" "SISYINOD" "STRBRIMA" ...
$ SERVICETYPECODEDESCRIPTION: chr "Urban Forrestry" "SIOD" "SIOD" "Street & Bridge Maintenance" ...
$ SERVICEORDERDATE : POSIXct, format: "1996-07-26 12:28:00" "1998-11-17 14:12:00" ...
$ SERVICEORDERSTATUS : chr "CLOSED" "OVERDUE CLOSED" "OVERDUE CLOSED" "CLOSED" ...
$ SERVICECALLCOUNT : int 1 1 1 1 1 1 1 1 1 1 ...
$ AGENCYABBREVIATION : chr "DDOT" "DDOT" "DDOT" "DDOT" ...
$ INSPECTIONFLAG : chr "N" "N" "N" "N" ...
$ INSPECTIONDATE : POSIXct, format: "1999-07-07 19:29:00" "1999-12-17 14:11:00" ...
$ RESOLUTION : chr "Complete" "Complete" "Complete" "Complete" ...
$ RESOLUTIONDATE : POSIXct, format: "1999-07-07 19:29:00" "1999-12-17 14:11:00" ...
$ SERVICEDUEDATE : POSIXct, format: NA "1998-11-30 14:12:00" ...
$ SERVICENOTES : chr "A VERY LARGE TREE HAS BEEN DEAD FOR OVER SIX MONTHS.SEVERAL LARGE LIMBS FELL THURSDAY LAST WEEK AND SUNDAY THIS WEEK. THERE HAS"| __truncated__ "TEMP REPRS WERE DONE ABOUT ONE YEAR AGO. CITIZEN IS REQUESTING PERM REPRS" "reprd large area" "7 P/H REPRD" ...
$ PARENTSERVICEREQUESTID : chr NA NA NA NA ...
$ ADDDATE : POSIXct, format: "1999-07-07 04:00:00" "1999-12-17 05:00:00" ...
$ LASTMODIFIEDDATE : POSIXct, format: "2009-08-22 04:00:00" "2009-08-22 04:00:00" ...
$ SITEADDRESS : chr "1815 KEARNY STREET NE" "4553 DIX STREET NE" NA "311 V STREET NE" ...
$ LAT : chr NA NA NA NA ...
$ LONG : chr NA NA NA NA ...
$ ZIPCODE : int 20018 20019 NA 20002 20007 20015 20008 20002 NA 20018 ...
$ MARADDRESSREPOSITORYID : int 55530 19616 -1 40329 273430 277354 220194 1557 -1 51328 ...
$ DCSTATADDRESSKEY : int 33132 11265 0 26118 116566 119445 63260 2041 0 30682 ...
$ DCSTATLOCATIONKEY : int 33132 11265 0 26118 116566 119445 63260 2041 0 30682 ...
$ WARD : int 5 7 NA 5 2 3 3 6 NA 5 ...
$ ANC : chr "5A" "7D" "NONE" "5C" ...
$ SMD : chr "5A10" "7D05" "NONE" "5C05" ...
$ DISTRICT : chr "FIFTH" "SIXTH" NA "FIFTH" ...
$ PSA : int 504 602 NA 502 206 201 204 103 NA 503 ...
$ NEIGHBORHOODCLUSTER : int 22 30 NA 21 4 10 15 25 NA 24 ...
$ HOTSPOT2006NAME : chr "NONE" "NONE" "NONE" "NONE" ...
$ HOTSPOT2005NAME : chr "NONE" "NONE" "NONE" "NONE" ...
$ HOTSPOT2004NAME : chr "NONE" "NONE" "NONE" "NONE" ...
$ SERVICESOURCECODE : chr "PHONE" "PHONE" "PHONE" "PHONE" ...
$ LATITUDE : num 38.9 38.9 0 38.9 38.9 ...
$ LONGITUDE : num -77 -76.9 0 -77 -77.1 ...
$ INSPECTORNAME : chr NA NA NA NA ...
$ STATUSCODE : chr NA NA NA NA ...
- attr(*, "spec")=List of 2
..$ cols :List of 40
.. ..$ SERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEPRIORITY : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODEDESCRIPTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODEDESCRIPTION: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEORDERDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEORDERSTATUS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECALLCOUNT : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ AGENCYABBREVIATION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONFLAG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ RESOLUTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ RESOLUTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEDUEDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICENOTES : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PARENTSERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ADDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ LASTMODIFIEDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SITEADDRESS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LAT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LONG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ZIPCODE : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ MARADDRESSREPOSITORYID : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATADDRESSKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATLOCATIONKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ WARD : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ ANC : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SMD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DISTRICT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PSA : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ NEIGHBORHOODCLUSTER : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ HOTSPOT2006NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2005NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2004NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICESOURCECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LATITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ LONGITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ INSPECTORNAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ STATUSCODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
tail(Raw311Data, 500)
Un-comment the chunk below to load the saved raw data (to avoid having to download the raw data again).
# Raw311Data <- readRDS(paste0(wd,
# "/Data_Processed/",
# "Raw311Data_NEW.Rds"
# )
# )
#
# str(Raw311Data)
# tail(Raw311Data, 500)
# View(tail(Raw311Data, 1000))
Selecting those variables that may be useful to test breakdowns of topic modeling. For example, running a topic model separately for the different levels of servicecode.
SelectedVars <- select(Raw311Data,
SERVICEREQUESTID,
SERVICEPRIORITY,
SERVICECODE,
SERVICECODEDESCRIPTION,
SERVICETYPECODE,
SERVICETYPECODEDESCRIPTION,
SERVICEORDERDATE,
SERVICENOTES
)
names(SelectedVars) <- tolower(names(SelectedVars))
rm(Raw311Data)
str(SelectedVars)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 5339514 obs. of 8 variables:
$ servicerequestid : chr "09-00008592" "09-00037102" "09-00031664" "09-00013930" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "TREEMAIN" "S0361" "S0406" "S0301" ...
$ servicecodedescription : chr "xxx_Tree Maintenance LOOK UP ONLY" "Sidewalk Repair" "Street Repair" "Pothole" ...
$ servicetypecode : chr "URBAFORR" "SISYINOD" "SISYINOD" "STRBRIMA" ...
$ servicetypecodedescription: chr "Urban Forrestry" "SIOD" "SIOD" "Street & Bridge Maintenance" ...
$ serviceorderdate : POSIXct, format: "1996-07-26 12:28:00" "1998-11-17 14:12:00" ...
$ servicenotes : chr "A VERY LARGE TREE HAS BEEN DEAD FOR OVER SIX MONTHS.SEVERAL LARGE LIMBS FELL THURSDAY LAST WEEK AND SUNDAY THIS WEEK. THERE HAS"| __truncated__ "TEMP REPRS WERE DONE ABOUT ONE YEAR AGO. CITIZEN IS REQUESTING PERM REPRS" "reprd large area" "7 P/H REPRD" ...
- attr(*, "spec")=List of 2
..$ cols :List of 40
.. ..$ SERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEPRIORITY : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODEDESCRIPTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODEDESCRIPTION: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEORDERDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEORDERSTATUS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECALLCOUNT : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ AGENCYABBREVIATION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONFLAG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ RESOLUTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ RESOLUTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEDUEDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICENOTES : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PARENTSERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ADDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ LASTMODIFIEDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SITEADDRESS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LAT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LONG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ZIPCODE : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ MARADDRESSREPOSITORYID : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATADDRESSKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATLOCATIONKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ WARD : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ ANC : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SMD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DISTRICT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PSA : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ NEIGHBORHOODCLUSTER : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ HOTSPOT2006NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2005NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2004NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICESOURCECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LATITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ LONGITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ INSPECTORNAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ STATUSCODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
Quick visual inspection of filtering the data to only service calls with notes (i.e., removing NA values), and only those that are rat-related (servicecode == "S0311).
Removing NA values takes us from 5,339,514 rows to 3,640,359 rows.
Looking at only rat-related service calls takes us from 3,640,359 rows to 26,302 rows.
NoNAServiceNotes <- filter(SelectedVars,
!is.na(servicenotes)
)
# message("SelectedVars")
nrow(SelectedVars)
[1] 5339514
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
[1] 3640359
rm(SelectedVars)
str(NoNAServiceNotes)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3640359 obs. of 8 variables:
$ servicerequestid : chr "09-00008592" "09-00037102" "09-00031664" "09-00013930" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "TREEMAIN" "S0361" "S0406" "S0301" ...
$ servicecodedescription : chr "xxx_Tree Maintenance LOOK UP ONLY" "Sidewalk Repair" "Street Repair" "Pothole" ...
$ servicetypecode : chr "URBAFORR" "SISYINOD" "SISYINOD" "STRBRIMA" ...
$ servicetypecodedescription: chr "Urban Forrestry" "SIOD" "SIOD" "Street & Bridge Maintenance" ...
$ serviceorderdate : POSIXct, format: "1996-07-26 12:28:00" "1998-11-17 14:12:00" ...
$ servicenotes : chr "A VERY LARGE TREE HAS BEEN DEAD FOR OVER SIX MONTHS.SEVERAL LARGE LIMBS FELL THURSDAY LAST WEEK AND SUNDAY THIS WEEK. THERE HAS"| __truncated__ "TEMP REPRS WERE DONE ABOUT ONE YEAR AGO. CITIZEN IS REQUESTING PERM REPRS" "reprd large area" "7 P/H REPRD" ...
- attr(*, "spec")=List of 2
..$ cols :List of 40
.. ..$ SERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEPRIORITY : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECODEDESCRIPTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICETYPECODEDESCRIPTION: list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICEORDERDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEORDERSTATUS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICECALLCOUNT : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ AGENCYABBREVIATION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONFLAG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ INSPECTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ RESOLUTION : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ RESOLUTIONDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICEDUEDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SERVICENOTES : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PARENTSERVICEREQUESTID : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ADDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ LASTMODIFIEDDATE :List of 1
.. .. ..$ format: chr ""
.. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
.. ..$ SITEADDRESS : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LAT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LONG : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ ZIPCODE : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ MARADDRESSREPOSITORYID : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATADDRESSKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ DCSTATLOCATIONKEY : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ WARD : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ ANC : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SMD : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ DISTRICT : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ PSA : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ NEIGHBORHOODCLUSTER : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ HOTSPOT2006NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2005NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ HOTSPOT2004NAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ SERVICESOURCECODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ LATITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ LONGITUDE : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ INSPECTORNAME : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
.. ..$ STATUSCODE : list()
.. .. ..- attr(*, "class")= chr "collector_character" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
View(head(NoNAServiceNotes, 1000))
RatCalls <- filter(NoNAServiceNotes,
servicecode == "S0311"
)
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
[1] 3640359
# message("RatCalls")
nrow(RatCalls)
[1] 26302
rm(NoNAServiceNotes)
View(RatCalls)
Add in time-related variables.
RatCalls_Time <- RatCalls %>%
mutate(serviceorder_date = as_date(serviceorderdate),
serviceorder_yr = year(serviceorderdate),
serviceorder_yr_posix = floor_date(serviceorderdate, "year"),
serviceorder_mth = month(serviceorderdate, label = TRUE),
serviceorder_yrmth = as.character(serviceorder_date) %>%
substr(1, 7),
serviceorder_yrmth_posix = floor_date(serviceorderdate, "month"),
serviceorder_day = day(serviceorderdate),
serviceorder_wkday = wday(serviceorderdate, label = TRUE)
)
rm(RatCalls)
str(RatCalls_Time)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 16 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
tail(RatCalls_Time, 500)
View(tail(RatCalls_Time, 1000))
Next we need to clean up the text of the servicenotes variable - this will be done in multiple steps.
As the first step, we’ll remove common “stopwords” (e.g., is, the, and, etc.) as they won’t be very useful in finding topics in the servicenotes text. Although they are stopwords, we specifically do not remove the words “no” or “not” as they are often used to distinguish between “rats found” and “no rats found”, or between “did find” and “did not find”.
# View(stop_words %>%
# select(word) %>%
# distinct() %>%
# arrange(word)
# )
#
# View(filter(stop_words,
# word != "no" &
# word != "not"
# ) %>%
# select(word) %>%
# distinct() %>%
# arrange(word)
# )
NoStopWords_Unnest <-
RatCalls_Time %>%
select(servicerequestid,
servicenotes
) %>%
unnest_tokens(word,
servicenotes
) %>%
anti_join(filter(stop_words,
word != "no" &
word != "not" # we don't remove the words "no" or "not" as they are often used to distinguish between "rats found" and "no rats found", or "find" and "not find"
),
by = "word"
)
Servicenotes_NoStopWrds <- NoStopWords_Unnest %>%
nest(word) %>%
mutate(servicenotes_nostop = map(data,
unlist
),
servicenotes_nostop = map_chr(servicenotes_nostop,
paste,
collapse = " "
)
) %>%
select(-data)
Remove_StopWrds <- RatCalls_Time %>%
left_join(Servicenotes_NoStopWrds,
by = "servicerequestid"
)
dim(RatCalls_Time)
[1] 26302 16
dim(Remove_StopWrds)
[1] 26302 17
rm(NoStopWords_Unnest, Servicenotes_NoStopWrds)
head(Remove_StopWrds, 100)
View(head(Remove_StopWrds, 100))
Then, we’ll remove any numeric characters from ‘servicenotes’ to avoid distinctions not needed at this level (e.g., “baited 3 rat borrows” vs. “baited 6 rat burrows”). We’ll also remove punctuation.
ServiceNotesCleaned <- Remove_StopWrds %>%
mutate(servicenotes_nonums_nopunc = str_replace_all(servicenotes_nostop,
"[[:digit:]]",
""
) %>%
str_replace_all("[[:punct:]]",
""
)
) %>%
select(-servicenotes_nostop)
dim(RatCalls_Time)
[1] 26302 16
dim(Remove_StopWrds)
[1] 26302 17
dim(ServiceNotesCleaned)
[1] 26302 17
# View(select(ServiceNotesCleaned,
# servicerequestid,
# servicenotes,
# servicenotes_nonums_nopunc
# ) %>%
# filter(servicerequestid %in% nomatch$servicerequestid)
# )
rm(RatCalls_Time, Remove_StopWrds)
head(ServiceNotesCleaned, 100)
View(head(ServiceNotesCleaned, 100))
Now, we can inspect the frequencies of rat-related service requests.
summary(ServiceNotesCleaned)
servicerequestid servicepriority servicecode servicecodedescription
Length:26302 Length:26302 Length:26302 Length:26302
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
servicetypecode servicetypecodedescription serviceorderdate
Length:26302 Length:26302 Min. :1999-04-27 12:59:00
Class :character Class :character 1st Qu.:2010-07-03 16:34:11
Mode :character Mode :character Median :2012-11-20 07:33:25
Mean :2012-04-18 00:35:17
3rd Qu.:2016-01-03 22:48:12
Max. :2017-10-04 09:13:46
servicenotes serviceorder_date serviceorder_yr serviceorder_yr_posix
Length:26302 Min. :1999-04-27 Min. :1999 Min. :1999-01-01 00:00:00
Class :character 1st Qu.:2010-07-03 1st Qu.:2010 1st Qu.:2010-01-01 00:00:00
Mode :character Median :2012-11-20 Median :2012 Median :2012-01-01 00:00:00
Mean :2012-04-17 Mean :2012 Mean :2011-10-14 21:39:30
3rd Qu.:2016-01-03 3rd Qu.:2016 3rd Qu.:2016-01-01 00:00:00
Max. :2017-10-04 Max. :2017 Max. :2017-01-01 00:00:00
serviceorder_mth serviceorder_yrmth serviceorder_yrmth_posix serviceorder_day
Aug :3097 Length:26302 Min. :1999-04-01 00:00:00 Min. : 1.00
Jun :3064 Class :character 1st Qu.:2010-07-01 00:00:00 1st Qu.: 8.00
Jul :2932 Mode :character Median :2012-11-01 00:00:00 Median :16.00
May :2781 Mean :2012-04-02 16:36:58 Mean :15.74
Sep :2688 3rd Qu.:2016-01-01 00:00:00 3rd Qu.:23.00
Apr :2241 Max. :2017-10-01 00:00:00 Max. :31.00
(Other):9499
serviceorder_wkday servicenotes_nonums_nopunc
Sun:1085 Length:26302
Mon:5296 Class :character
Tue:5213 Mode :character
Wed:4979
Thu:4431
Fri:4067
Sat:1231
# summary(RatCalls_Time$serviceorderdate)
# library("psych")
# describe(RatCalls_Time$serviceorderdate)
ggplot_theme_basic <-
theme(panel.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
# axis.text.x = element_blank(),
axis.ticks = element_blank(),
axis.line = element_line(size = 1, colour = "black")
)
# ggplot(data = RatCalls_Time,
# aes(x = serviceorder_date)
# ) +
# geom_histogram() +
# ggplot_theme_basic
yr_counts <- ServiceNotesCleaned %>%
group_by(serviceorder_yr_posix) %>%
summarise(Cnt = n()
) %>%
arrange(serviceorder_yr_posix)
ggplot(data = yr_counts,
aes(x = serviceorder_yr_posix,
y = Cnt
)
) +
geom_col(fill = "light blue") +
geom_text(aes(label = Cnt),
nudge_y = 50,
size = 3
) +
labs(title = "Counts of non-NA ServiceNotes",
# subtitle = "by year",
x = "Year",
y = "Count"
) +
ggplot_theme_basic +
theme(axis.text.x = element_text(angle = 90)
) +
scale_x_datetime(date_breaks = "1 year")
yrmth_counts <- ServiceNotesCleaned %>%
group_by(serviceorder_yrmth_posix) %>%
summarise(Cnt = n()
) %>%
arrange(serviceorder_yrmth_posix)
ggplot(data = yrmth_counts,
aes(x = serviceorder_yrmth_posix,
y = Cnt
)
) +
geom_col(fill = "light blue") +
labs(title = "Counts of non-NA ServiceNotes",
x = "Year-Month",
y = "Count"
) +
ggplot_theme_basic +
theme(axis.text.x = element_text(angle = 90)
) +
coord_cartesian(xlim = c(as.POSIXct("1998-12-01"),
as.POSIXct("2017-12-01")
),
expand = TRUE
) +
scale_x_datetime(date_breaks = "6 months")
Based on the frequencies of when we actually have ‘servicenotes’ data, let’s try limiting the dataset to service calls from 2010 or later. This reduces the dataset further, from 26,302 rows to 21,201 rows.
rm(yr_counts, yrmth_counts)
ServiceNotesCleanedAfter2010 <- ServiceNotesCleaned %>%
filter(serviceorderdate >= as_date("2010-01-01")
)
nrow(ServiceNotesCleaned)
[1] 26302
nrow(ServiceNotesCleanedAfter2010)
[1] 21201
summary(ServiceNotesCleanedAfter2010)
servicerequestid servicepriority servicecode servicecodedescription
Length:21201 Length:21201 Length:21201 Length:21201
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
servicetypecode servicetypecodedescription serviceorderdate
Length:21201 Length:21201 Min. :2010-01-02 07:51:20
Class :character Class :character 1st Qu.:2011-10-30 21:47:39
Mode :character Mode :character Median :2013-10-10 08:28:58
Mean :2014-01-19 10:19:50
3rd Qu.:2016-06-21 13:23:25
Max. :2017-10-04 09:13:46
servicenotes serviceorder_date serviceorder_yr serviceorder_yr_posix
Length:21201 Min. :2010-01-02 Min. :2010 Min. :2010-01-01 00:00:00
Class :character 1st Qu.:2011-10-30 1st Qu.:2011 1st Qu.:2011-01-01 00:00:00
Mode :character Median :2013-10-10 Median :2013 Median :2013-01-01 00:00:00
Mean :2014-01-18 Mean :2014 Mean :2013-07-18 07:33:55
3rd Qu.:2016-06-21 3rd Qu.:2016 3rd Qu.:2016-01-01 00:00:00
Max. :2017-10-04 Max. :2017 Max. :2017-01-01 00:00:00
serviceorder_mth serviceorder_yrmth serviceorder_yrmth_posix serviceorder_day
Jun :2575 Length:21201 Min. :2010-01-01 00:00:00 Min. : 1.00
Aug :2538 Class :character 1st Qu.:2011-10-01 00:00:00 1st Qu.: 8.00
Jul :2395 Mode :character Median :2013-10-01 00:00:00 Median :16.00
May :2218 Mean :2014-01-04 01:50:50 Mean :15.77
Sep :2191 3rd Qu.:2016-06-01 00:00:00 3rd Qu.:23.00
Apr :1852 Max. :2017-10-01 00:00:00 Max. :31.00
(Other):7432
serviceorder_wkday servicenotes_nonums_nopunc
Sun:1005 Length:21201
Mon:4241 Class :character
Tue:4133 Mode :character
Wed:3936
Thu:3526
Fri:3266
Sat:1094
With the newer dataset (from 2017-10-07), it looks like some text related to general descriptions (inclding the street address), and related to image attachments, was added to the servicenotes field. So here, we inspect that a bit and then do some cleanup.
str(ServiceNotesCleaned)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 17 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
View(ServiceNotesCleaned %>%
filter(str_detect(servicenotes_nonums_nopunc,
"washington dc"
)
) %>%
select(servicerequestid,
servicenotes,
servicenotes_nonums_nopunc
)
)
View(ServiceNotesCleaned %>%
filter(str_detect(servicenotes_nonums_nopunc,
"seeclickfixcom"
)
) %>%
select(servicerequestid,
servicenotes,
servicenotes_nonums_nopunc
)
)
fix_list <- c("\\bwashington\\b" = "",
"\\bdc\\b" = "",
"\\busa\\b" = "",
"\\bnorthwest\\b" = "",
"\\bnortheast\\b" = "",
"\\bsouthwest\\b" = "",
"\\bsoutheast\\b" = "",
"\\bnw\\b" = "",
"\\bne\\b" = "",
"\\bsw\\b" = "",
"\\bse\\b" = "",
"\\buser\\sentered\\saddress\\b" = "",
"\\bissue\\simage\\sview\\b" = "",
"\\bdetails\\svisit\\shttp\\b" = "",
"\\bseeclickfixcom\\sissues\\b" = "",
"\\s{2,}" = " "
)
ServiceNotesCleaned2 <- ServiceNotesCleaned %>%
mutate(servicenotes_cleaned = str_replace_all(servicenotes_nonums_nopunc,
fix_list
)
)
saveRDS(ServiceNotesCleaned2,
paste0(wd,
"/Data_Processed/",
"ServiceNotesCleaned2.Rds"
)
)
rm(fix_list)
View(ServiceNotesCleaned2 %>%
filter(servicerequestid == "11-00257293" |
servicerequestid == "11-00350959"
) %>%
select(servicenotes,
servicenotes_nonums_nopunc,
servicenotes_cleaned
)
)
Now, let’s transform the servicenotes field into one row per n-gram. Because we don’t know what level of ‘n’ to use, we’ll cycle through the possibilities from n = 1 to n = 5.
ngram_list <- 1:5
Rat_Ngram_list <- list()
Rat_Ngram_list <- lapply(ngram_list,
function(i) {
# x <- paste0("0", i, "_gram")
# ServiceNotesCleaned %>%
ServiceNotesCleaned2 %>%
unnest_tokens(n_gram,
# servicenotes_nonums_nopunc,
servicenotes_cleaned,
token = "ngrams",
n = i
)
}
)
# rm(ngram_list)
Rat_Ngram_list
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
# str(Rat_Ngram_list[[1]])
Counting the 5-grams in each servicerequestid.
word_counts_list <- list()
word_counts_list <- lapply(ngram_list,
function(i) {
Rat_Ngram_list[[i]] %>%
count(servicerequestid,
n_gram,
sort = TRUE
)
}
)
word_counts_list
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
NA
Transforming the dataframe into a document term matrix - i.e., documents (servicerequestids) are the rows and n-grams are the columns.
dtm_list <- list()
dtm_list <- lapply(ngram_list,
function(i) {
word_counts_list[[i]] %>%
cast_dtm(document = servicerequestid,
term = n_gram,
value = n,
# weighting = tm::weightTfIdf,
# using term frequency inverse document frequency (TfIdf) weighting is another, possibly more accurate measure, but topicmodels::LDA (used below) only accepts document term matrices with term-frequency weighting
weighting = tm::weightTf
)
}
)
dtm_list
[[1]]
<<DocumentTermMatrix (documents: 26298, terms: 6202)>>
Non-/sparse entries: 233772/162866424
Sparsity : 100%
Maximal term length: 20
Weighting : term frequency (tf)
[[2]]
<<DocumentTermMatrix (documents: 25655, terms: 33514)>>
Non-/sparse entries: 213630/859588040
Sparsity : 100%
Maximal term length: 32
Weighting : term frequency (tf)
[[3]]
<<DocumentTermMatrix (documents: 24398, terms: 50616)>>
Non-/sparse entries: 188970/1234740198
Sparsity : 100%
Maximal term length: 41
Weighting : term frequency (tf)
[[4]]
<<DocumentTermMatrix (documents: 22746, terms: 54286)>>
Non-/sparse entries: 164828/1234624528
Sparsity : 100%
Maximal term length: 53
Weighting : term frequency (tf)
[[5]]
<<DocumentTermMatrix (documents: 20816, terms: 53065)>>
Non-/sparse entries: 142190/1104458850
Sparsity : 100%
Maximal term length: 61
Weighting : term frequency (tf)
To determine the “proper” number of topics, here I try using the ldatuning::FindTopicsNumber function. The code chunk is based on the vignette here: https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html.
This analyses was done separately for each n-gram level (n = 1:5), and the overall results were inconclusive - the “proper” number of topics fluctuated between the highest level tried (12 topics) and one of the lowest levels tried (2, 3, or 4 topics).
Note that even with parallel processing, this took about 20min to run on my laptop.
detectCores(logical = TRUE) - 1
[1] 3
tunes_list <- dtm_list %>%
map(~ FindTopicsNumber(.x,
topics = c(2:12),
metrics = c("Griffiths2004",
"CaoJuan2009",
"Arun2010",
"Deveaud2014"
),
method = "Gibbs",
control = list(seed = 123456789),
mc.cores = 3L,
verbose = TRUE
)
)
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
fit models... done.
calculate metrics:
Griffiths2004... done.
CaoJuan2009... done.
Arun2010... done.
Deveaud2014... done.
# str(tunes_list[[5]])
topic_plots <-
tunes_list %>%
map(~ FindTopicsNumber_plot(.x)
)
saveRDS(topic_plots,
paste0(wd,
"/Data_Processed/",
"topic_plots.Rds"
)
)
topic_plots
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
As an alternative, here I try to determine the “proper” number of topics using topicmodels::perplexity. Perplexity measures how well a probability model predicts a sample, and I use it here via 10-folder cross validation. For computational purposes, I’m only trying this for the 5-gram model.
This is based on the method used here:
http://ellisp.github.io/blog/2017/01/05/topic-model-cv
As with the ldatuning::FindTopicsNumber function used previously, topicmodels::perplexity is also inconclusive as there is no clear “elbow” in the perplexity plot.
Note that even with parallel processing, this took about 20min to run on my laptop.
full_data <- dtm_list[[5]]
n <- nrow(full_data)
seed <- 123456789
topic_guess <- 12
folds <- 10
burnin <- 1000
iter <-1000
keep <-50
#----------------10-fold cross-validation, different numbers of topics----------------
cluster <- makeCluster(detectCores(logical = TRUE) - 1
) # leave one CPU spare...
registerDoParallel(cluster)
clusterEvalQ(cluster, {
library(topicmodels)
})
[[1]]
[1] "topicmodels" "stats" "graphics" "grDevices" "utils" "datasets"
[7] "methods" "base"
[[2]]
[1] "topicmodels" "stats" "graphics" "grDevices" "utils" "datasets"
[7] "methods" "base"
[[3]]
[1] "topicmodels" "stats" "graphics" "grDevices" "utils" "datasets"
[7] "methods" "base"
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2:topic_guess) # candidates for how many topics
clusterExport(cluster,
c("full_data", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k")
)
# we parallelize by the different number of topics. A processor is allocated a value of k, and does the cross-validation serially. This is because it is assumed there are more candidate values of k than there are cross-validation folds, hence it will be more efficient to parallelise
system.time({
results <- foreach(j = 1:length(candidate_k),
.combine = rbind
) %dopar%{
k <- candidate_k[j]
results_1k <- matrix(0,
nrow = folds,
ncol = 2
)
colnames(results_1k) <- c("k", "perplexity")
for(i in 1:folds){
train_set <- full_data[splitfolds != i , ]
valid_set <- full_data[splitfolds == i, ]
fitted <- LDA(train_set,
k = k,
method = "Gibbs",
control = list(seed = seed,
verbose = 1,
burnin = burnin,
iter = iter,
keep = keep
)
)
results_1k[i, ] <- c(k, perplexity(fitted, newdata = valid_set)
)
}
return(results_1k)
}
})
user system elapsed
3.401 4.952 2181.552
stopCluster(cluster)
results_df <- as.data.frame(results)
saveRDS(results_df,
paste0(wd,
"/Data_Processed/",
"results_df_perplex_cv.Rds"
)
)
# ggplot(data = results_df,
# aes(x = k,
# y = perplexity)
# ) +
# geom_point() +
# geom_smooth(se = FALSE) +
# coord_cartesian(xlim = c(0, 12)
# ) +
# scale_x_continuous(breaks = seq(0, 12, 2)
# ) +
# ggplot_theme_basic +
# ggtitle(label = "10-fold cross-validation of topic modelling",
# subtitle = "(i.e., 10 different models fit for each potential number of topics)"
# ) +
# labs(x = "Potential Number of Topics",
# y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
# )
ggplot(data = results_df,
aes(x = k,
y = perplexity)
) +
geom_point() +
geom_smooth(se = TRUE) +
# coord_cartesian(xlim = c(0, 12),
# ylim = c(0, 10000)
# ) +
scale_x_continuous(limits = c(0, 12),
breaks = seq(0, 12, 2)
) +
scale_y_continuous(limits = c(0, 8000),
breaks = seq(0, 8000, 2000)
) +
# ggplot_theme_basic +
ggtitle(label = "10-fold cross-validation of topic modelling",
subtitle = "(i.e., 10 different models fit for each potential number of topics)"
) +
labs(x = "Potential Number of Topics",
y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
)
Remove the no-longer-needed files.
rm(cluster, full_data, results, results_df, topic_plots, tunes_list, burnin, candidate_k, folds, iter, keep, n, seed, splitfolds, topic_guess)
Here I use Latent Dirichlet allocation for topic modeling. As determining the “proper” number of topics was inconclusive, I’m cycling through every combination of ngrams (1:5) and topics (2:12).
Note that even with parallel processing, this took about 20min to run on my laptop.
topic_guess <- 2:12
lda_list <- list()
cluster <- makeCluster(detectCores(logical = TRUE) - 1
) # leave one CPU spare...
registerDoParallel(cluster)
for(i in ngram_list) {
for(j in topic_guess) {
x <- LDA(dtm_list[[i]],
k = j,
control = list(seed = 123456789,
verbose = 1
)
)
ifelse((i == min(ngram_list) &
j == min(topic_guess)
),
countx <- 1,
countx <- countx + 1
)
lda_list[[countx]] <- list(ngram = i,
topic = j,
lda_model = x
)
}
}
stopCluster(cluster)
rm(ngram_list, topic_guess, i, j, x, countx, cluster)
saveRDS(lda_list,
paste0(wd,
"/Data_Processed/",
"lda_list.Rds"
)
)
lda_list
Creating a dataframe with beta - the per-topic-per-ngram probability (i.e., the probability that each ngram is in each topic).
PerTopicPerNgram <- list()
for(i in 1:length(lda_list)
) {
x <- tidy(lda_list[[i]]$lda_model,
matrix = "beta"
) %>%
arrange(term,
desc(beta)
)
PerTopicPerNgram[[i]] <- list(ngram = lda_list[[i]]$ngram,
topic = lda_list[[i]]$topic,
PerTopicPerNgram = x
)
}
rm(i, x)
str(PerTopicPerNgram[[55]]$PerTopicPerNgram)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 636780 obs. of 3 variables:
$ topic: int 6 2 4 5 9 12 7 3 1 10 ...
$ term : chr "a cornes baited rat burrows" "a cornes baited rat burrows" "a cornes baited rat burrows" "a cornes baited rat burrows" ...
$ beta : num 1.37e-04 1.98e-61 3.99e-191 3.54e-191 1.55e-191 ...
# rm(serv_req_id_lda)
head(PerTopicPerNgram[[55]]$PerTopicPerNgram, 500)
Creating a dataframe with just the top ten terms (ranked by beta) in each topic.
top_terms <- list()
for(i in 1:length(PerTopicPerNgram)
) {
x <- PerTopicPerNgram[[i]]$PerTopicPerNgram %>%
group_by(topic) %>%
top_n(10,
beta
) %>%
ungroup() %>%
arrange(topic,
-beta
)
top_terms[[i]] <- list(ngram = PerTopicPerNgram[[i]]$ngram,
topic = PerTopicPerNgram[[i]]$topic,
top_terms = x
)
}
rm(i, x)
top_terms[[55]]$top_terms
View(top_terms[[55]]$top_terms)
Now we can plot the top 10 n-grams in each topic to visually inspect if the topic classifications “make sense” based on the n-gram text.
Here, we’re just creating and saving the plots themselves.
TopNgrams_ByTopic_BarGraphs <-
top_terms %>%
# to_graph %>%
map(function(x)
x$top_terms %>%
mutate(term = reorder(term,
beta
),
topic = paste0("Topic ",
str_pad(as.character(topic),
width = 2,
side = "left",
pad = "0"
)
)
) %>%
ggplot(aes(x = term,
y = beta,
fill = factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic,
scales = "free",
ncol = 2
) +
ggplot_theme_basic +
theme(plot.title = element_text(size = 11),
axis.title = element_text(size = 10),
axis.text = element_text(size = 9)
) +
labs(title = "Most Common Terms Per Topic",
subtitle = paste0("(",
str_pad(x$ngram,
width = 2,
side = "left",
pad = "0"
),
"gram",
str_pad(x$topic,
width = 2,
side = "left",
pad = "0"
),
"topic)"
),
x = paste0(str_pad(x$ngram,
width = 2,
side = "left",
pad = "0"
),
"gram"
),
y = paste0("probability of the ",
str_pad(x$ngram,
width = 2,
side = "left",
pad = "0"
),
"gram in the topic"
)
) +
coord_flip()
)
# TopNgrams_ByTopic_BarGraphs
TopNgrams_ByTopic_BarGraphs[[24]] # ngram = 3 & topics = 3
TopNgrams_ByTopic_BarGraphs[[25]] # ngram = 3 & topics = 4
TopNgrams_ByTopic_BarGraphs[[35]] # ngram = 4 & topics = 3
TopNgrams_ByTopic_BarGraphs[[36]] # ngram = 4 & topics = 4
TopNgrams_ByTopic_BarGraphs[[46]] # ngram = 5 & topics = 3
TopNgrams_ByTopic_BarGraphs[[47]] # ngram = 5 & topics = 4
TopNgrams_ByTopic_BarGraphs %>%
map(function(x)
ggsave(paste0(wd,
"/Viz/",
"New_",
substr(x$labels$subtitle,
2,
(nchar(x$labels$subtitle) - 1)
),
"_Top10Terms_facet.png"
),
x,
scale = 4,
width = 6,
height = 6,
units = "cm"
)
)
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
[[31]]
NULL
[[32]]
NULL
[[33]]
NULL
[[34]]
NULL
[[35]]
NULL
[[36]]
NULL
[[37]]
NULL
[[38]]
NULL
[[39]]
NULL
[[40]]
NULL
[[41]]
NULL
[[42]]
NULL
[[43]]
NULL
[[44]]
NULL
[[45]]
NULL
[[46]]
NULL
[[47]]
NULL
[[48]]
NULL
[[49]]
NULL
[[50]]
NULL
[[51]]
NULL
[[52]]
NULL
[[53]]
NULL
[[54]]
NULL
[[55]]
NULL
Examples of “new” data (from 2017-10-07) adding in terms that cause issues with LDA analyses - interestingly, the issue of these “additional terms” is reduced at 5-gram and beyond.
str(ServiceNotesCleaned)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 17 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
View(ServiceNotesCleaned %>%
filter(str_detect(servicenotes_nonums_nopunc,
"washington dc"
)
)
)
View(ServiceNotesCleaned %>%
filter(str_detect(servicenotes_nonums_nopunc,
"seeclickfixcom"
)
)
)
Creating a dataframe with gamma - the per-document-per-topic probability (i.e., the probability that each document (serv_req_id) is in each topic).
I chose to do this for six different combinations of ngrams and topics (ngram = 3 & topic = 3, 3 & 4, 4 & 3, 4 & 4, 5 & 3, 5 & 4). This was chosen in part becasue after looking at the graphs produced above, these models seemed (by visual inspection) to perform better. It was also done in part becasue a portion of the analyses below requries defining the topics as unknown, no_rats_found, or rats_found by visual inspection of the graphs produced above. Six also seemed to be a good medium between too few and too many visual inspections to do.
rm(TopNgrams_ByTopic_BarGraphs)
top_terms[[55]] #lda model with ngram = 5 & topics = 12
$ngram
[1] 5
$topic
[1] 12
$top_terms
top_terms[[2]] #lda model with ngram = 1 & topics = 3
$ngram
[1] 1
$topic
[1] 3
$top_terms
top_terms[[3]] #lda model with ngram = 1 & topics = 4
$ngram
[1] 1
$topic
[1] 4
$top_terms
top_terms[[24]] #lda model with ngram = 3 & topics = 3
$ngram
[1] 3
$topic
[1] 3
$top_terms
top_terms[[25]] #lda model with ngram = 3 & topics = 4
$ngram
[1] 3
$topic
[1] 4
$top_terms
top_terms[[35]] #lda model with ngram = 4 & topics = 3
$ngram
[1] 4
$topic
[1] 3
$top_terms
top_terms[[36]] #lda model with ngram = 4 & topics = 4
$ngram
[1] 4
$topic
[1] 4
$top_terms
top_terms[[46]] #lda model with ngram = 5 & topics = 3
$ngram
[1] 5
$topic
[1] 3
$top_terms
top_terms[[47]] #lda model with ngram = 5 & topics = 4
$ngram
[1] 5
$topic
[1] 4
$top_terms
ProbDocInTopic_ngram03_topic03 <-
list(ngram = lda_list[[24]]$ngram,
topic = lda_list[[24]]$topic,
data = tidy(lda_list[[24]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(#topic %in% c() ~ "unknown",
topic %in% c(2, 3) ~ "no_rats_found",
topic %in% c(1) ~ "rats_found"
),
model_ngram = lda_list[[24]]$ngram,
model_topic = lda_list[[24]]$topic
)
)
ProbDocInTopic_ngram03_topic04 <-
list(ngram = lda_list[[25]]$ngram,
topic = lda_list[[25]]$topic,
data = tidy(lda_list[[25]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(2, 3) ~ "unknown",
topic %in% c(4) ~ "no_rats_found",
topic %in% c(1) ~ "rats_found"
),
model_ngram = lda_list[[25]]$ngram,
model_topic = lda_list[[25]]$topic
)
)
ProbDocInTopic_ngram04_topic03 <-
list(ngram = lda_list[[35]]$ngram,
topic = lda_list[[35]]$topic,
data = tidy(lda_list[[35]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
topic %in% c(2) ~ "no_rats_found",
topic %in% c(3) ~ "rats_found"
),
model_ngram = lda_list[[35]]$ngram,
model_topic = lda_list[[35]]$topic
)
)
ProbDocInTopic_ngram04_topic04 <-
list(ngram = lda_list[[36]]$ngram,
topic = lda_list[[36]]$topic,
data = tidy(lda_list[[36]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(1, 4) ~ "unknown",
topic %in% c(2) ~ "no_rats_found",
topic %in% c(3) ~ "rats_found"
),
model_ngram = lda_list[[36]]$ngram,
model_topic = lda_list[[36]]$topic
)
)
ProbDocInTopic_ngram05_topic03 <-
list(ngram = lda_list[[46]]$ngram,
topic = lda_list[[46]]$topic,
data = tidy(lda_list[[46]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
topic %in% c(2) ~ "no_rats_found",
topic %in% c(3) ~ "rats_found"
),
model_ngram = lda_list[[46]]$ngram,
model_topic = lda_list[[46]]$topic
)
)
ProbDocInTopic_ngram05_topic04 <-
list(ngram = lda_list[[47]]$ngram,
topic = lda_list[[47]]$topic,
data = tidy(lda_list[[47]]$lda_model,
matrix = "gamma"
) %>%
arrange(document,
desc(gamma)
) %>%
mutate(topic_name = case_when(topic %in% c(1, 2, 4) ~ "unknown",
# topic %in% c() ~ "no_rats_found",
topic %in% c(3) ~ "rats_found"
),
model_ngram = lda_list[[47]]$ngram,
model_topic = lda_list[[47]]$topic
)
)
Here, we put the six individual ProbDocInTopic models together, and add in some of the original information (e.g., the original servicenotes) for context.
ProbDocInTopic_AllModels <-
bind_rows(ProbDocInTopic_ngram03_topic03[[3]],
ProbDocInTopic_ngram03_topic04[[3]],
ProbDocInTopic_ngram04_topic03[[3]],
ProbDocInTopic_ngram04_topic04[[3]],
ProbDocInTopic_ngram05_topic03[[3]],
ProbDocInTopic_ngram05_topic04[[3]]
) %>%
arrange(document,
model_ngram,
model_topic,
gamma
) %>%
rename("servicerequestid" = "document") %>%
left_join(select(ServiceNotesCleaned,
servicerequestid,
servicenotes,
serviceorderdate
),
by = c("servicerequestid" = "servicerequestid")
)
rm(list = ls(pattern = "ProbDocInTopic_ngram"))
str(ProbDocInTopic_AllModels)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 475720 obs. of 8 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
$ topic : int 2 3 1 1 2 3 4 1 3 2 ...
$ gamma : num 0.00864 0.00864 0.98272 0.00931 0.00931 ...
$ topic_name : chr "no_rats_found" "no_rats_found" "rats_found" "rats_found" ...
$ model_ngram : int 3 3 3 3 3 3 3 4 4 4 ...
$ model_topic : int 3 3 3 4 4 4 4 3 3 3 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_AllModels, 100)
View(head(ProbDocInTopic_AllModels, 1000))
Remove the datafiles that are no longer needed.
rm(list = ls(pattern = "_list"))
rm(PerTopicPerNgram, top_terms)
Next, for each model (e.g., 3-gram 4-topic), we sum the probabilities given for each numeric topic, into the “rats topics” (e.g., rats_found) which were defined above via visual inspection of the graphs on the Top 10 ngrams in each numeric topic.
I also pull out the 5-gram 4-topic model because it appeared (visually) to be the most accurate individual model.
ProbDocInTopic_ProbsSummed_ByModel <-
ProbDocInTopic_AllModels %>%
group_by(servicerequestid,
model_ngram,
model_topic,
topic_name
) %>%
summarise(prob_ = sum(gamma)
) %>%
ungroup() %>%
left_join(select(ServiceNotesCleaned,
servicerequestid,
servicenotes,
serviceorderdate
),
by = c("servicerequestid" = "servicerequestid")
)
str(ProbDocInTopic_ProbsSummed_ByModel)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 362546 obs. of 7 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
$ model_ngram : int 3 3 3 3 3 4 4 4 4 4 ...
$ model_topic : int 3 3 4 4 4 3 3 3 4 4 ...
$ topic_name : chr "no_rats_found" "rats_found" "no_rats_found" "rats_found" ...
$ prob_ : num 0.01728 0.98272 0.97207 0.00931 0.01862 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_ProbsSummed_ByModel, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel, 1000))
ProbDocInTopic_ProbsSummed_05gram04topic <-
ProbDocInTopic_ProbsSummed_ByModel %>%
filter(model_ngram == 5 &
model_topic == 4)
str(ProbDocInTopic_ProbsSummed_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 41632 obs. of 7 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001410" "09-00001410" ...
$ model_ngram : int 5 5 5 5 5 5 5 5 5 5 ...
$ model_topic : int 4 4 4 4 4 4 4 4 4 4 ...
$ topic_name : chr "rats_found" "unknown" "rats_found" "unknown" ...
$ prob_ : num 0.0224 0.9776 0.9795 0.0205 0.0154 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_ProbsSummed_05gram04topic, 100)
View(head(ProbDocInTopic_ProbsSummed_05gram04topic, 1000))
Next, for each ngram-topic combination, I create histograms of the probabilities assigned to each topic. This is done to help visualy determine if the topic assignments are clearly separating documents (serv_request_id values).
A log10 transformation of the probability is done to help more clearly see any differences.
# str(ProbDocInTopic_ProbsSummed_ByModel)
ProbDocInTopic_ProbsSummed_ByModel_Details <-
ProbDocInTopic_ProbsSummed_ByModel %>%
mutate(model = paste0("0",
model_ngram,
"gram_",
"0",
model_topic,
"topic"
),
serviceorder_yr = year(serviceorderdate),
model_and_yr = paste0(model,
"_",
as.character(serviceorder_yr)
)
)
head(ProbDocInTopic_ProbsSummed_ByModel_Details, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel_Details, 1000))
TopicDistro_MainModels_Histogram_ByModel <-
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
ggtitle(label = paste0(.x$model,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "log10 of counts"
) +
facet_wrap(~topic_name)
)
TopicDistro_MainModels_Histogram_ByModelYr <-
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model_and_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
ggtitle(label = paste0(.x$model_and_yr,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "log10 of counts"
) +
facet_wrap(~topic_name)
)
Saving the histograms.
Removing no-longer-needed files.
rm(list = ls(pattern = "TopicDistro_"))
Comparing how the histograms look with a regular y-scale vs a log10 y-scale.
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
scale_y_log10() +
ggtitle(label = paste0(.x$model,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "log10 of counts"
) +
facet_wrap(~topic_name,
scales = "fixed"
)
)
$`03gram_03topic`
$`03gram_04topic`
$`04gram_03topic`
$`04gram_04topic`
$`05gram_03topic`
$`05gram_04topic`
ProbDocInTopic_ProbsSummed_ByModel_Details %>%
split(.$model) %>%
map(~ ggplot(data = .x,
aes(x = prob_,
fill = topic_name
)
) +
geom_histogram(binwidth = 0.05) +
scale_x_continuous(limits = c(0, 1)
) +
# scale_y_log10() +
ggtitle(label = paste0(.x$model,
"_Histogram"
)
) +
theme(legend.position = "none") +
labs(x = "Prob of ServiceRequestId in the Topic",
y = "counts"
) +
coord_cartesian(ylim = c(0, 5000)
) +
facet_wrap(~topic_name,
scales = "fixed"
)
)
$`03gram_03topic`
$`03gram_04topic`
$`04gram_03topic`
$`04gram_04topic`
$`05gram_03topic`
$`05gram_04topic`
Then, for each servicerequestid and topic_name we calculate the mean topic probability across all the models. NOTE: This step could be modified more as my instinct is that more weight should probably be given to the models with larger ngrams and topics (e.g., the 5-gram & 4-topic model). However, using larger values of n-grams will not be able to analyze those records that do not have at least n words. Meaning that smaller values of n-grams can analyze more documents, but possibly less accurately.
ProbDocInTopic_MeanProb_BySrvcRqstId <-
ProbDocInTopic_ProbsSummed_ByModel %>%
group_by(servicerequestid,
topic_name
) %>%
summarise(MeanProb = mean(prob_, na.rm = TRUE)
) %>%
left_join(select(ServiceNotesCleaned,
servicerequestid,
servicenotes,
serviceorderdate
),
by = c("servicerequestid" = "servicerequestid")
) %>%
arrange(servicerequestid,
desc(MeanProb)
)
str(ProbDocInTopic_MeanProb_BySrvcRqstId)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 73194 obs. of 5 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001323" ...
$ topic_name : chr "unknown" "no_rats_found" "rats_found" "rats_found" ...
$ MeanProb : num 0.587 0.399 0.178 0.918 0.067 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
- attr(*, "vars")= chr "servicerequestid"
- attr(*, "indices")=List of 24398
..$ : int 0 1 2
..$ : int 3 4 5
..$ : int 6 7 8
..$ : int 9 10 11
..$ : int 12 13 14
..$ : int 15 16 17
..$ : int 18 19 20
..$ : int 21 22 23
..$ : int 24 25 26
..$ : int 27 28 29
..$ : int 30 31 32
..$ : int 33 34 35
..$ : int 36 37 38
..$ : int 39 40 41
..$ : int 42 43 44
..$ : int 45 46 47
..$ : int 48 49 50
..$ : int 51 52 53
..$ : int 54 55 56
..$ : int 57 58 59
..$ : int 60 61 62
..$ : int 63 64 65
..$ : int 66 67 68
..$ : int 69 70 71
..$ : int 72 73 74
..$ : int 75 76 77
..$ : int 78 79 80
..$ : int 81 82 83
..$ : int 84 85 86
..$ : int 87 88 89
..$ : int 90 91 92
..$ : int 93 94 95
..$ : int 96 97 98
..$ : int 99 100 101
..$ : int 102 103 104
..$ : int 105 106 107
..$ : int 108 109 110
..$ : int 111 112 113
..$ : int 114 115 116
..$ : int 117 118 119
..$ : int 120 121 122
..$ : int 123 124 125
..$ : int 126 127 128
..$ : int 129 130 131
..$ : int 132 133 134
..$ : int 135 136 137
..$ : int 138 139 140
..$ : int 141 142 143
..$ : int 144 145 146
..$ : int 147 148 149
..$ : int 150 151 152
..$ : int 153 154 155
..$ : int 156 157 158
..$ : int 159 160 161
..$ : int 162 163 164
..$ : int 165 166 167
..$ : int 168 169 170
..$ : int 171 172 173
..$ : int 174 175 176
..$ : int 177 178 179
..$ : int 180 181 182
..$ : int 183 184 185
..$ : int 186 187 188
..$ : int 189 190 191
..$ : int 192 193 194
..$ : int 195 196 197
..$ : int 198 199 200
..$ : int 201 202 203
..$ : int 204 205 206
..$ : int 207 208 209
..$ : int 210 211 212
..$ : int 213 214 215
..$ : int 216 217 218
..$ : int 219 220 221
..$ : int 222 223 224
..$ : int 225 226 227
..$ : int 228 229 230
..$ : int 231 232 233
..$ : int 234 235 236
..$ : int 237 238 239
..$ : int 240 241 242
..$ : int 243 244 245
..$ : int 246 247 248
..$ : int 249 250 251
..$ : int 252 253 254
..$ : int 255 256 257
..$ : int 258 259 260
..$ : int 261 262 263
..$ : int 264 265 266
..$ : int 267 268 269
..$ : int 270 271 272
..$ : int 273 274 275
..$ : int 276 277 278
..$ : int 279 280 281
..$ : int 282 283 284
..$ : int 285 286 287
..$ : int 288 289 290
..$ : int 291 292 293
..$ : int 294 295 296
.. [list output truncated]
- attr(*, "group_sizes")= int 3 3 3 3 3 3 3 3 3 3 ...
- attr(*, "biggest_group_size")= int 3
- attr(*, "labels")='data.frame': 24398 obs. of 1 variable:
..$ servicerequestid: chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
..- attr(*, "vars")= chr "servicerequestid"
tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 100)
View(head(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
View(tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
Here, we create a dataset suitable for graphing and analyzing, by simply selecting the highest topic probability (the MeanProb value) assigned to each topic. We also do this for the single 5-gram 4-topic model.
# ProbDocInTopic_AllModels
# ProbDocInTopic_ProbsSummed_ByModel
# ProbDocInTopic_MeanProb_BySrvcRqstId
#
#
# ProbDocInTopic_AllModels %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_ProbsSummed_ByModel %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_MeanProb_BySrvcRqstId %>% select(servicerequestid) %>% distinct() %>% nrow
TopProb_BySrvcRqstId <-
ProbDocInTopic_MeanProb_BySrvcRqstId %>%
mutate(serviceorder_yr = year(serviceorderdate),
# serviceorder_yr2 = as.factor(serviceorder_yr),
yr_group = paste0(as.character(serviceorder_yr),
"_",
as.character(topic_name)
),
model = "AllModels_MeanProb"
) %>%
rename("prob" = "MeanProb") %>%
group_by(servicerequestid) %>%
top_n(1,
prob
) %>%
ungroup() %>%
arrange(prob)
str(TopProb_BySrvcRqstId)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 24398 obs. of 8 variables:
$ servicerequestid: chr "13-00129433" "16-00668555" "16-00740993" "17-00170279" ...
$ topic_name : chr "no_rats_found" "unknown" "unknown" "unknown" ...
$ prob : num 0.376 0.376 0.376 0.376 0.376 ...
$ servicenotes : chr "there is not a locked gate and there are dogs" "On 8/31/16@12:00 pm R Herrington baited 6 rat burrows in the alley & rear yd. Treatment will continue until rodent activity cea"| __truncated__ "On 10/18/16@1:34 pm R Herrington baited 6 rat burrows in the alley and rear yd. Treatment will continue until rodent activity c"| __truncated__ "On 4/11/17@1:12 pm R Herrington baited 6 rat burrows in the alley & rear yd. Treatment will continue until rodent activity cea"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "2013-06-04 09:19:46" "2016-08-29 14:59:00" ...
$ serviceorder_yr : num 2013 2016 2016 2017 2017 ...
$ yr_group : chr "2013_no_rats_found" "2016_unknown" "2016_unknown" "2017_unknown" ...
$ model : chr "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" ...
TopProb_BySrvcRqstId
View(TopProb_BySrvcRqstId)
TopProb_BySrvcRqstId_05gram04topic <-
ProbDocInTopic_ProbsSummed_05gram04topic %>%
mutate(serviceorder_yr = year(serviceorderdate),
# serviceorder_yr2 = as.factor(serviceorder_yr),
yr_group = paste0(as.character(serviceorder_yr),
"_",
as.character(topic_name)
),
model = "5gram4topic"
) %>%
rename("prob" = "prob_") %>%
group_by(servicerequestid) %>%
top_n(1,
prob
) %>%
ungroup() %>%
arrange(prob)
str(TopProb_BySrvcRqstId_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 20816 obs. of 10 variables:
$ servicerequestid: chr "10-00168699" "10-00431559" "14-00185064" "14-00108459" ...
$ model_ngram : int 5 5 5 5 5 5 5 5 5 5 ...
$ model_topic : int 4 4 4 4 4 4 4 4 4 4 ...
$ topic_name : chr "rats_found" "rats_found" "unknown" "rats_found" ...
$ prob : num 0.501 0.501 0.501 0.501 0.502 ...
$ servicenotes : chr "On 6/4/10 Task force baited 5 rat burrows in the rear yd by shed and Ivy along fence line." "ON 2/23/11@ 9:05am T Taylor baited 6 rat burrows under front porch. First Strike/soft bait, EPA#7173-258, .0025%, 6oz, Gloves "| __truncated__ "On 7/1/14@9:20am R Herrington baited 1 rat burrow under front steps. Ditrac/powder/ EPA#12455-56/ 0.2% 3oz, B&G duster/ gloves "| __truncated__ "On 4/23/2014 @ 1:40 pm Mr. Cornes baited one rat burrow in the front. Ditrac /powder, EPA#12455-56, 0.2%, 1oz, B&G duster." ...
$ serviceorderdate: POSIXct, format: "2010-05-17 12:46:44" "2010-12-31 02:20:04" ...
$ serviceorder_yr : num 2010 2010 2014 2014 2011 ...
$ yr_group : chr "2010_rats_found" "2010_rats_found" "2014_unknown" "2014_rats_found" ...
$ model : chr "5gram4topic" "5gram4topic" "5gram4topic" "5gram4topic" ...
TopProb_BySrvcRqstId_05gram04topic
View(TopProb_BySrvcRqstId_05gram04topic)
Now, we can simply create the freqpoly plots and density plots for each year-topic combination to investigate how the topic assignment did over time, and then save them.
# str(TopProb_BySrvcRqstId)
# str(TopProb_BySrvcRqstId_05gram04topic)
TopicDistro_AllModelsMean_Freqpoly <-
TopProb_BySrvcRqstId %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_freqpoly(binwidth = 0.05,
alpha = 0.6
) +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Freqpoly_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
TopicDistro_AllModelsMean_Density <-
TopProb_BySrvcRqstId %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_density() +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Density_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
TopicDistro_05gram04topic_Freqpoly <-
TopProb_BySrvcRqstId_05gram04topic %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_freqpoly(binwidth = 0.05,
alpha = 0.6
) +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Freqpoly_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
TopicDistro_05gram04topic_Density <-
TopProb_BySrvcRqstId_05gram04topic %>%
split(.$serviceorder_yr) %>%
map(~ ggplot(data = .x,
aes(x = prob,
colour = topic_name
)
) +
geom_density() +
scale_x_continuous(limits = c(0, 1)
) +
ggtitle(label = paste0("TopicDistro_",
.x$model,
"_Density_",
as.character(.x$serviceorder_yr)
)
) +
labs(x = "Prob of ServiceRequestId in the Topic")
)
# str(TopicDistro_AllModelsMean_Freqpoly[[18]])
# str(TopicDistro_AllModelsMean_Density[[18]])
# str(TopicDistro_05gram04topic_Freqpoly[[18]])
# str(TopicDistro_05gram04topic_Density[[18]])
# TopicDistro_AllModelsMean_Freqpoly
# TopicDistro_AllModelsMean_Density
# TopicDistro_05gram04topic_Freqpoly
# TopicDistro_05gram04topic_Density
Saving the visuals created above.
Removing no-longer-needed files.
rm(list = ls(pattern = "TopicDistro_"))
As another method to determine the “correct” topic assignment, here I simply count the number of times a topic assignment was given to each document (servicerequestid), and assign the “correct” topic to the topic with the most assignments. In the case of ties (e.g., all three topics, each assigned twice), I use the MeanProb calculated above in the ProbDocInTopic_MeanProb_BySrvcRqstId dataframe previously.
str(ProbDocInTopic_ProbsSummed_ByModel)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 362546 obs. of 7 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
$ model_ngram : int 3 3 3 3 3 4 4 4 4 4 ...
$ model_topic : int 3 3 4 4 4 3 3 3 4 4 ...
$ topic_name : chr "no_rats_found" "rats_found" "no_rats_found" "rats_found" ...
$ prob_ : num 0.01728 0.98272 0.97207 0.00931 0.01862 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
View(ProbDocInTopic_ProbsSummed_ByModel)
# ProbDocInTopic_ProbsSummed_ByModel %>% select(model_ngram, model_topic) %>% distinct()
str(ProbDocInTopic_MeanProb_BySrvcRqstId)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 73194 obs. of 5 variables:
$ servicerequestid: chr "09-00001211" "09-00001211" "09-00001211" "09-00001323" ...
$ topic_name : chr "unknown" "no_rats_found" "rats_found" "rats_found" ...
$ MeanProb : num 0.587 0.399 0.178 0.918 0.067 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" ...
$ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
- attr(*, "vars")= chr "servicerequestid"
- attr(*, "indices")=List of 24398
..$ : int 0 1 2
..$ : int 3 4 5
..$ : int 6 7 8
..$ : int 9 10 11
..$ : int 12 13 14
..$ : int 15 16 17
..$ : int 18 19 20
..$ : int 21 22 23
..$ : int 24 25 26
..$ : int 27 28 29
..$ : int 30 31 32
..$ : int 33 34 35
..$ : int 36 37 38
..$ : int 39 40 41
..$ : int 42 43 44
..$ : int 45 46 47
..$ : int 48 49 50
..$ : int 51 52 53
..$ : int 54 55 56
..$ : int 57 58 59
..$ : int 60 61 62
..$ : int 63 64 65
..$ : int 66 67 68
..$ : int 69 70 71
..$ : int 72 73 74
..$ : int 75 76 77
..$ : int 78 79 80
..$ : int 81 82 83
..$ : int 84 85 86
..$ : int 87 88 89
..$ : int 90 91 92
..$ : int 93 94 95
..$ : int 96 97 98
..$ : int 99 100 101
..$ : int 102 103 104
..$ : int 105 106 107
..$ : int 108 109 110
..$ : int 111 112 113
..$ : int 114 115 116
..$ : int 117 118 119
..$ : int 120 121 122
..$ : int 123 124 125
..$ : int 126 127 128
..$ : int 129 130 131
..$ : int 132 133 134
..$ : int 135 136 137
..$ : int 138 139 140
..$ : int 141 142 143
..$ : int 144 145 146
..$ : int 147 148 149
..$ : int 150 151 152
..$ : int 153 154 155
..$ : int 156 157 158
..$ : int 159 160 161
..$ : int 162 163 164
..$ : int 165 166 167
..$ : int 168 169 170
..$ : int 171 172 173
..$ : int 174 175 176
..$ : int 177 178 179
..$ : int 180 181 182
..$ : int 183 184 185
..$ : int 186 187 188
..$ : int 189 190 191
..$ : int 192 193 194
..$ : int 195 196 197
..$ : int 198 199 200
..$ : int 201 202 203
..$ : int 204 205 206
..$ : int 207 208 209
..$ : int 210 211 212
..$ : int 213 214 215
..$ : int 216 217 218
..$ : int 219 220 221
..$ : int 222 223 224
..$ : int 225 226 227
..$ : int 228 229 230
..$ : int 231 232 233
..$ : int 234 235 236
..$ : int 237 238 239
..$ : int 240 241 242
..$ : int 243 244 245
..$ : int 246 247 248
..$ : int 249 250 251
..$ : int 252 253 254
..$ : int 255 256 257
..$ : int 258 259 260
..$ : int 261 262 263
..$ : int 264 265 266
..$ : int 267 268 269
..$ : int 270 271 272
..$ : int 273 274 275
..$ : int 276 277 278
..$ : int 279 280 281
..$ : int 282 283 284
..$ : int 285 286 287
..$ : int 288 289 290
..$ : int 291 292 293
..$ : int 294 295 296
.. [list output truncated]
- attr(*, "group_sizes")= int 3 3 3 3 3 3 3 3 3 3 ...
- attr(*, "biggest_group_size")= int 3
- attr(*, "labels")='data.frame': 24398 obs. of 1 variable:
..$ servicerequestid: chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
..- attr(*, "vars")= chr "servicerequestid"
View(ProbDocInTopic_MeanProb_BySrvcRqstId)
TopicAssigned_ByCounts_ByMeanProb <-
ProbDocInTopic_ProbsSummed_ByModel %>%
group_by(servicerequestid,
model_ngram,
model_topic
) %>%
top_n(1,
prob_
) %>%
ungroup() %>%
count(servicerequestid,
topic_name
) %>%
left_join(ProbDocInTopic_MeanProb_BySrvcRqstId,
by = c("servicerequestid" = "servicerequestid",
"topic_name" = "topic_name"
)
) %>%
select(servicerequestid,
topic_name,
n,
MeanProb
) %>%
group_by(servicerequestid) %>%
arrange(servicerequestid,
desc(n),
desc(MeanProb)
) %>%
ungroup() %>%
group_by(servicerequestid) %>%
mutate(RowNum = row_number()
) %>%
ungroup() %>%
filter(RowNum == 1) %>%
rename(times_topic_assigned = n) %>%
left_join(ServiceNotesCleaned,
by = "servicerequestid"
) %>%
select(servicerequestid,
topic_name,
times_topic_assigned,
MeanProb,
servicenotes,
servicenotes_nonums_nopunc,
serviceorderdate,
serviceorder_date,
serviceorder_yr,
serviceorder_yr_posix,
serviceorder_mth,
serviceorder_yrmth,
serviceorder_yrmth_posix,
serviceorder_day,
serviceorder_wkday
)
str(ServiceNotesCleaned)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 17 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
str(TopicAssigned_ByCounts_ByMeanProb)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 24398 obs. of 15 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ topic_name : chr "unknown" "rats_found" "no_rats_found" "unknown" ...
$ times_topic_assigned : int 3 2 3 4 3 3 3 4 2 2 ...
$ MeanProb : num 0.587 0.918 0.597 0.785 0.598 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
head(TopicAssigned_ByCounts_ByMeanProb, 1000)
View(TopicAssigned_ByCounts_ByMeanProb)
Remove no-longer-needed files.
rm(list = ls(pattern = "ProbDocInTopic_"))
The analyses above (particularly the plots) show that the n-grams in each topic are not “pure,” in the sense that n-grams manually interpreted as rats_found, no_rats_found, and unknown can sometime be found in the same topic.
So it might be better to simply use LDA to find the frequent terms, and then build a simple regex function to assign topics based on these. Because it appears that the language/text varies over time, let’s do LDA for each year.
Based on the above analyses, it also looks like the 5-gram 4-topic model works well/best. So I’ll just do LDA with those parameters.
Here, we transform the servicenotes field into one row per n-gram.
# str(ServiceNotesCleaned2)
Rat_5gram <- ServiceNotesCleaned2 %>%
split(.$serviceorder_yr) %>%
map(~ unnest_tokens(tbl = .x,
n_gram,
servicenotes_cleaned,
token = "ngrams",
n = 5
)
)
# str(Rat_5gram)
# length(Rat_5gram)
str(Rat_5gram[[19]])
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 31921 obs. of 18 variables:
$ servicerequestid : chr "17-00000634" "17-00000634" "17-00000634" "17-00000718" ...
$ servicepriority : chr "Standard" "Standard" "Standard" "Standard" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rodent Inspection and Treatment" "Rodent Inspection and Treatment" "Rodent Inspection and Treatment" "Rodent Inspection and Treatment" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription: chr "DOH- Department Of Health" "DOH- Department Of Health" "DOH- Department Of Health" "DOH- Department Of Health" ...
$ serviceorderdate : POSIXct, format: "2017-01-04 16:06:55" "2017-01-04 16:06:55" ...
$ servicenotes : chr "On 1/18/17, 1/31/17, 2/13/17 D Broomfield found gate locked and left service notice." "On 1/18/17, 1/31/17, 2/13/17 D Broomfield found gate locked and left service notice." "On 1/18/17, 1/31/17, 2/13/17 D Broomfield found gate locked and left service notice." "On 1/9/17 M Parker found no rat burrows or activity on property and public space; left service notice. Citizen called about ne"| __truncated__ ...
$ serviceorder_date : Date, format: "2017-01-04" "2017-01-04" ...
$ serviceorder_yr : num 2017 2017 2017 2017 2017 ...
$ serviceorder_yr_posix : POSIXct, format: "2017-01-01" "2017-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ serviceorder_yrmth : chr "2017-01" "2017-01" "2017-01" "2017-01" ...
$ serviceorder_yrmth_posix : POSIXct, format: "2017-01-01" "2017-01-01" ...
$ serviceorder_day : int 4 4 4 4 4 4 4 4 4 4 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 4 4 4 4 4 4 4 4 4 4 ...
$ servicenotes_nonums_nopunc: chr " broomfield found gate locked left service notice" " broomfield found gate locked left service notice" " broomfield found gate locked left service notice" " parker found no rat burrows activity property public space left service notice citizen called trash" ...
$ n_gram : chr "broomfield found gate locked left" "found gate locked left service" "gate locked left service notice" "parker found no rat burrows" ...
Counting the 5-grams in each servicerequestid.
word_counts_5gram <- Rat_5gram %>%
map(~ count(x = .x,
servicerequestid,
n_gram,
sort = TRUE
)
)
# str(word_counts_5gram)
# length(word_counts_5gram)
str(word_counts_5gram[[19]])
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 31921 obs. of 3 variables:
$ servicerequestid: chr "17-00000634" "17-00000634" "17-00000634" "17-00000718" ...
$ n_gram : chr "broomfield found gate locked left" "found gate locked left service" "gate locked left service notice" "activity property public space left" ...
$ n : int 1 1 1 1 1 1 1 1 1 1 ...
Transforming the dataframe into a document term matrix - i.e., documents (servicerequestids) are the rows and 5-grams are the columns.
dtm_5gram <- word_counts_5gram %>%
map(~ cast_dtm(data = .x,
document = servicerequestid,
term = n_gram,
value = n,
# weighting = tm::weightTfIdf,
# using term frequency inverse document frequency (TfIdf) weighting is another, possibly more accurate measure, but topicmodels::LDA (used below) only accepts document term matrices with term-frequency weighting
weighting = tm::weightTf
)
)
# str(dtm_5gram)
# length(dtm_5gram)
dtm_5gram[[19]]
<<DocumentTermMatrix (documents: 3179, terms: 3773)>>
Non-/sparse entries: 31921/11962446
Sparsity : 100%
Maximal term length: 51
Weighting : term frequency (tf)
Here I use Latent Dirichlet Allocation for topic modeling. As mentioned above, as I’ll merely be using the topics to inform a regex function, I will only create a 4-topic model.
lda_5gram4topic <- dtm_5gram %>%
map(~ LDA(x = .x,
k = 4,
control = list(seed = 123456789,
verbose = 0
)
)
)
# str(lda_5gram4topic)
# length(lda_5gram4topic)
lda_5gram4topic[[19]]
A LDA_VEM topic model with 4 topics.
Creating a dataframe with beta - the per-topic-per-ngram probability (i.e., the probability that each ngram is in each topic).
PerTopicPer5gram <- lda_5gram4topic %>%
map(~ tidy(.x,
matrix = "beta"
) %>%
arrange(term,
desc(beta)
)
)
# str(PerTopicPer5gram)
# length(PerTopicPer5gram)
PerTopicPer5gram[[19]]
Creating a dataframe with just the top ten terms (ranked by beta) in each topic.
top_terms_5gram <- PerTopicPer5gram %>%
map(~ group_by(.x,
topic
) %>%
top_n(10,
beta
) %>%
ungroup() %>%
arrange(topic,
-beta
)
)
# str(top_terms_5gram)
# length(top_terms_5gram)
top_terms_5gram[[1]]
Now we can plot the top 10 5-grams in each topic to visually inspect if the topic classifications “make sense” based on the 5-gram text.
Here, we’re just creating and saving the plots themselves.
year_list <- names(top_terms_5gram)
TopNgrams_ByTopic_5gram_BarGraphs <-
map2(.x = top_terms_5gram,
.y = year_list,
.f = ~ mutate(.x,
term = reorder(term,
beta
),
topic = paste0("Topic ",
str_pad(as.character(topic),
width = 2,
side = "left",
pad = "0"
)
)
) %>%
ggplot(aes(x = term,
y = beta,
fill = factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic,
scales = "free",
ncol = 2
) +
ggplot_theme_basic +
# theme(plot.title = element_text(size = 11),
# axis.title = element_text(size = 10),
# axis.text = element_text(size = 9)
# ) +
labs(title = "Most Common Terms Per Topic",
subtitle = .y,
x = "5-gram",
y = "probability of the 5-gram in the topic"
) +
coord_flip()
)
TopNgrams_ByTopic_5gram_BarGraphs[[19]] # plot for 2017
# str(TopNgrams_ByTopic_5gram_BarGraphs[[19]])
TopNgrams_ByTopic_5gram_BarGraphs %>%
map(~ ggsave(paste0(wd,
"/Viz/",
"New_b_",
.x$labels$subtitle,
"_",
str_replace_all(.x$labels$x,
"-",
""),
"4topic",
"_Top10Terms_facet.png"
),
.x,
# scale = 4,
width = 10,
height = 7,
)
)
$`1999`
NULL
$`2000`
NULL
$`2001`
NULL
$`2002`
NULL
$`2003`
NULL
$`2004`
NULL
$`2005`
NULL
$`2006`
NULL
$`2007`
NULL
$`2008`
NULL
$`2009`
NULL
$`2010`
NULL
$`2011`
NULL
$`2012`
NULL
$`2013`
NULL
$`2014`
NULL
$`2015`
NULL
$`2016`
NULL
$`2017`
NULL
To help inform our regex model, we can also investigate the number of times a 5-gram was used across all years. Then we can use these 5-grams and the bar plots created above, to build out the regex model.
# str(top_terms_5gram)
top_terms_5gram_all_years <- top_terms_5gram %>%
bind_rows() %>%
count(term) %>%
arrange(desc(n)
)
regex_rats_found <- "(a){0,1}ba(i){0,1}ted|blocks epa( ){0,1}|ditrac|( ){0,1}epa( ){0,1}|rat(s){0,1} burrows found|reveal rat burrows|rat burrows (n|r)ear property|soft bait"
regex_no_rats_found <- "no rat(s){0,1}|no rodent|no action|no (active ){0,1}burrow(s){0,1}|no(t){0,1} eviden(ce){0,1}(ts){0,1}|no sign(s){0,1} rat(s){0,1}|no sign(s){0,1}|no(t){0,1} find"
# View(
top_terms_5gram_all_years %>%
filter(!str_detect(term,
regex_rats_found
)
) %>%
filter(!str_detect(term,
regex_no_rats_found
)
)
# )
Remove no-longer-needed files.
rm(list = ls(pattern = "_5gram"))
rm(year_list)
Now we have the info to build out the regex model itself.
regex_model <- ServiceNotesCleaned2 %>%
select(servicerequestid,
servicenotes,
servicenotes_cleaned
) %>%
mutate(rats_found = str_detect(servicenotes_cleaned,
regex_rats_found
),
no_rats_found = str_detect(servicenotes_cleaned,
regex_no_rats_found
),
investigation_outcome = case_when(rats_found == TRUE &
no_rats_found == FALSE ~ "rats_found",
rats_found == FALSE &
no_rats_found == TRUE ~ "no_rats_found",
TRUE ~ "unknown"
)
)
# confirm "unknown" functions as desired
View(filter(regex_model,
servicerequestid == "09-00003482"
)
)
# confirm "rats_found" functions as desired
View(filter(regex_model,
servicerequestid == "17-00433923"
)
)
rm(regex_rats_found, regex_no_rats_found)
dim(regex_model)
[1] 26302 6
str(regex_model)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 6 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ servicenotes_cleaned : chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st rats " ...
$ rats_found : logi FALSE FALSE FALSE TRUE TRUE FALSE ...
$ no_rats_found : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ investigation_outcome: chr "unknown" "unknown" "unknown" "rats_found" ...
regex_model
regex_confirm <- regex_model %>%
filter(investigation_outcome == "rats_found") %>%
sample_n(5) %>%
bind_rows(filter(regex_model,
investigation_outcome == "no_rats_found"
) %>%
sample_n(5)
) %>%
bind_rows(filter(regex_model,
investigation_outcome == "unknown"
) %>%
sample_n(5)
)
regex_confirm
View(regex_confirm)
rm(regex_confirm)
So now, we can compare four different models, each being slight variations of LDA models and regex.
TopProb_BySrvcRqstId assigns the topic by taking the mean topic probability score across six LDA models.TopProb_BySrvcRqstId_05gram04topic assigns the topic by simply using the probability score from only the 5gram4topic LDA model.TopicAssigned_ByCounts_ByMeanProb assigns the topic by taking the most frequently assigned topic across six LDA models.regex_model assigns the topic by building out a regular expression based on a 5gram4topic LDA model (done separately for each year).First, let’s give the models more intelligible and more similar names.
rm(TopicAssigned_ByCounts_ByMeanProb,
TopProb_BySrvcRqstId,
TopProb_BySrvcRqstId_05gram04topic,
regex_model
)
object 'TopicAssigned_ByCounts_ByMeanProb' not foundobject 'TopProb_BySrvcRqstId' not foundobject 'TopProb_BySrvcRqstId_05gram04topic' not found
message("Prediction_AllModels_Counts")
Prediction_AllModels_Counts
str(Prediction_AllModels_Counts) # prediction uses the count across all models
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 24398 obs. of 15 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ topic_name : chr "unknown" "rats_found" "no_rats_found" "unknown" ...
$ times_topic_assigned : int 3 2 3 4 3 3 3 4 2 2 ...
$ MeanProb : num 0.587 0.918 0.597 0.785 0.598 ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ servicenotes_nonums_nopunc: chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
message("Prediction_AllModels_MeanProb")
Prediction_AllModels_MeanProb
str(Prediction_AllModels_MeanProb) # prediction uses the average probability across all models
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 24398 obs. of 8 variables:
$ servicerequestid: chr "13-00129433" "16-00668555" "16-00740993" "17-00170279" ...
$ topic_name : chr "no_rats_found" "unknown" "unknown" "unknown" ...
$ prob : num 0.376 0.376 0.376 0.376 0.376 ...
$ servicenotes : chr "there is not a locked gate and there are dogs" "On 8/31/16@12:00 pm R Herrington baited 6 rat burrows in the alley & rear yd. Treatment will continue until rodent activity cea"| __truncated__ "On 10/18/16@1:34 pm R Herrington baited 6 rat burrows in the alley and rear yd. Treatment will continue until rodent activity c"| __truncated__ "On 4/11/17@1:12 pm R Herrington baited 6 rat burrows in the alley & rear yd. Treatment will continue until rodent activity cea"| __truncated__ ...
$ serviceorderdate: POSIXct, format: "2013-06-04 09:19:46" "2016-08-29 14:59:00" ...
$ serviceorder_yr : num 2013 2016 2016 2017 2017 ...
$ yr_group : chr "2013_no_rats_found" "2016_unknown" "2016_unknown" "2017_unknown" ...
$ model : chr "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" ...
message("Prediction_05gram04topic_Prob")
Prediction_05gram04topic_Prob
str(Prediction_05gram04topic_Prob) # prediction uses only the 5gram4topic model
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 20816 obs. of 10 variables:
$ servicerequestid: chr "10-00168699" "10-00431559" "14-00185064" "14-00108459" ...
$ model_ngram : int 5 5 5 5 5 5 5 5 5 5 ...
$ model_topic : int 4 4 4 4 4 4 4 4 4 4 ...
$ topic_name : chr "rats_found" "rats_found" "unknown" "rats_found" ...
$ prob : num 0.501 0.501 0.501 0.501 0.502 ...
$ servicenotes : chr "On 6/4/10 Task force baited 5 rat burrows in the rear yd by shed and Ivy along fence line." "ON 2/23/11@ 9:05am T Taylor baited 6 rat burrows under front porch. First Strike/soft bait, EPA#7173-258, .0025%, 6oz, Gloves "| __truncated__ "On 7/1/14@9:20am R Herrington baited 1 rat burrow under front steps. Ditrac/powder/ EPA#12455-56/ 0.2% 3oz, B&G duster/ gloves "| __truncated__ "On 4/23/2014 @ 1:40 pm Mr. Cornes baited one rat burrow in the front. Ditrac /powder, EPA#12455-56, 0.2%, 1oz, B&G duster." ...
$ serviceorderdate: POSIXct, format: "2010-05-17 12:46:44" "2010-12-31 02:20:04" ...
$ serviceorder_yr : num 2010 2010 2014 2014 2011 ...
$ yr_group : chr "2010_rats_found" "2010_rats_found" "2014_unknown" "2014_rats_found" ...
$ model : chr "5gram4topic" "5gram4topic" "5gram4topic" "5gram4topic" ...
message("Prediction_Regex")
Prediction_Regex
str(Prediction_Regex) # prediction uses regex
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 6 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ servicenotes_cleaned : chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st rats " ...
$ rats_found : logi FALSE FALSE FALSE TRUE TRUE FALSE ...
$ no_rats_found : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ investigation_outcome: chr "unknown" "unknown" "unknown" "rats_found" ...
message("Prediction_AllModels_Counts")
Prediction_AllModels_Counts
dim(Prediction_AllModels_Counts) # prediction uses the count across all models
[1] 24398 15
message("Prediction_AllModels_MeanProb")
Prediction_AllModels_MeanProb
dim(Prediction_AllModels_MeanProb) # prediction uses the average probability across all models
[1] 24398 8
message("Prediction_05gram04topic_Prob")
Prediction_05gram04topic_Prob
dim(Prediction_05gram04topic_Prob) # prediction uses only the 5gram4topic model
[1] 20816 10
message("Prediction_Regex")
Prediction_Regex
dim(Prediction_Regex) # prediction uses regex
[1] 26302 6
Now, let’s put everything together with the base data (i.e., the ServiceNotesCleaned2 dataset) to create a “wide” dataset.
a <- Prediction_AllModels_MeanProb %>%
select(servicerequestid,
topic_name,
prob
) %>%
rename(topicname_meanprob = topic_name,
prob_meanprob = prob
)
b <- Prediction_05gram04topic_Prob %>%
select(servicerequestid,
topic_name,
prob
) %>%
rename(topicname_5g4t = topic_name,
prob_5g4t = prob
)
c <- Prediction_AllModels_Counts %>%
select(servicerequestid,
topic_name,
times_topic_assigned,
MeanProb
) %>%
rename(topicname_topcounts = topic_name,
timestopicassigned_topcounts = times_topic_assigned,
prob_topcounts = MeanProb
)
d <- Prediction_Regex %>%
select(servicerequestid,
investigation_outcome
) %>%
rename(topicname_regex = investigation_outcome)
ModelsCompare <- ServiceNotesCleaned2 %>%
left_join(a,
by = "servicerequestid"
) %>%
left_join(b,
by = "servicerequestid"
) %>%
left_join(c,
by = "servicerequestid"
) %>%
left_join(d,
by = "servicerequestid"
) %>%
mutate(matches = case_when(topicname_meanprob == topicname_5g4t &
topicname_meanprob == topicname_topcounts &
topicname_meanprob == topicname_regex ~ "all_match",
is.na(topicname_meanprob) |
is.na(topicname_5g4t)|
is.na(topicname_topcounts)|
is.na(topicname_regex) ~ "one_plus_NA",
TRUE ~ "one_plus_mismatches"
)
)
rm(a, b, c, d)
str(ModelsCompare)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 27 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription : chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
$ servicenotes_nonums_nopunc : chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
$ servicenotes_cleaned : chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st rats " ...
$ topicname_meanprob : chr "unknown" "rats_found" "no_rats_found" "unknown" ...
$ prob_meanprob : num 0.587 0.918 0.597 0.785 0.598 ...
$ topicname_5g4t : chr "unknown" NA "rats_found" "unknown" ...
$ prob_5g4t : num 0.978 NA 0.98 0.985 0.994 ...
$ topicname_topcounts : chr "unknown" "rats_found" "no_rats_found" "unknown" ...
$ timestopicassigned_topcounts: int 3 2 3 4 3 3 3 4 2 2 ...
$ prob_topcounts : num 0.587 0.918 0.597 0.785 0.598 ...
$ topicname_regex : chr "unknown" "unknown" "unknown" "rats_found" ...
$ matches : chr "all_match" "one_plus_NA" "one_plus_mismatches" "one_plus_mismatches" ...
ModelsCompare
View(sample_n(ModelsCompare,
1000
)
)
Here, I take a quick look at how the models compare with each other.
Interestingly, it appears that the model using regex appears (by manual inspection) to be the most accurate.
Matches <- ModelsCompare %>%
select(servicerequestid,
servicenotes,
servicenotes_cleaned,
topicname_meanprob,
topicname_5g4t,
topicname_topcounts,
topicname_regex,
matches
)
Matches_Check <- Matches %>%
filter(matches == "all_match") %>%
sample_n(5) %>%
bind_rows(filter(Matches,
matches == "one_plus_NA"
) %>%
sample_n(5)
) %>%
bind_rows(filter(Matches,
matches == "one_plus_mismatches"
) %>%
sample_n(5)
) %>%
arrange(matches,
topicname_regex
)
Matches_Check
View(Matches_Check)
Now we can use the results from the regex model to do some quick inspections about how often each of the topics were assigned, when they were assigned, any changes over time, etc.
str(ModelsCompare)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 26302 obs. of 27 variables:
$ servicerequestid : chr "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
$ servicepriority : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
$ servicecode : chr "S0311" "S0311" "S0311" "S0311" ...
$ servicecodedescription : chr "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
$ servicetypecode : chr "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
$ servicetypecodedescription : chr "DOH" "DOH" "DOH" "DOH" ...
$ serviceorderdate : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
$ servicenotes : chr "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the alley. there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
$ serviceorder_date : Date, format: "1999-04-27" "1999-04-30" ...
$ serviceorder_yr : num 1999 1999 1999 1999 1999 ...
$ serviceorder_yr_posix : POSIXct, format: "1999-01-01" "1999-01-01" ...
$ serviceorder_mth : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
$ serviceorder_yrmth : chr "1999-04" "1999-04" "1999-05" "1999-05" ...
$ serviceorder_yrmth_posix : POSIXct, format: "1999-04-01" "1999-04-01" ...
$ serviceorder_day : int 27 30 6 14 19 21 26 28 3 8 ...
$ serviceorder_wkday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
$ servicenotes_nonums_nopunc : chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st ne rats " ...
$ servicenotes_cleaned : chr "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st rats " ...
$ topicname_meanprob : chr "unknown" "rats_found" "no_rats_found" "unknown" ...
$ prob_meanprob : num 0.587 0.918 0.597 0.785 0.598 ...
$ topicname_5g4t : chr "unknown" NA "rats_found" "unknown" ...
$ prob_5g4t : num 0.978 NA 0.98 0.985 0.994 ...
$ topicname_topcounts : chr "unknown" "rats_found" "no_rats_found" "unknown" ...
$ timestopicassigned_topcounts: int 3 2 3 4 3 3 3 4 2 2 ...
$ prob_topcounts : num 0.587 0.918 0.597 0.785 0.598 ...
$ topicname_regex : chr "unknown" "unknown" "unknown" "rats_found" ...
$ matches : chr "all_match" "one_plus_NA" "one_plus_mismatches" "one_plus_mismatches" ...
Counts_AllYears <- ModelsCompare %>%
mutate(topic_name = factor(topicname_regex,
levels = c("unknown",
"no_rats_found",
"rats_found"
)
)
) %>%
group_by(topic_name) %>%
count() %>%
rename(counts = n)
Counts_AcrossYears <- ModelsCompare %>%
mutate(topic_name = factor(topicname_regex,
levels = c("unknown",
"no_rats_found",
"rats_found"
)
)
) %>%
group_by(topic_name,
serviceorder_yr
) %>%
count() %>%
rename(counts = n)
ggplot(data = Counts_AllYears,
aes(x = topic_name,
y = counts,
fill = topic_name
)
) +
geom_col() +
geom_text(aes(label = counts),
nudge_y = -200,
size = 3
) +
labs(title = "Regex Model - Counts by Topic",
subtitle = "all years"
) +
# theme_minimal() +
theme(legend.position = "none") +
coord_flip()
ggsave(paste0(wd,
"/Viz/",
"New_",
"Topics_CountModel_Counts_AllYears.png"
),
scale = 4,
width = 6,
height = 6,
units = "cm"
)
ggplot(data = Counts_AcrossYears,
aes(x = topic_name,
y = counts,
fill = topic_name
)
) +
geom_col() +
geom_text(aes(label = counts),
nudge_y = 100,
size = 2.5
) +
labs(title = "Regex Model - Counts by Topic",
subtitle = "by year"
) +
scale_y_continuous(limits = c(0, 2000),
breaks = seq(0, 2000, 400)
) +
facet_wrap(~serviceorder_yr) +
theme(legend.position = "none") +
coord_flip()
ggsave(paste0(wd,
"/Viz/",
"New_",
"Topics_CountModel_Counts_AcrossYears.png"
),
scale = 4,
width = 8,
height = 6,
units = "cm"
)
Remove no-longer-needed files.
rm(list = ls(pattern = "Counts_"))
rm(list = ls(pattern = "Matches"))
rm(PerTopicPer5gram)
# rm(list = ls(pattern = "Prediction_"))