Load the relevant libraries.
# rm(list = ls())
library("tidyverse")          # data manipulation
library("magrittr")           # data manipulation (pipeing data)
library("stringr")            # string manipulation
library("lubridate")          # date manipulation
library("tidytext")           # text manipulation
library("topicmodels")        # topic modeling
library("ggplot2")            # viz
library("doParallel")         # parallel processing
library("ldatuning")          # estimating the proper number of topics
# library("scales")
# library("cowplot")            # grid plotting
Session Info.
sessionInfo()
R version 3.3.3 (2017-03-06)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Sierra 10.12.6

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ldatuning_0.2.0   doParallel_1.0.11 iterators_1.0.8   foreach_1.4.3    
 [5] topicmodels_0.2-7 tidytext_0.1.4    lubridate_1.7.1   stringr_1.2.0    
 [9] magrittr_1.5      dplyr_0.7.4       purrr_0.2.4       readr_1.1.1      
[13] tidyr_0.7.2       tibble_1.3.4      ggplot2_2.2.1     tidyverse_1.1.1  

loaded via a namespace (and not attached):
 [1] modeltools_0.2-21 slam_0.1-40       NLP_0.1-11        reshape2_1.4.2   
 [5] haven_1.1.0       lattice_0.20-35   colorspace_1.3-2  SnowballC_0.5.1  
 [9] stats4_3.3.3      yaml_2.1.14       rlang_0.1.4       foreign_0.8-69   
[13] glue_1.2.0        modelr_0.1.1      readxl_1.0.0      bindrcpp_0.2     
[17] bindr_0.1         plyr_1.8.4        munsell_0.4.3     gtable_0.2.0     
[21] cellranger_1.1.0  rvest_0.3.2       codetools_0.2-15  psych_1.7.8      
[25] knitr_1.17        forcats_0.2.0     tm_0.7-1          broom_0.4.2      
[29] tokenizers_0.1.4  Rcpp_0.12.13      scales_0.5.0      jsonlite_1.5     
[33] mnormt_1.5-5      hms_0.3           stringi_1.1.5     grid_3.3.3       
[37] tools_3.3.3       lazyeval_0.2.1    janeaustenr_0.1.5 pkgconfig_2.0.1  
[41] Matrix_1.2-11     xml2_1.1.1        assertthat_0.2.0  httr_1.3.1       
[45] R6_2.2.2          nlme_3.1-131     
Setup the root directory.
Setting `wd` as the working directory.
wd <- getwd()
wd
[1] "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon"
Get the raw data. Because of trouble maintaining a connection to Dropbox via R, I first downloaded the raw data from [https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0](https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0) and saved the file locally.
Raw311Data <- read_csv(paste0(wd,
                              "/Data_Raw/dc_311-2017-01-16.csv"
                              # "/Data_Raw/dc_311-2017-10-07.csv"
                              ),
                       progress = FALSE
                       )
Parsed with column specification:
cols(
  .default = col_character(),
  SERVICEORDERDATE = col_datetime(format = ""),
  SERVICECALLCOUNT = col_integer(),
  INSPECTIONDATE = col_datetime(format = ""),
  RESOLUTIONDATE = col_datetime(format = ""),
  SERVICEDUEDATE = col_datetime(format = ""),
  ADDDATE = col_datetime(format = ""),
  LASTMODIFIEDDATE = col_datetime(format = ""),
  LATITUDE = col_double(),
  LONGITUDE = col_double(),
  ZIPCODE = col_integer(),
  MARADDRESSREPOSITORYID = col_integer(),
  DCSTATADDRESSKEY = col_integer(),
  DCSTATLOCATIONKEY = col_integer(),
  WARD = col_integer(),
  PSA = col_integer()
)
See spec(...) for full column specifications.
# saving is done to avoid having to download all the data again
saveRDS(Raw311Data,
        paste0(wd,
               "/Data_Processed/",
               "Raw311Data.Rds"
               )
        )
str(Raw311Data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   5093887 obs. of  36 variables:
 $ SERVICEREQUESTID          : chr  "10-00020033" "10-00020034" "10-00020035" "10-00020036" ...
 $ SERVICEPRIORITY           : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ SERVICECODE               : chr  "S0276" "S0276" "S0276" "S0276" ...
 $ SERVICECODEDESCRIPTION    : chr  "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" ...
 $ SERVICETYPECODE           : chr  "TRAOP001" "TRAOP001" "TRAOP001" "TRAOP001" ...
 $ SERVICETYPECODEDESCRIPTION: chr  "TOA" "TOA" "TOA" "TOA" ...
 $ SERVICEORDERDATE          : POSIXct, format: "2010-01-21 14:58:07" "2010-01-21 14:58:08" ...
 $ SERVICEORDERSTATUS        : chr  "OVERDUE CLOSED" "CLOSED" "CLOSED" "CLOSED" ...
 $ SERVICECALLCOUNT          : int  1 1 1 1 1 1 1 1 1 1 ...
 $ AGENCYABBREVIATION        : chr  "DDOT" "DDOT" "DDOT" "DDOT" ...
 $ INSPECTIONFLAG            : chr  "N" "N" "N" "N" ...
 $ INSPECTIONDATE            : POSIXct, format: NA NA ...
 $ RESOLUTION                : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ RESOLUTIONDATE            : POSIXct, format: "2010-01-26 19:31:32" "2010-01-21 15:43:45" ...
 $ SERVICEDUEDATE            : POSIXct, format: "2010-01-26 14:58:07" "2010-01-26 14:58:08" ...
 $ SERVICENOTES              : chr  NA NA NA NA ...
 $ PARENTSERVICEREQUESTID    : chr  NA NA NA NA ...
 $ ADDDATE                   : POSIXct, format: "2010-01-21 14:58:07" "2010-01-21 14:58:08" ...
 $ LASTMODIFIEDDATE          : POSIXct, format: "2011-09-01 11:18:17" "2012-08-06 21:19:19" ...
 $ SITEADDRESS               : chr  "17TH STREET NW & RHODE ISLAND AVENUE NW" "20TH STREET NW & K STREET NW" "800 I STREET NW" "901 15TH STREET NW" ...
 $ LATITUDE                  : num  38.9 38.9 38.9 38.9 38.9 ...
 $ LONGITUDE                 : num  -77 -77 -77 -77 -77 ...
 $ ZIPCODE                   : int  NA NA 20001 20005 NA NA NA 20002 20018 NA ...
 $ MARADDRESSREPOSITORYID    : int  0 0 285559 218779 0 -100 -100 76069 50068 -100 ...
 $ DCSTATADDRESSKEY          : int  207502 207430 124416 62196 455845 -100 -100 49582 20159 -100 ...
 $ DCSTATLOCATIONKEY         : int  207502 207430 124416 62196 391873 348291 348291 49582 20159 348291 ...
 $ WARD                      : int  2 2 2 2 2 NA NA 6 5 NA ...
 $ ANC                       : chr  "2B" "2A" "2C" "2F" ...
 $ SMD                       : chr  "2B05" "2A06" "2C03" "2F03" ...
 $ DISTRICT                  : chr  "SECOND" "SECOND" "FIRST" "SECOND" ...
 $ PSA                       : int  208 207 102 207 208 NA NA 108 503 NA ...
 $ NEIGHBORHOODCLUSTER       : chr  "6" "6" "8" "8" ...
 $ HOTSPOT2006NAME           : chr  "NONE" "NONE" "NONE" "NONE" ...
 $ HOTSPOT2005NAME           : chr  "NONE" "NONE" "NONE" "NONE" ...
 $ HOTSPOT2004NAME           : chr  "NONE" "NONE" "NONE" "NONE" ...
 $ SERVICESOURCECODE         : chr  "PHONE" "PHONE" "PHONE" "PHONE" ...
 - attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    3874 obs. of  5 variables:
  ..$ row     : int  3938387 3938538 3939027 3939187 3939240 3939316 3939514 3939869 3940110 3940469 ...
  ..$ col     : chr  "WARD" "WARD" "WARD" "WARD" ...
  ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
  ..$ actual  : chr  "Null" "Null" "Null" "Null" ...
  ..$ file    : chr  "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 36
  .. ..$ SERVICEREQUESTID          : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICEPRIORITY           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECODE               : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECODEDESCRIPTION    : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICETYPECODE           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICETYPECODEDESCRIPTION: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICEORDERDATE          :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICEORDERSTATUS        : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECALLCOUNT          : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ AGENCYABBREVIATION        : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ INSPECTIONFLAG            : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ INSPECTIONDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ RESOLUTION                : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ RESOLUTIONDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICEDUEDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICENOTES              : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ PARENTSERVICEREQUESTID    : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ ADDDATE                   :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ LASTMODIFIEDDATE          :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SITEADDRESS               : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ LATITUDE                  : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ LONGITUDE                 : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ ZIPCODE                   : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ MARADDRESSREPOSITORYID    : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ DCSTATADDRESSKEY          : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ DCSTATLOCATIONKEY         : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ WARD                      : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ ANC                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SMD                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ DISTRICT                  : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ PSA                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ NEIGHBORHOODCLUSTER       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2006NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2005NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2004NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICESOURCECODE         : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
tail(Raw311Data, 500)
Un-comment the chunk below to load the saved raw data (to avoid having to download the raw data again).
# Raw311Data <- readRDS(paste0(wd,
#                              "/Data_Processed/",
#                              "Raw311Data.Rds"
#                              )
#                       )
# 
# str(Raw311Data)
# tail(Raw311Data, 500)
# View(tail(Raw311Data, 1000))
Selecting those variables that may be useful to test breakdowns of topic modeling.  For example, running a topic model separately for the different levels of `servicecode`.
SelectedVars <- select(Raw311Data,
                       SERVICEREQUESTID,
                       SERVICEPRIORITY,
                       SERVICECODE,
                       SERVICECODEDESCRIPTION,
                       SERVICETYPECODE,
                       SERVICETYPECODEDESCRIPTION,
                       SERVICEORDERDATE,
                       SERVICENOTES
                       )
names(SelectedVars) <- tolower(names(SelectedVars))
rm(Raw311Data)
str(SelectedVars)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   5093887 obs. of  8 variables:
 $ servicerequestid          : chr  "10-00020033" "10-00020034" "10-00020035" "10-00020036" ...
 $ servicepriority           : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ servicecode               : chr  "S0276" "S0276" "S0276" "S0276" ...
 $ servicecodedescription    : chr  "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" "Parking Meter Repair" ...
 $ servicetypecode           : chr  "TRAOP001" "TRAOP001" "TRAOP001" "TRAOP001" ...
 $ servicetypecodedescription: chr  "TOA" "TOA" "TOA" "TOA" ...
 $ serviceorderdate          : POSIXct, format: "2010-01-21 14:58:07" "2010-01-21 14:58:08" ...
 $ servicenotes              : chr  NA NA NA NA ...
 - attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    3874 obs. of  5 variables:
  ..$ row     : int  3938387 3938538 3939027 3939187 3939240 3939316 3939514 3939869 3940110 3940469 ...
  ..$ col     : chr  "WARD" "WARD" "WARD" "WARD" ...
  ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
  ..$ actual  : chr  "Null" "Null" "Null" "Null" ...
  ..$ file    : chr  "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 36
  .. ..$ SERVICEREQUESTID          : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICEPRIORITY           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECODE               : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECODEDESCRIPTION    : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICETYPECODE           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICETYPECODEDESCRIPTION: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICEORDERDATE          :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICEORDERSTATUS        : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECALLCOUNT          : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ AGENCYABBREVIATION        : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ INSPECTIONFLAG            : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ INSPECTIONDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ RESOLUTION                : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ RESOLUTIONDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICEDUEDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICENOTES              : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ PARENTSERVICEREQUESTID    : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ ADDDATE                   :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ LASTMODIFIEDDATE          :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SITEADDRESS               : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ LATITUDE                  : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ LONGITUDE                 : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ ZIPCODE                   : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ MARADDRESSREPOSITORYID    : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ DCSTATADDRESSKEY          : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ DCSTATLOCATIONKEY         : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ WARD                      : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ ANC                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SMD                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ DISTRICT                  : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ PSA                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ NEIGHBORHOODCLUSTER       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2006NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2005NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2004NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICESOURCECODE         : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
Quick visual inspection of filtering the data to only service calls with notes (i.e., removing NA values), and only those that are rat-related (`servicecode == "S0311`).  
  
Removing NA values takes us from 5,093,887 rows to 3,018,411 rows.  

Looking at only rat-related service calls takes us from 3,018,411 rows to 15,581 rows.
NoNAServiceNotes <- filter(SelectedVars,
                           !is.na(servicenotes)
                           )
# message("SelectedVars")
nrow(SelectedVars)
[1] 5093887
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
[1] 3018411
rm(SelectedVars)
str(NoNAServiceNotes)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   3018411 obs. of  8 variables:
 $ servicerequestid          : chr  "10-00020037" "10-00020041" "10-00020044" "10-00020050" ...
 $ servicepriority           : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ servicecode               : chr  "DMV19" "S0003" "S0276" "S0276" ...
 $ servicecodedescription    : chr  "DMV - Vehicle Registration Issues" "Abandoned Vehicle - On Private Property" "Parking Meter Repair" "Parking Meter Repair" ...
 $ servicetypecode           : chr  "DRIVEHSE" "PRSVAVOP" "TRAOP001" "TRAOP001" ...
 $ servicetypecodedescription: chr  "Driver Vehicle Services" "Abandoned Vehicle Operations - Private" "TOA" "TOA" ...
 $ serviceorderdate          : POSIXct, format: "2010-01-21 14:58:22" "2010-01-21 14:58:56" ...
 $ servicenotes              : chr  "customer was called informed a trace will be placed on the request" "VEHICLE IN QUESTION WAS GONE ON ARRIVAL  PER INVESTIGATOR AUSTIN......TJ" "Comments update:  10-00020044-DEAD" "Comments update:  10-00020050-METER DISPLAYED OUT OF ORDER." ...
 - attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    3874 obs. of  5 variables:
  ..$ row     : int  3938387 3938538 3939027 3939187 3939240 3939316 3939514 3939869 3940110 3940469 ...
  ..$ col     : chr  "WARD" "WARD" "WARD" "WARD" ...
  ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
  ..$ actual  : chr  "Null" "Null" "Null" "Null" ...
  ..$ file    : chr  "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" "'/Users/mdturse/Desktop/Analytics/dc_doh_hackathon/Data_Raw/dc_311-2017-01-16.csv'" ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 36
  .. ..$ SERVICEREQUESTID          : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICEPRIORITY           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECODE               : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECODEDESCRIPTION    : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICETYPECODE           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICETYPECODEDESCRIPTION: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICEORDERDATE          :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICEORDERSTATUS        : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICECALLCOUNT          : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ AGENCYABBREVIATION        : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ INSPECTIONFLAG            : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ INSPECTIONDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ RESOLUTION                : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ RESOLUTIONDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICEDUEDATE            :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SERVICENOTES              : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ PARENTSERVICEREQUESTID    : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ ADDDATE                   :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ LASTMODIFIEDDATE          :List of 1
  .. .. ..$ format: chr ""
  .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
  .. ..$ SITEADDRESS               : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ LATITUDE                  : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ LONGITUDE                 : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ ZIPCODE                   : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ MARADDRESSREPOSITORYID    : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ DCSTATADDRESSKEY          : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ DCSTATLOCATIONKEY         : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ WARD                      : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ ANC                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SMD                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ DISTRICT                  : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ PSA                       : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ NEIGHBORHOODCLUSTER       : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2006NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2005NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ HOTSPOT2004NAME           : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICESOURCECODE         : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
View(head(NoNAServiceNotes, 1000))
RatCalls <- filter(NoNAServiceNotes,
                   servicecode == "S0311"
                   )
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
[1] 3018411
# message("RatCalls")
nrow(RatCalls)
[1] 15581
rm(NoNAServiceNotes)
View(RatCalls)
Add in time-related variables.
RatCalls_Time <- RatCalls %>%
  mutate(serviceorder_date = as_date(serviceorderdate),
         serviceorder_yr = year(serviceorderdate),
         serviceorder_yr_posix = floor_date(serviceorderdate, "year"),
         serviceorder_mth = month(serviceorderdate, label = TRUE),
         serviceorder_yrmth = as.character(serviceorder_date) %>% 
           substr(1, 7),
         serviceorder_yrmth_posix = floor_date(serviceorderdate, "month"),
         serviceorder_day = day(serviceorderdate),
         serviceorder_wkday = wday(serviceorderdate, label = TRUE)
         )
rm(RatCalls)
str(RatCalls_Time)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   15581 obs. of  16 variables:
 $ servicerequestid          : chr  "10-00020177" "10-00020346" "10-00020514" "10-00020754" ...
 $ servicepriority           : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ servicecode               : chr  "S0311" "S0311" "S0311" "S0311" ...
 $ servicecodedescription    : chr  "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
 $ servicetypecode           : chr  "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
 $ servicetypecodedescription: chr  "DOH" "DOH" "DOH" "DOH" ...
 $ serviceorderdate          : POSIXct, format: "2010-01-21 15:46:23" "2010-01-21 16:32:51" ...
 $ servicenotes              : chr  "On 1/25/10 L Rogers baited 3 rat burrows in the rear yd." "On 1/26/10 G Cornes found no rat burrows in the area." "On 1/28/10 L Rogers baited 4 rat burrows in the rear yd." "On 1/26/10 V Purvis baited 9 rat burrows in the park." ...
 $ serviceorder_date         : Date, format: "2010-01-21" "2010-01-21" ...
 $ serviceorder_yr           : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
 $ serviceorder_yr_posix     : POSIXct, format: "2010-01-01" "2010-01-01" ...
 $ serviceorder_mth          : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
 $ serviceorder_yrmth        : chr  "2010-01" "2010-01" "2010-01" "2010-01" ...
 $ serviceorder_yrmth_posix  : POSIXct, format: "2010-01-01" "2010-01-01" ...
 $ serviceorder_day          : int  21 21 21 21 21 22 22 22 22 22 ...
 $ serviceorder_wkday        : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 5 5 5 5 5 6 6 6 6 6 ...
tail(RatCalls_Time, 500)
View(tail(RatCalls_Time, 1000))
Next we need to clean up the text of the `servicenotes` variable - this will be done in multiple steps.  
  
As the first step, we'll remove common "stopwords" (e.g., is, the, and, etc.) as they won't be very useful in finding topics in the `servicenotes` text. Although it's a stopwords, we specifically do not remove the word "no" as it is often used to distinguish between "rats found" and "no rats found".
NoStopWords_Unnest <- 
  RatCalls_Time %>% 
  select(servicerequestid,
         servicenotes
         ) %>% 
  unnest_tokens(word,
                servicenotes
                ) %>% 
  anti_join(filter(stop_words,
                   word != "no" # we don't remove the word "no" as it is often used to distinguish between "rats found" and "no rats found"
                   ),
            by = "word"
            )
Servicenotes_NoStopWrds <- NoStopWords_Unnest %>% 
  nest(word) %>% 
  mutate(servicenotes_nostop = map(data,
                                   unlist
                                   ),
         servicenotes_nostop = map_chr(servicenotes_nostop,
                                       paste,
                                       collapse = " "
                                       )
         ) %>% 
  select(-data)
Remove_StopWrds <- RatCalls_Time %>% 
  left_join(Servicenotes_NoStopWrds,
            by = "servicerequestid"
            )
dim(RatCalls_Time)
[1] 15581    16
dim(Remove_StopWrds)
[1] 15581    17
rm(NoStopWords_Unnest, Servicenotes_NoStopWrds)
head(Remove_StopWrds, 100)
View(head(Remove_StopWrds, 100))
# str(Remove_StopWrds)
# View(Remove_StopWrds %>% 
#        filter(str_detect(servicenotes_nostop,
#                          "washington dc"
#                          )
#               ) %>% 
#        select(servicenotes,
#               servicenotes_nostop
#               )
#      )
Then, we'll remove any numeric characters from 'servicenotes' to avoid distinctions not needed at this level (e.g., "baited 3 rat borrows" vs. "baited 6 rat burrows"). We'll also remove punctuation.
ServiceNotesCleaned <- Remove_StopWrds %>% 
  mutate(servicenotes_nonums_nopunc = str_replace_all(servicenotes_nostop,
                                                      "[[:digit:]]",
                                                      ""
                                                      ) %>% 
           str_replace_all("[[:punct:]]",
                           ""
                           )
         ) %>% 
  select(-servicenotes_nostop)
dim(RatCalls_Time)
[1] 15581    16
dim(Remove_StopWrds)
[1] 15581    17
dim(ServiceNotesCleaned)
[1] 15581    17
# View(select(ServiceNotesCleaned,
#             servicerequestid,
#             servicenotes,
#             servicenotes_nonums_nopunc
#             ) %>% 
#        filter(servicerequestid %in% nomatch$servicerequestid)
#      )
rm(RatCalls_Time, Remove_StopWrds)
head(ServiceNotesCleaned, 100)
View(head(ServiceNotesCleaned, 100))
Now, we can inspect the frequencies of rat-related service requests.
summary(ServiceNotesCleaned)
 servicerequestid   servicepriority    servicecode        servicecodedescription
 Length:15581       Length:15581       Length:15581       Length:15581          
 Class :character   Class :character   Class :character   Class :character      
 Mode  :character   Mode  :character   Mode  :character   Mode  :character      
                                                                                
                                                                                
                                                                                
                                                                                
 servicetypecode    servicetypecodedescription serviceorderdate             
 Length:15581       Length:15581               Min.   :1999-04-27 12:59:00  
 Class :character   Class :character           1st Qu.:2009-07-22 12:00:00  
 Mode  :character   Mode  :character           Median :2011-04-07 15:24:33  
                                               Mean   :2010-01-30 02:23:50  
                                               3rd Qu.:2012-10-05 12:31:54  
                                               Max.   :2016-12-14 10:45:49  
                                                                            
 servicenotes       serviceorder_date    serviceorder_yr serviceorder_yr_posix        
 Length:15581       Min.   :1999-04-27   Min.   :1999    Min.   :1999-01-01 00:00:00  
 Class :character   1st Qu.:2009-07-22   1st Qu.:2009    1st Qu.:2009-01-01 00:00:00  
 Mode  :character   Median :2011-04-07   Median :2011    Median :2011-01-01 00:00:00  
                    Mean   :2010-01-29   Mean   :2010    Mean   :2009-07-25 06:25:01  
                    3rd Qu.:2012-10-05   3rd Qu.:2012    3rd Qu.:2012-01-01 00:00:00  
                    Max.   :2016-12-14   Max.   :2016    Max.   :2016-01-01 00:00:00  
                                                                                      
 serviceorder_mth serviceorder_yrmth serviceorder_yrmth_posix      serviceorder_day
 Aug    :1923     Length:15581       Min.   :1999-04-01 00:00:00   Min.   : 1.00   
 Jun    :1697     Class :character   1st Qu.:2009-07-01 00:00:00   1st Qu.: 8.00   
 Sep    :1666     Mode  :character   Median :2011-04-01 00:00:00   Median :16.00   
 May    :1549                        Mean   :2010-01-14 17:37:56   Mean   :15.71   
 Oct    :1523                        3rd Qu.:2012-10-01 00:00:00   3rd Qu.:23.00   
 Jul    :1507                        Max.   :2016-12-01 00:00:00   Max.   :31.00   
 (Other):5716                                                                      
 serviceorder_wkday servicenotes_nonums_nopunc
 Sun: 489           Length:15581              
 Mon:3140           Class :character          
 Tue:3132           Mode  :character          
 Wed:3025                                     
 Thu:2654                                     
 Fri:2450                                     
 Sat: 691                                     
# summary(RatCalls_Time$serviceorderdate)
# library("psych")
# describe(RatCalls_Time$serviceorderdate)
ggplot_theme_basic <-
  theme(panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        # axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_line(size = 1, colour = "black")
        )
# ggplot(data = RatCalls_Time,
#        aes(x = serviceorder_date)
#        ) +
#   geom_histogram() +
#   ggplot_theme_basic
yr_counts <- ServiceNotesCleaned %>% 
  group_by(serviceorder_yr_posix) %>% 
  summarise(Cnt = n()
            ) %>% 
  arrange(serviceorder_yr_posix)
ggplot(data = yr_counts,
       aes(x = serviceorder_yr_posix,
           y = Cnt
           )
       ) +
  geom_col(fill = "light blue") + #, alpha = 0.1) +
  geom_text(aes(label = Cnt),
            nudge_y = 50,
            size = 3
            ) +
  labs(title = "Counts of non-NA ServiceNotes",
       # subtitle = "by year",
       x = "Year",
       y = "Count"
       ) +
  ggplot_theme_basic +
  theme(axis.text.x = element_text(angle = 90)
        ) +
  scale_x_datetime(date_breaks = "1 year")

yrmth_counts <- ServiceNotesCleaned %>% 
  group_by(serviceorder_yrmth_posix) %>% 
  summarise(Cnt = n()
            ) %>% 
  arrange(serviceorder_yrmth_posix)
ggplot(data = yrmth_counts,
       aes(x = serviceorder_yrmth_posix,
           y = Cnt
           )
       ) +
  geom_col(fill = "light blue") + #, alpha = 0.1) +
  # geom_text(aes(label = Cnt),
  #           nudge_y = 50,
  #           size = 3
  #           ) +
  labs(title = "Counts of non-NA ServiceNotes",
       x = "Year-Month",
       y = "Count"
       ) +
  ggplot_theme_basic +
  theme(axis.text.x = element_text(angle = 90)
        ) +
  coord_cartesian(xlim = c(as.POSIXct("1998-12-01"),
                           as.POSIXct("2017-12-01")
                           ),
                  expand = TRUE
                  ) +
  scale_x_datetime(date_breaks = "6 months")

Based on the frequencies of when we actually have 'servicenotes' data, let's try limiting the dataset to service calls from 2010 or later. This reduces the dataset further, from 15,581 rows to 11,037 rows. 
rm(yr_counts, yrmth_counts)
ServiceNotesCleanedAfter2010 <- ServiceNotesCleaned %>%
  filter(serviceorderdate >= as_date("2010-01-01")
         )
nrow(ServiceNotesCleaned)
[1] 15581
nrow(ServiceNotesCleanedAfter2010)
[1] 11037
summary(ServiceNotesCleanedAfter2010)
 servicerequestid   servicepriority    servicecode        servicecodedescription
 Length:11037       Length:11037       Length:11037       Length:11037          
 Class :character   Class :character   Class :character   Class :character      
 Mode  :character   Mode  :character   Mode  :character   Mode  :character      
                                                                                
                                                                                
                                                                                
                                                                                
 servicetypecode    servicetypecodedescription serviceorderdate             
 Length:11037       Length:11037               Min.   :2010-01-02 12:51:20  
 Class :character   Class :character           1st Qu.:2010-12-08 16:46:26  
 Mode  :character   Mode  :character           Median :2012-02-06 19:55:15  
                                               Mean   :2012-05-08 12:20:41  
                                               3rd Qu.:2013-05-07 18:58:17  
                                               Max.   :2016-12-14 10:45:49  
                                                                            
 servicenotes       serviceorder_date    serviceorder_yr serviceorder_yr_posix        
 Length:11037       Min.   :2010-01-02   Min.   :2010    Min.   :2010-01-01 00:00:00  
 Class :character   1st Qu.:2010-12-08   1st Qu.:2010    1st Qu.:2010-01-01 00:00:00  
 Mode  :character   Median :2012-02-06   Median :2012    Median :2012-01-01 00:00:00  
                    Mean   :2012-05-07   Mean   :2012    Mean   :2011-11-01 14:15:37  
                    3rd Qu.:2013-05-07   3rd Qu.:2013    3rd Qu.:2013-01-01 00:00:00  
                    Max.   :2016-12-14   Max.   :2016    Max.   :2016-01-01 00:00:00  
                                                                                      
 serviceorder_mth serviceorder_yrmth serviceorder_yrmth_posix      serviceorder_day
 Aug    :1429     Length:11037       Min.   :2010-01-01 00:00:00   Min.   : 1.00   
 Jun    :1268     Class :character   1st Qu.:2010-12-01 00:00:00   1st Qu.: 8.00   
 Sep    :1260     Mode  :character   Median :2012-02-01 00:00:00   Median :16.00   
 Jul    :1075                        Mean   :2012-04-23 02:01:28   Mean   :15.78   
 Oct    :1056                        3rd Qu.:2013-05-01 00:00:00   3rd Qu.:23.00   
 May    :1030                        Max.   :2016-12-01 00:00:00   Max.   :31.00   
 (Other):3919                                                                      
 serviceorder_wkday servicenotes_nonums_nopunc
 Sun: 427           Length:11037              
 Mon:2211           Class :character          
 Tue:2166           Mode  :character          
 Wed:2085                                     
 Thu:1846                                     
 Fri:1736                                     
 Sat: 566                                     
Then, transform the `servicenotes` field into one row per n-gram. Because we don't know what level of 'n' to use, we'll cycle through the possibilities from n = 1 to n = 5.
ngram_list <- 1:5
Rat_Ngram_list <- list()
Rat_Ngram_list <- lapply(ngram_list,
                         function(i) {
                           # x <- paste0("0", i, "_gram")
                           ServiceNotesCleaned %>% 
                             unnest_tokens(n_gram,
                                           servicenotes_nonums_nopunc,
                                           token = "ngrams",
                                           n = i
                                           )
                           }
                         )
# rm(ngram_list)
Rat_Ngram_list
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]
# str(Rat_Ngram_list[[1]])
Counting the n-grams in each `servicerequestid`.
word_counts_list <- list()
word_counts_list <- lapply(ngram_list,
                           function(i) {
                             Rat_Ngram_list[[i]] %>% 
                               count(servicerequestid,
                                     n_gram,
                                     sort = TRUE
                                     )
                             }
                           )
word_counts_list
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]
NA
Transforming the dataframe into a document term matrix - i.e., documents (`servicerequestid`s) are the rows and n-grams are the columns.
dtm_list <- list()
dtm_list <- lapply(ngram_list,
                   function(i) {
                     word_counts_list[[i]] %>% 
                       cast_dtm(document = servicerequestid,
                                term = n_gram,
                                value = n,
                                # weighting = tm::weightTfIdf,
                                # using term frequency inverse document frequency (TfIdf) weighting is another, possibly more accurate measure, but topicmodels::LDA (used below) only accepts document term matrices with term-frequency weighting
                                weighting = tm::weightTf
                                )
                     }
                   )
dtm_list
[[1]]
<<DocumentTermMatrix (documents: 15580, terms: 3177)>>
Non-/sparse entries: 116407/49381253
Sparsity           : 100%
Maximal term length: 20
Weighting          : term frequency (tf)

[[2]]
<<DocumentTermMatrix (documents: 15268, terms: 12341)>>
Non-/sparse entries: 102353/188320035
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)

[[3]]
<<DocumentTermMatrix (documents: 14543, terms: 18256)>>
Non-/sparse entries: 87424/265409584
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)

[[4]]
<<DocumentTermMatrix (documents: 13157, terms: 20491)>>
Non-/sparse entries: 73020/269527067
Sparsity           : 100%
Maximal term length: 47
Weighting          : term frequency (tf)

[[5]]
<<DocumentTermMatrix (documents: 11846, terms: 21176)>>
Non-/sparse entries: 59946/250790950
Sparsity           : 100%
Maximal term length: 58
Weighting          : term frequency (tf)
To determine the "proper" number of topics, here I try using the `ldatuning::FindTopicsNumber` function. The code chunk is based on the vignette here:  [https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html](https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html).  

This analyses was done separately for each n-gram level (n = 1:5), and the overall results were inconclusive - the "proper" number of topics fluctuated between the highest level tried (12 topics) and one of the lowest levels tried (2, 3, or 4 topics).
detectCores(logical = TRUE) - 1
[1] 3
tunes_list <- dtm_list %>% 
  map(~ FindTopicsNumber(.x,
                         topics = c(2:12),
                         metrics = c("Griffiths2004",
                                     "CaoJuan2009",
                                     "Arun2010",
                                     "Deveaud2014"
                                     ),
                         method = "Gibbs",
                         control = list(seed = 123456789),
                         mc.cores = 3L,
                         verbose = TRUE
                         )
      )
fit models... done.
calculate metrics:
  Griffiths2004... done.
  CaoJuan2009... done.
  Arun2010... done.
  Deveaud2014... done.
fit models... done.
calculate metrics:
  Griffiths2004... done.
  CaoJuan2009... done.
  Arun2010... done.
  Deveaud2014... done.
fit models... done.
calculate metrics:
  Griffiths2004... done.
  CaoJuan2009... done.
  Arun2010... done.
  Deveaud2014... done.
fit models... done.
calculate metrics:
  Griffiths2004... done.
  CaoJuan2009... done.
  Arun2010... done.
  Deveaud2014... done.
fit models... done.
calculate metrics:
  Griffiths2004... done.
  CaoJuan2009... done.
  Arun2010... done.
  Deveaud2014... done.
# str(tunes_list[[5]])
topic_plots <-
  tunes_list %>% 
  map(~ FindTopicsNumber_plot(.x)
      )

topic_plots
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL
As an alternative, here I try to determine the "proper" number of topics using `topicmodels::perplexity`. Perplexity measures how well a probability model predicts a sample, and I use it here via 10-folder cross validation. For computational purposes, I'm only trying this for the 5-gram model.  

This is based on the method used here:
[http://ellisp.github.io/blog/2017/01/05/topic-model-cv](http://ellisp.github.io/blog/2017/01/05/topic-model-cv)

As with the `ldatuning::FindTopicsNumber` function used previously, `topicmodels::perplexity` is also inconclusive as there is no clear "elbow" in the perplexity plot.
full_data <- dtm_list[[5]]
n <- nrow(full_data)
seed <- 123456789
topic_guess <- 12
folds <- 10
burnin <- 1000
iter <-1000
keep <-50
#----------------10-fold cross-validation, different numbers of topics----------------
cluster <- makeCluster(detectCores(logical = TRUE) - 1
                       ) # leave one CPU spare...
registerDoParallel(cluster)
clusterEvalQ(cluster, {
   library(topicmodels)
})
[[1]]
[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
[7] "methods"     "base"       

[[2]]
[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
[7] "methods"     "base"       

[[3]]
[1] "topicmodels" "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
[7] "methods"     "base"       
folds <- folds
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2:topic_guess) # candidates for how many topics
clusterExport(cluster,
              c("full_data", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k")
              )
# we parallelize by the different number of topics.  A processor is allocated a value
# of k, and does the cross-validation serially.  This is because it is assumed there
# are more candidate values of k than there are cross-validation folds, hence it
# will be more efficient to parallelise
system.time({
results <- foreach(j = 1:length(candidate_k),
                   .combine = rbind
                   ) %dopar%{
   k <- candidate_k[j]
   
   results_1k <- matrix(0,
                        nrow = folds,
                        ncol = 2
                        )
   
   colnames(results_1k) <- c("k", "perplexity")
   
   for(i in 1:folds){
      train_set <- full_data[splitfolds != i , ]
      valid_set <- full_data[splitfolds == i, ]
      
      fitted <- LDA(train_set,
                    k = k,
                    method = "Gibbs",
                    control = list(seed = seed,
                                   verbose = 1,
                                   burnin = burnin,
                                   iter = iter,
                                   keep = keep
                                   )
                    )
      
      results_1k[i, ] <- c(k, perplexity(fitted, newdata = valid_set)
                           )
   }
   
   return(results_1k)
}
})
    user   system  elapsed 
   1.684    2.252 1080.057 
stopCluster(cluster)
results_df <- as.data.frame(results)
# ggplot(data = results_df,
#        aes(x = k,
#            y = perplexity)
#        ) +
#   geom_point() +
#   geom_smooth(se = FALSE) +
#   coord_cartesian(xlim = c(0, 12)
#                   ) +
#   scale_x_continuous(breaks = seq(0, 12, 2)
#                      ) +
#   ggplot_theme_basic +
#   ggtitle(label = "10-fold cross-validation of topic modelling",
#           subtitle = "(i.e., 10 different models fit for each potential number of topics)"
#           ) +
#   labs(x = "Potential Number of Topics",
#        y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
#        )
ggplot(data = results_df,
       aes(x = k,
           y = perplexity)
       ) +
  geom_point() +
  geom_smooth(se = TRUE) +
  coord_cartesian(xlim = c(0, 12),
                  ylim = c(0, 6000)
                  ) +
  scale_x_continuous(breaks = seq(0, 12, 2)
                     ) +
  # ggplot_theme_basic +
  ggtitle(label = "10-fold cross-validation of topic modelling",
          subtitle = "(i.e., 10 different models fit for each potential number of topics)"
          ) +
  labs(x = "Potential Number of Topics",
       y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
       )

Remove the no-longer-needed files.
rm(cluster, folds, full_data, results, topic_plots, tunes_list, burnin, candidate_k, folds, iter, k, keep, n, seed, splitfolds, topic_guess)
object 'folds' not foundobject 'k' not found
Here I use Latent Dirichlet allocation for topic modeling. As determining the "proper" number of topics was inconclusive, I'm cycling through every combination of ngrams (1:5) and topics (2:12).

topic_guess <- 2:12

lda_list <- list()


cluster <- makeCluster(detectCores(logical = TRUE) - 1
                       ) # leave one CPU spare...
registerDoParallel(cluster)

for(i in ngram_list) {
  for(j in topic_guess) {
    x <- LDA(dtm_list[[i]],
             k = j,
             control = list(seed = 123456789,
                            verbose = 1
                            )
             )
    
    ifelse((i == min(ngram_list) &
              j == min(topic_guess)
            ),
           countx <- 1,
           countx <- countx + 1
           )
    
    lda_list[[countx]] <- list(ngram = i,
                               topic = j,
                               lda_model = x
                               )
    }
  }

stopCluster(cluster)

rm(ngram_list, topic_guess, i, j, x, countx, cluster)

lda_list
Creating a dataframe with `beta` - the per-topic-per-ngram probability (i.e., the probability that each ngram is in each topic).
PerTopicPerNgram <- list()
for(i in 1:length(lda_list)
    ) {
  x <- tidy(lda_list[[i]]$lda_model,
            matrix = "beta"
            ) %>% 
    arrange(term,
            desc(beta)
            )
  
  PerTopicPerNgram[[i]] <- list(ngram = lda_list[[i]]$ngram,
                                topic = lda_list[[i]]$topic,
                                PerTopicPerNgram = x
                                )
  }
rm(i, x)
str(PerTopicPerNgram[[55]]$PerTopicPerNgram)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   254112 obs. of  3 variables:
 $ topic: int  5 6 8 2 9 3 1 4 11 7 ...
 $ term : chr  "a covnes parker bated rat" "a covnes parker bated rat" "a covnes parker bated rat" "a covnes parker bated rat" ...
 $ beta : num  2.33e-04 3.29e-13 9.58e-153 8.26e-153 5.04e-153 ...
# rm(serv_req_id_lda)
head(PerTopicPerNgram[[55]]$PerTopicPerNgram, 500)
Creating a dataframe with just the top ten terms (ranked by beta) in each topic.
top_terms <- list()
for(i in 1:length(PerTopicPerNgram)
    ) {
  x <- PerTopicPerNgram[[i]]$PerTopicPerNgram %>% 
    group_by(topic) %>% 
    top_n(10,
          beta
          ) %>% 
    ungroup() %>% 
    arrange(topic,
            -beta
            )
  
  top_terms[[i]] <- list(ngram = PerTopicPerNgram[[i]]$ngram,
                         topic = PerTopicPerNgram[[i]]$topic,
                         top_terms = x
                         )
  }
rm(i, x)
top_terms[[55]]$top_terms
View(top_terms[[55]]$top_terms)
Now we can plot the top 10 n-grams in each topic to visually inspect if the topic classifications "make sense" based on the n-gram text.  

Here, we're just creating and saving the plots themselves.
TopNgrams_ByTopic_BarGraphs <-
  top_terms %>%
  # to_graph %>%
  map(function(x) 
    x$top_terms %>%
      mutate(term = reorder(term,
                            beta
                            ),
             topic = paste0("Topic ",
                            str_pad(as.character(topic),
                                    width = 2,
                                    side = "left",
                                    pad = "0"
                                    )
                            )
             ) %>% 
      ggplot(aes(x = term,
                 y = beta,
                 fill = factor(topic)
                 )
             ) +
      geom_col(show.legend = FALSE) +
      facet_wrap(~ topic,
                 scales = "free",
                 ncol = 2
                 ) +
      ggplot_theme_basic +
      theme(plot.title = element_text(size = 11),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 9)
            ) +
      labs(title = "Most Common Terms Per Topic",
           subtitle = paste0("(",
                             str_pad(x$ngram,
                                     width = 2,
                                     side = "left",
                                     pad = "0"
                                     ),
                             "gram",
                             str_pad(x$topic,
                                     width = 2,
                                     side = "left",
                                     pad = "0"
                             ),
                             "topic)"
                             ),
           x = paste0(str_pad(x$ngram,
                              width = 2,
                              side = "left",
                              pad = "0"
                              ),
                      "gram"
                      ),
           y = paste0("probability of the ",
                      str_pad(x$ngram,
                              width = 2,
                              side = "left",
                              pad = "0"
                              ),
                      "gram in the topic"
                      )
           ) +
      coord_flip()
    )
# TopNgrams_ByTopic_BarGraphs
TopNgrams_ByTopic_BarGraphs[[24]] # ngram = 3 & topics = 3

TopNgrams_ByTopic_BarGraphs[[25]] # ngram = 3 & topics = 4

TopNgrams_ByTopic_BarGraphs[[35]] # ngram = 4 & topics = 3

TopNgrams_ByTopic_BarGraphs[[36]] # ngram = 4 & topics = 4

TopNgrams_ByTopic_BarGraphs[[46]] # ngram = 5 & topics = 3

TopNgrams_ByTopic_BarGraphs[[47]] # ngram = 5 & topics = 4
TopNgrams_ByTopic_BarGraphs %>% 
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  substr(x$labels$subtitle,
                         2,
                         (nchar(x$labels$subtitle) - 1)
                  ),
                  "_Top10Terms_facet.png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 6,
           units = "cm"
           )
    )
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

[[16]]
NULL

[[17]]
NULL

[[18]]
NULL

[[19]]
NULL

[[20]]
NULL

[[21]]
NULL

[[22]]
NULL

[[23]]
NULL

[[24]]
NULL

[[25]]
NULL

[[26]]
NULL

[[27]]
NULL

[[28]]
NULL

[[29]]
NULL

[[30]]
NULL

[[31]]
NULL

[[32]]
NULL

[[33]]
NULL

[[34]]
NULL

[[35]]
NULL

[[36]]
NULL

[[37]]
NULL

[[38]]
NULL

[[39]]
NULL

[[40]]
NULL

[[41]]
NULL

[[42]]
NULL

[[43]]
NULL

[[44]]
NULL

[[45]]
NULL

[[46]]
NULL

[[47]]
NULL

[[48]]
NULL

[[49]]
NULL

[[50]]
NULL

[[51]]
NULL

[[52]]
NULL

[[53]]
NULL

[[54]]
NULL

[[55]]
NULL

Creating a dataframe with `gamma` - the per-document-per-topic probability (i.e., the probability that each document (serv_req_id) is in each topic).  

I chose to do this for six different combinations of ngrams and topics (ngram = 3 & topic = 3, 3 & 4, 4 & 3, 4 & 4, 5 & 3, 5 & 4). This was chosen in part becasue after looking at the graphs produced above, these models seemed (by visual inspection) to perform better. It was also done in part becasue a portion of the analyses below requries defining the topics as `unknown`, `no_rats_found`, or `rats_found` by visual inspection of the graphs produced above. Six also seemed to be a good medium between too few and too many visual inspections to do.
rm(TopNgrams_ByTopic_BarGraphs)
top_terms[[55]] #lda model with ngram = 5 & topics = 12
$ngram
[1] 5

$topic
[1] 12

$top_terms
top_terms[[2]] #lda model with ngram = 1 & topics = 3
$ngram
[1] 1

$topic
[1] 3

$top_terms
top_terms[[3]] #lda model with ngram = 1 & topics = 4
$ngram
[1] 1

$topic
[1] 4

$top_terms
top_terms[[24]] #lda model with ngram = 3 & topics = 3
$ngram
[1] 3

$topic
[1] 3

$top_terms
top_terms[[25]] #lda model with ngram = 3 & topics = 4
$ngram
[1] 3

$topic
[1] 4

$top_terms
top_terms[[35]] #lda model with ngram = 4 & topics = 3
$ngram
[1] 4

$topic
[1] 3

$top_terms
top_terms[[36]] #lda model with ngram = 4 & topics = 4
$ngram
[1] 4

$topic
[1] 4

$top_terms
top_terms[[46]] #lda model with ngram = 5 & topics = 3
$ngram
[1] 5

$topic
[1] 3

$top_terms
top_terms[[47]] #lda model with ngram = 5 & topics = 4
$ngram
[1] 5

$topic
[1] 4

$top_terms
ProbDocInTopic_ngram03_topic03 <-
  list(ngram = lda_list[[24]]$ngram,
       topic = lda_list[[24]]$topic,
       data = tidy(lda_list[[24]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
                                       topic %in% c(2) ~ "no_rats_found",
                                       topic %in% c(3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[24]]$ngram,
                model_topic = lda_list[[24]]$topic
                )
       )
ProbDocInTopic_ngram03_topic04 <-
  list(ngram = lda_list[[25]]$ngram,
       topic = lda_list[[25]]$topic,
       data = tidy(lda_list[[25]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
                                       topic %in% c(2) ~ "no_rats_found",
                                       topic %in% c(3, 4) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[25]]$ngram,
                model_topic = lda_list[[25]]$topic
                )
       )
ProbDocInTopic_ngram04_topic03 <-
  list(ngram = lda_list[[35]]$ngram,
       topic = lda_list[[35]]$topic,
       data = tidy(lda_list[[35]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(#topic %in% c() ~ "unknown",
                                       topic %in% c(1) ~ "no_rats_found",
                                       topic %in% c(2, 3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[35]]$ngram,
                model_topic = lda_list[[35]]$topic
                )
       )
ProbDocInTopic_ngram04_topic04 <-
  list(ngram = lda_list[[36]]$ngram,
       topic = lda_list[[36]]$topic,
       data = tidy(lda_list[[36]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
                                       topic %in% c(1) ~ "no_rats_found",
                                       topic %in% c(2, 3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[36]]$ngram,
                model_topic = lda_list[[36]]$topic
                )
       )
ProbDocInTopic_ngram05_topic03 <-
  list(ngram = lda_list[[46]]$ngram,
       topic = lda_list[[46]]$topic,
       data = tidy(lda_list[[46]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(2) ~ "unknown",
                                       topic %in% c(3) ~ "no_rats_found",
                                       topic %in% c(1) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[46]]$ngram,
                model_topic = lda_list[[46]]$topic
                )
       )
ProbDocInTopic_ngram05_topic04 <-
  list(ngram = lda_list[[47]]$ngram,
       topic = lda_list[[47]]$topic,
       data = tidy(lda_list[[47]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
                                       topic %in% c(3) ~ "no_rats_found",
                                       topic %in% c(1, 2) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[47]]$ngram,
                model_topic = lda_list[[47]]$topic
                )
       )
Here, we put the six individual ProbDocInTopic models together, and add in some of the original information (e.g., the original `servicenotes`) for context.
ProbDocInTopic_AllModels <-
  bind_rows(ProbDocInTopic_ngram03_topic03[[3]],
            ProbDocInTopic_ngram03_topic04[[3]],
            ProbDocInTopic_ngram04_topic03[[3]],
            ProbDocInTopic_ngram04_topic04[[3]],
            ProbDocInTopic_ngram05_topic03[[3]],
            ProbDocInTopic_ngram05_topic04[[3]]
            ) %>% 
  arrange(document,
          model_ngram,
          model_topic,
          gamma
          ) %>% 
  rename("servicerequestid" = "document") %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            )
rm(list = ls(pattern = "ProbDocInTopic_ngram"))
str(ProbDocInTopic_AllModels)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   276822 obs. of  8 variables:
 $ servicerequestid: chr  "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
 $ topic           : int  2 3 1 2 3 4 1 2 3 1 ...
 $ gamma           : num  0.0142 0.0142 0.9717 0.018 0.018 ...
 $ topic_name      : chr  "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
 $ model_ngram     : int  3 3 3 3 3 3 3 4 4 4 ...
 $ model_topic     : int  3 3 3 4 4 4 4 3 3 3 ...
 $ servicenotes    : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
 $ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_AllModels, 100)
View(head(ProbDocInTopic_AllModels, 1000))
Remove the datafiles that are no longer needed.
rm(list = ls(pattern = "_list"))
rm(PerTopicPerNgram, top_terms)
Next, for each model (e.g., 3-gram 4-topic), we sum the probabilities given for each numeric topic, into the "rats topics" (e.g., `rats_found`) which were defined above via visual inspection of the graphs on the Top 10 ngrams in each numeric topic.  

I also pull out the 5-gram 4-topic model because it appeared (visually) to be the most accurate individual model.
ProbDocInTopic_ProbsSummed_ByModel <-
  ProbDocInTopic_AllModels %>% 
  group_by(servicerequestid,
           model_ngram,
           model_topic,
           topic_name
           ) %>% 
  summarise(prob_ = sum(gamma)
            ) %>% 
  ungroup() %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            )
str(ProbDocInTopic_ProbsSummed_ByModel)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   224119 obs. of  7 variables:
 $ servicerequestid: chr  "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
 $ model_ngram     : int  3 3 3 3 3 3 4 4 4 4 ...
 $ model_topic     : int  3 3 3 4 4 4 3 3 4 4 ...
 $ topic_name      : chr  "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
 $ prob_           : num  0.0142 0.0142 0.9717 0.018 0.0359 ...
 $ servicenotes    : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
 $ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_ProbsSummed_ByModel, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel, 1000))
ProbDocInTopic_ProbsSummed_05gram04topic <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  filter(model_ngram == 5 &
           model_topic == 4)
str(ProbDocInTopic_ProbsSummed_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   35538 obs. of  7 variables:
 $ servicerequestid: chr  "09-00001211" "09-00001211" "09-00001211" "09-00001410" ...
 $ model_ngram     : int  5 5 5 5 5 5 5 5 5 5 ...
 $ model_topic     : int  4 4 4 4 4 4 4 4 4 4 ...
 $ topic_name      : chr  "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
 $ prob_           : num  0.0208 0.0415 0.9377 0.0063 0.0126 ...
 $ servicenotes    : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "the rat are coming from an apartment building adjacent to the        alley.  there is alot of trash pilled up behind the apartm"| __truncated__ ...
 $ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
head(ProbDocInTopic_ProbsSummed_05gram04topic, 100)
View(head(ProbDocInTopic_ProbsSummed_05gram04topic, 1000))
Next, for each ngram-topic combination, I create histograms of the probabilities assigned to each topic.  This is done to help visualy determine if the topic assignments are clearly separating documents (serv_request_id values).  

A log10 transformation of the probability is done to help more clearly see any differences.
# str(ProbDocInTopic_ProbsSummed_ByModel)
ProbDocInTopic_ProbsSummed_ByModel_Details <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  mutate(model = paste0("0",
                        model_ngram,
                        "gram_",
                        "0",
                        model_topic,
                        "topic"
                        ),
         serviceorder_yr = year(serviceorderdate),
         model_and_yr = paste0(model,
                               "_",
                               as.character(serviceorder_yr)
                               )
         )
head(ProbDocInTopic_ProbsSummed_ByModel_Details, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel_Details, 1000))
TopicDistro_MainModels_Histogram_ByModel <-
  ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name)
      )
TopicDistro_MainModels_Histogram_ByModelYr <-
  ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model_and_yr) %>% 
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
        ggtitle(label = paste0(.x$model_and_yr,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name)
      )
Saving the histograms.
Removing unneeded files.
rm(list = ls(pattern = "TopicDistro_"))
Comparing how the histograms look with a regular y-scale vs a log10 y-scale.
ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() +
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name,
                   scales = "fixed"
                   )
      )
$`03gram_03topic`

$`03gram_04topic`

$`04gram_03topic`

$`04gram_04topic`

$`05gram_03topic`

$`05gram_04topic`

ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        # scale_y_log10() +
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "counts"
             ) +
        coord_cartesian(ylim = c(0, 5000)
                        ) +
        facet_wrap(~topic_name,
                   scales = "fixed"
                   )
      )
$`03gram_03topic`

$`03gram_04topic`

$`04gram_03topic`

$`04gram_04topic`

$`05gram_03topic`

$`05gram_04topic`

Then, for each `servicerequestid` and `topic_name` we calculate the mean topic probability across all the models. NOTE: This step could be modified more as my instinct is that more weight should probably be given to the models with larger ngrams and topics (e.g., the 5-gram & 4-topic model). However, using larger values of n-grams will not be able to analyze those records that do not have at least `n` words. Meaning that smaller values of n-grams can analyze more documents, but possibly less accurately.
ProbDocInTopic_MeanProb_BySrvcRqstId <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  group_by(servicerequestid,
           topic_name
           ) %>% 
  summarise(MeanProb = mean(prob_, na.rm = TRUE)
            ) %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            ) %>% 
  arrange(servicerequestid,
          desc(MeanProb)
          )
str(ProbDocInTopic_MeanProb_BySrvcRqstId)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 43629 obs. of  5 variables:
 $ servicerequestid: chr  "09-00001211" "09-00001211" "09-00001211" "09-00001323" ...
 $ topic_name      : chr  "unknown" "no_rats_found" "rats_found" "unknown" ...
 $ MeanProb        : num  0.581 0.333 0.183 0.479 0.466 ...
 $ servicenotes    : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" ...
 $ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
 - attr(*, "vars")= chr "servicerequestid"
 - attr(*, "indices")=List of 14543
  ..$ : int  0 1 2
  ..$ : int  3 4 5
  ..$ : int  6 7 8
  ..$ : int  9 10 11
  ..$ : int  12 13 14
  ..$ : int  15 16 17
  ..$ : int  18 19 20
  ..$ : int  21 22 23
  ..$ : int  24 25 26
  ..$ : int  27 28 29
  ..$ : int  30 31 32
  ..$ : int  33 34 35
  ..$ : int  36 37 38
  ..$ : int  39 40 41
  ..$ : int  42 43 44
  ..$ : int  45 46 47
  ..$ : int  48 49 50
  ..$ : int  51 52 53
  ..$ : int  54 55 56
  ..$ : int  57 58 59
  ..$ : int  60 61 62
  ..$ : int  63 64 65
  ..$ : int  66 67 68
  ..$ : int  69 70 71
  ..$ : int  72 73 74
  ..$ : int  75 76 77
  ..$ : int  78 79 80
  ..$ : int  81 82 83
  ..$ : int  84 85 86
  ..$ : int  87 88 89
  ..$ : int  90 91 92
  ..$ : int  93 94 95
  ..$ : int  96 97 98
  ..$ : int  99 100 101
  ..$ : int  102 103 104
  ..$ : int  105 106 107
  ..$ : int  108 109 110
  ..$ : int  111 112 113
  ..$ : int  114 115 116
  ..$ : int  117 118 119
  ..$ : int  120 121 122
  ..$ : int  123 124 125
  ..$ : int  126 127 128
  ..$ : int  129 130 131
  ..$ : int  132 133 134
  ..$ : int  135 136 137
  ..$ : int  138 139 140
  ..$ : int  141 142 143
  ..$ : int  144 145 146
  ..$ : int  147 148 149
  ..$ : int  150 151 152
  ..$ : int  153 154 155
  ..$ : int  156 157 158
  ..$ : int  159 160 161
  ..$ : int  162 163 164
  ..$ : int  165 166 167
  ..$ : int  168 169 170
  ..$ : int  171 172 173
  ..$ : int  174 175 176
  ..$ : int  177 178 179
  ..$ : int  180 181 182
  ..$ : int  183 184 185
  ..$ : int  186 187 188
  ..$ : int  189 190 191
  ..$ : int  192 193 194
  ..$ : int  195 196 197
  ..$ : int  198 199 200
  ..$ : int  201 202 203
  ..$ : int  204 205 206
  ..$ : int  207 208 209
  ..$ : int  210 211 212
  ..$ : int  213 214 215
  ..$ : int  216 217 218
  ..$ : int  219 220 221
  ..$ : int  222 223 224
  ..$ : int  225 226 227
  ..$ : int  228 229 230
  ..$ : int  231 232 233
  ..$ : int  234 235 236
  ..$ : int  237 238 239
  ..$ : int  240 241 242
  ..$ : int  243 244 245
  ..$ : int  246 247 248
  ..$ : int  249 250 251
  ..$ : int  252 253 254
  ..$ : int  255 256 257
  ..$ : int  258 259 260
  ..$ : int  261 262 263
  ..$ : int  264 265 266
  ..$ : int  267 268 269
  ..$ : int  270 271 272
  ..$ : int  273 274 275
  ..$ : int  276 277 278
  ..$ : int  279 280 281
  ..$ : int  282 283 284
  ..$ : int  285 286 287
  ..$ : int  288 289 290
  ..$ : int  291 292 293
  ..$ : int  294 295 296
  .. [list output truncated]
 - attr(*, "group_sizes")= int  3 3 3 3 3 3 3 3 3 3 ...
 - attr(*, "biggest_group_size")= int 3
 - attr(*, "labels")='data.frame':  14543 obs. of  1 variable:
  ..$ servicerequestid: chr  "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
  ..- attr(*, "vars")= chr "servicerequestid"
tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 100)
View(head(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
View(tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
Here, we create a dataset suitable for graphing and analyzing, by simply selecting the highest topic probability (the `MeanProb` value) assigned to each topic. We also do this for the single 5-gram 4-topic model.
# ProbDocInTopic_AllModels
# ProbDocInTopic_ProbsSummed_ByModel
# ProbDocInTopic_MeanProb_BySrvcRqstId
# 
# 
# ProbDocInTopic_AllModels %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_ProbsSummed_ByModel %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_MeanProb_BySrvcRqstId %>% select(servicerequestid) %>% distinct() %>% nrow
TopProb_BySrvcRqstId <-
  ProbDocInTopic_MeanProb_BySrvcRqstId %>% 
  mutate(serviceorder_yr = year(serviceorderdate),
         # serviceorder_yr2 = as.factor(serviceorder_yr),
         yr_group = paste0(as.character(serviceorder_yr),
                           "_",
                           as.character(topic_name)
                           ),
         model = "AllModels_MeanProb"
         ) %>% 
  rename("prob" = "MeanProb") %>% 
  group_by(servicerequestid) %>% 
  top_n(1,
        prob
        ) %>% 
  ungroup() %>% 
  arrange(prob)
str(TopProb_BySrvcRqstId)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   14543 obs. of  8 variables:
 $ servicerequestid: chr  "09-00119541" "09-00029707" "09-00053342" "09-00040528" ...
 $ topic_name      : chr  "no_rats_found" "unknown" "rats_found" "rats_found" ...
 $ prob            : num  0.36 0.365 0.365 0.366 0.366 ...
 $ servicenotes    : chr  "3-8-01  baited 2 burrows   front yard    3-30-01   baited 2 rat burrows  RH/GB    4-6-01    baited 1 burrow in front gb rh" "3350 C ST SE, NO RAT BURROWS WERE FOUND ON THE PROPERTY. AT 3346 & 3348 C ST SE BAITED REAR GARAGE. RICKS/DD" "THE VECTOR CONTROL BRANCH BAITED 2 BURROWS, RIGHT FRONT STEPS. FOLLOW UP BAITING WILL TAKE PLACE SOON; RICKS/PRICE;  DD" "THE VECTOR CONTROL BRANCH BAITED THE REAR YARD; 10 RAT BURROWS WERE FOUND. FOLLOW-UP BAITING WILL TAKE PLACE SHORTLY.HERRINGTON"| __truncated__ ...
 $ serviceorderdate: POSIXct, format: "2001-02-22 12:25:00" "1999-10-28 16:11:00" ...
 $ serviceorder_yr : num  2001 1999 2000 2000 2000 ...
 $ yr_group        : chr  "2001_no_rats_found" "1999_unknown" "2000_rats_found" "2000_rats_found" ...
 $ model           : chr  "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" ...
TopProb_BySrvcRqstId
View(TopProb_BySrvcRqstId)
TopProb_BySrvcRqstId_05gram04topic <- 
  ProbDocInTopic_ProbsSummed_05gram04topic %>% 
  mutate(serviceorder_yr = year(serviceorderdate),
         # serviceorder_yr2 = as.factor(serviceorder_yr),
         yr_group = paste0(as.character(serviceorder_yr),
                           "_",
                           as.character(topic_name)
                           ),
         model = "5gram4topic"
         ) %>% 
  rename("prob" = "prob_") %>% 
  group_by(servicerequestid) %>% 
  top_n(1,
        prob
        ) %>% 
  ungroup() %>% 
  arrange(prob)
str(TopProb_BySrvcRqstId_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   11846 obs. of  10 variables:
 $ servicerequestid: chr  "10-00147588" "11-00106490" "10-00165933" "11-00138104" ...
 $ model_ngram     : int  5 5 5 5 5 5 5 5 5 5 ...
 $ model_topic     : int  4 4 4 4 4 4 4 4 4 4 ...
 $ topic_name      : chr  "rats_found" "unknown" "unknown" "unknown" ...
 $ prob            : num  0.343 0.383 0.391 0.414 0.442 ...
 $ servicenotes    : chr  "On 5/24/10 G Cornes baited 10 rat burrows in front of bldg around sidewalk." "On 4/27/11@9:11am L rogers baited 4 rat burrows in the rear yd by shed. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 4oz, "| __truncated__ "On 5/18/10 10:25AM G Curtis baited 2 rat burrows in the alley and tree box" "On 5/18/11@10:45am L Rogers baited 1 rat burrow at tree box. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 1oz, Gloves & Sh"| __truncated__ ...
 $ serviceorderdate: POSIXct, format: "2010-04-29 14:45:58" "2011-04-15 03:57:15" ...
 $ serviceorder_yr : num  2010 2011 2010 2011 2010 ...
 $ yr_group        : chr  "2010_rats_found" "2011_unknown" "2010_unknown" "2011_unknown" ...
 $ model           : chr  "5gram4topic" "5gram4topic" "5gram4topic" "5gram4topic" ...
TopProb_BySrvcRqstId_05gram04topic
View(TopProb_BySrvcRqstId_05gram04topic)
  
Now, we can simply create the freqpoly plots and density plots for each year-topic combination to investigate how the topic assignment did over time, and then save them.
# str(TopProb_BySrvcRqstId)
# str(TopProb_BySrvcRqstId_05gram04topic)
TopicDistro_AllModelsMean_Freqpoly <-
  TopProb_BySrvcRqstId %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_freqpoly(binwidth = 0.05,
                      alpha = 0.6
                      ) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Freqpoly_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )
TopicDistro_AllModelsMean_Density <-
  TopProb_BySrvcRqstId %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_density() +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Density_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )
TopicDistro_05gram04topic_Freqpoly <-
  TopProb_BySrvcRqstId_05gram04topic %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_freqpoly(binwidth = 0.05,
                      alpha = 0.6
                      ) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Freqpoly_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )
TopicDistro_05gram04topic_Density <-
  TopProb_BySrvcRqstId_05gram04topic %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_density() +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Density_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )
# str(TopicDistro_AllModelsMean_Freqpoly[[18]])
# str(TopicDistro_AllModelsMean_Density[[18]])
# str(TopicDistro_05gram04topic_Freqpoly[[18]])
# str(TopicDistro_05gram04topic_Density[[18]])
TopicDistro_AllModelsMean_Freqpoly[[18]]

TopicDistro_AllModelsMean_Density[[18]]

TopicDistro_05gram04topic_Freqpoly[[18]]

TopicDistro_05gram04topic_Density[[18]]

Saving the visuals created above.
Removing unneeded files.
rm(list = ls(pattern = "TopicDistro_"))
As another method to determine the "correct" topic assignment, here I simply count the number of times a topic assignment was given to each document (servicerequestid), and assign the "correct" topic to the topic with the most assignments. In the case of ties (e.g., all three topics, each assigned twice), I use the `MeanProb` calculated above in the `ProbDocInTopic_MeanProb_BySrvcRqstId` dataframe previously.
str(ProbDocInTopic_ProbsSummed_ByModel)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   224119 obs. of  7 variables:
 $ servicerequestid: chr  "09-00001211" "09-00001211" "09-00001211" "09-00001211" ...
 $ model_ngram     : int  3 3 3 3 3 3 4 4 4 4 ...
 $ model_topic     : int  3 3 3 4 4 4 3 3 4 4 ...
 $ topic_name      : chr  "no_rats_found" "rats_found" "unknown" "no_rats_found" ...
 $ prob_           : num  0.0142 0.0142 0.9717 0.018 0.0359 ...
 $ servicenotes    : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" ...
 $ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
View(ProbDocInTopic_ProbsSummed_ByModel)
# ProbDocInTopic_ProbsSummed_ByModel %>% select(model_ngram, model_topic) %>% distinct()
str(ProbDocInTopic_MeanProb_BySrvcRqstId)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 43629 obs. of  5 variables:
 $ servicerequestid: chr  "09-00001211" "09-00001211" "09-00001211" "09-00001323" ...
 $ topic_name      : chr  "unknown" "no_rats_found" "rats_found" "unknown" ...
 $ MeanProb        : num  0.581 0.333 0.183 0.479 0.466 ...
 $ servicenotes    : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" ...
 $ serviceorderdate: POSIXct, format: "1999-04-27 12:59:00" "1999-04-27 12:59:00" ...
 - attr(*, "vars")= chr "servicerequestid"
 - attr(*, "indices")=List of 14543
  ..$ : int  0 1 2
  ..$ : int  3 4 5
  ..$ : int  6 7 8
  ..$ : int  9 10 11
  ..$ : int  12 13 14
  ..$ : int  15 16 17
  ..$ : int  18 19 20
  ..$ : int  21 22 23
  ..$ : int  24 25 26
  ..$ : int  27 28 29
  ..$ : int  30 31 32
  ..$ : int  33 34 35
  ..$ : int  36 37 38
  ..$ : int  39 40 41
  ..$ : int  42 43 44
  ..$ : int  45 46 47
  ..$ : int  48 49 50
  ..$ : int  51 52 53
  ..$ : int  54 55 56
  ..$ : int  57 58 59
  ..$ : int  60 61 62
  ..$ : int  63 64 65
  ..$ : int  66 67 68
  ..$ : int  69 70 71
  ..$ : int  72 73 74
  ..$ : int  75 76 77
  ..$ : int  78 79 80
  ..$ : int  81 82 83
  ..$ : int  84 85 86
  ..$ : int  87 88 89
  ..$ : int  90 91 92
  ..$ : int  93 94 95
  ..$ : int  96 97 98
  ..$ : int  99 100 101
  ..$ : int  102 103 104
  ..$ : int  105 106 107
  ..$ : int  108 109 110
  ..$ : int  111 112 113
  ..$ : int  114 115 116
  ..$ : int  117 118 119
  ..$ : int  120 121 122
  ..$ : int  123 124 125
  ..$ : int  126 127 128
  ..$ : int  129 130 131
  ..$ : int  132 133 134
  ..$ : int  135 136 137
  ..$ : int  138 139 140
  ..$ : int  141 142 143
  ..$ : int  144 145 146
  ..$ : int  147 148 149
  ..$ : int  150 151 152
  ..$ : int  153 154 155
  ..$ : int  156 157 158
  ..$ : int  159 160 161
  ..$ : int  162 163 164
  ..$ : int  165 166 167
  ..$ : int  168 169 170
  ..$ : int  171 172 173
  ..$ : int  174 175 176
  ..$ : int  177 178 179
  ..$ : int  180 181 182
  ..$ : int  183 184 185
  ..$ : int  186 187 188
  ..$ : int  189 190 191
  ..$ : int  192 193 194
  ..$ : int  195 196 197
  ..$ : int  198 199 200
  ..$ : int  201 202 203
  ..$ : int  204 205 206
  ..$ : int  207 208 209
  ..$ : int  210 211 212
  ..$ : int  213 214 215
  ..$ : int  216 217 218
  ..$ : int  219 220 221
  ..$ : int  222 223 224
  ..$ : int  225 226 227
  ..$ : int  228 229 230
  ..$ : int  231 232 233
  ..$ : int  234 235 236
  ..$ : int  237 238 239
  ..$ : int  240 241 242
  ..$ : int  243 244 245
  ..$ : int  246 247 248
  ..$ : int  249 250 251
  ..$ : int  252 253 254
  ..$ : int  255 256 257
  ..$ : int  258 259 260
  ..$ : int  261 262 263
  ..$ : int  264 265 266
  ..$ : int  267 268 269
  ..$ : int  270 271 272
  ..$ : int  273 274 275
  ..$ : int  276 277 278
  ..$ : int  279 280 281
  ..$ : int  282 283 284
  ..$ : int  285 286 287
  ..$ : int  288 289 290
  ..$ : int  291 292 293
  ..$ : int  294 295 296
  .. [list output truncated]
 - attr(*, "group_sizes")= int  3 3 3 3 3 3 3 3 3 3 ...
 - attr(*, "biggest_group_size")= int 3
 - attr(*, "labels")='data.frame':  14543 obs. of  1 variable:
  ..$ servicerequestid: chr  "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
  ..- attr(*, "vars")= chr "servicerequestid"
View(ProbDocInTopic_MeanProb_BySrvcRqstId)
TopicAssigned_ByCounts_ByMeanProb <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  group_by(servicerequestid,
           model_ngram,
           model_topic
           ) %>% 
  top_n(1,
        prob_
        ) %>% 
  ungroup() %>% 
  count(servicerequestid,
        topic_name
        ) %>% 
  left_join(ProbDocInTopic_MeanProb_BySrvcRqstId,
            by = c("servicerequestid" = "servicerequestid",
                   "topic_name" = "topic_name"
                   )
            ) %>% 
  select(servicerequestid,
         topic_name,
         n,
         MeanProb
         ) %>% 
  group_by(servicerequestid) %>%
  arrange(servicerequestid,
          desc(n),
          desc(MeanProb)
          ) %>% 
  ungroup() %>% 
  group_by(servicerequestid) %>%
  mutate(RowNum = row_number()
         ) %>% 
  ungroup() %>% 
  filter(RowNum == 1) %>% 
  rename(times_topic_assigned = n) %>% 
  left_join(ServiceNotesCleaned,
            by = "servicerequestid"
            ) %>% 
  select(servicerequestid,
         topic_name,
         times_topic_assigned,
         MeanProb,
         servicenotes,
         servicenotes_nonums_nopunc,
         serviceorderdate,
         serviceorder_date,
         serviceorder_yr,
         serviceorder_yr_posix,
         serviceorder_mth,
         serviceorder_yrmth,
         serviceorder_yrmth_posix,
         serviceorder_day,
         serviceorder_wkday
         )
str(ServiceNotesCleaned)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   15581 obs. of  17 variables:
 $ servicerequestid          : chr  "10-00020177" "10-00020346" "10-00020514" "10-00020754" ...
 $ servicepriority           : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ servicecode               : chr  "S0311" "S0311" "S0311" "S0311" ...
 $ servicecodedescription    : chr  "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
 $ servicetypecode           : chr  "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
 $ servicetypecodedescription: chr  "DOH" "DOH" "DOH" "DOH" ...
 $ serviceorderdate          : POSIXct, format: "2010-01-21 15:46:23" "2010-01-21 16:32:51" ...
 $ servicenotes              : chr  "On 1/25/10 L Rogers baited 3 rat burrows in the rear yd." "On 1/26/10 G Cornes found no rat burrows in the area." "On 1/28/10 L Rogers baited 4 rat burrows in the rear yd." "On 1/26/10 V Purvis baited 9 rat burrows in the park." ...
 $ serviceorder_date         : Date, format: "2010-01-21" "2010-01-21" ...
 $ serviceorder_yr           : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
 $ serviceorder_yr_posix     : POSIXct, format: "2010-01-01" "2010-01-01" ...
 $ serviceorder_mth          : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
 $ serviceorder_yrmth        : chr  "2010-01" "2010-01" "2010-01" "2010-01" ...
 $ serviceorder_yrmth_posix  : POSIXct, format: "2010-01-01" "2010-01-01" ...
 $ serviceorder_day          : int  21 21 21 21 21 22 22 22 22 22 ...
 $ serviceorder_wkday        : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 5 5 5 5 5 6 6 6 6 6 ...
 $ servicenotes_nonums_nopunc: chr  "   rogers baited  rat burrows rear yd" "   cornes found no rat burrows" "   rogers baited  rat burrows rear yd" "   purvis baited  rat burrows park" ...
str(TopicAssigned_ByCounts_ByMeanProb)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   14543 obs. of  15 variables:
 $ servicerequestid          : chr  "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
 $ topic_name                : chr  "unknown" "unknown" "rats_found" "no_rats_found" ...
 $ times_topic_assigned      : int  3 1 4 4 3 3 3 4 2 2 ...
 $ MeanProb                  : num  0.581 0.479 0.662 0.651 0.593 ...
 $ servicenotes              : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the        alley.  there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
 $ servicenotes_nonums_nopunc: chr  "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited  perry st ne rats   " ...
 $ serviceorderdate          : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
 $ serviceorder_date         : Date, format: "1999-04-27" "1999-04-30" ...
 $ serviceorder_yr           : num  1999 1999 1999 1999 1999 ...
 $ serviceorder_yr_posix     : POSIXct, format: "1999-01-01" "1999-01-01" ...
 $ serviceorder_mth          : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
 $ serviceorder_yrmth        : chr  "1999-04" "1999-04" "1999-05" "1999-05" ...
 $ serviceorder_yrmth_posix  : POSIXct, format: "1999-04-01" "1999-04-01" ...
 $ serviceorder_day          : int  27 30 6 14 19 21 26 28 3 8 ...
 $ serviceorder_wkday        : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
head(TopicAssigned_ByCounts_ByMeanProb, 1000)
View(TopicAssigned_ByCounts_ByMeanProb)
So now, we can compare two different models, each being slight variations of six LDA models.  

1) `TopProb_BySrvcRqstId` assigns the topic by taking the mean topic probability score across all six models.
2) `TopicAssigned_ByCounts_ByMeanProb` assigns the topic by taking the most frequently assigned topic across all six models.
str(TopicAssigned_ByCounts_ByMeanProb)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   14543 obs. of  15 variables:
 $ servicerequestid          : chr  "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
 $ topic_name                : chr  "unknown" "unknown" "rats_found" "no_rats_found" ...
 $ times_topic_assigned      : int  3 1 4 4 3 3 3 4 2 2 ...
 $ MeanProb                  : num  0.581 0.479 0.662 0.651 0.593 ...
 $ servicenotes              : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the        alley.  there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
 $ servicenotes_nonums_nopunc: chr  "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited  perry st ne rats   " ...
 $ serviceorderdate          : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
 $ serviceorder_date         : Date, format: "1999-04-27" "1999-04-30" ...
 $ serviceorder_yr           : num  1999 1999 1999 1999 1999 ...
 $ serviceorder_yr_posix     : POSIXct, format: "1999-01-01" "1999-01-01" ...
 $ serviceorder_mth          : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
 $ serviceorder_yrmth        : chr  "1999-04" "1999-04" "1999-05" "1999-05" ...
 $ serviceorder_yrmth_posix  : POSIXct, format: "1999-04-01" "1999-04-01" ...
 $ serviceorder_day          : int  27 30 6 14 19 21 26 28 3 8 ...
 $ serviceorder_wkday        : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
str(TopProb_BySrvcRqstId)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   14543 obs. of  8 variables:
 $ servicerequestid: chr  "09-00119541" "09-00029707" "09-00053342" "09-00040528" ...
 $ topic_name      : chr  "no_rats_found" "unknown" "rats_found" "rats_found" ...
 $ prob            : num  0.36 0.365 0.365 0.366 0.366 ...
 $ servicenotes    : chr  "3-8-01  baited 2 burrows   front yard    3-30-01   baited 2 rat burrows  RH/GB    4-6-01    baited 1 burrow in front gb rh" "3350 C ST SE, NO RAT BURROWS WERE FOUND ON THE PROPERTY. AT 3346 & 3348 C ST SE BAITED REAR GARAGE. RICKS/DD" "THE VECTOR CONTROL BRANCH BAITED 2 BURROWS, RIGHT FRONT STEPS. FOLLOW UP BAITING WILL TAKE PLACE SOON; RICKS/PRICE;  DD" "THE VECTOR CONTROL BRANCH BAITED THE REAR YARD; 10 RAT BURROWS WERE FOUND. FOLLOW-UP BAITING WILL TAKE PLACE SHORTLY.HERRINGTON"| __truncated__ ...
 $ serviceorderdate: POSIXct, format: "2001-02-22 12:25:00" "1999-10-28 16:11:00" ...
 $ serviceorder_yr : num  2001 1999 2000 2000 2000 ...
 $ yr_group        : chr  "2001_no_rats_found" "1999_unknown" "2000_rats_found" "2000_rats_found" ...
 $ model           : chr  "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" "AllModels_MeanProb" ...
str(TopProb_BySrvcRqstId_05gram04topic)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   11846 obs. of  10 variables:
 $ servicerequestid: chr  "10-00147588" "11-00106490" "10-00165933" "11-00138104" ...
 $ model_ngram     : int  5 5 5 5 5 5 5 5 5 5 ...
 $ model_topic     : int  4 4 4 4 4 4 4 4 4 4 ...
 $ topic_name      : chr  "rats_found" "unknown" "unknown" "unknown" ...
 $ prob            : num  0.343 0.383 0.391 0.414 0.442 ...
 $ servicenotes    : chr  "On 5/24/10 G Cornes baited 10 rat burrows in front of bldg around sidewalk." "On 4/27/11@9:11am L rogers baited 4 rat burrows in the rear yd by shed. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 4oz, "| __truncated__ "On 5/18/10 10:25AM G Curtis baited 2 rat burrows in the alley and tree box" "On 5/18/11@10:45am L Rogers baited 1 rat burrow at tree box. Eaton Bait Block/Peanut Butter, EPA# 56-42, .005% 1oz, Gloves & Sh"| __truncated__ ...
 $ serviceorderdate: POSIXct, format: "2010-04-29 14:45:58" "2011-04-15 03:57:15" ...
 $ serviceorder_yr : num  2010 2011 2010 2011 2010 ...
 $ yr_group        : chr  "2010_rats_found" "2011_unknown" "2010_unknown" "2011_unknown" ...
 $ model           : chr  "5gram4topic" "5gram4topic" "5gram4topic" "5gram4topic" ...
dim(TopicAssigned_ByCounts_ByMeanProb)
[1] 14543    15
dim(TopProb_BySrvcRqstId)
[1] 14543     8
dim(TopProb_BySrvcRqstId_05gram04topic)
[1] 11846    10
a <- TopProb_BySrvcRqstId %>% 
  select(servicerequestid,
         topic_name,
         prob
         ) %>% 
  rename(topic_name_meanprob = topic_name,
         prob_meanprob = prob
         )
b <- TopicAssigned_ByCounts_ByMeanProb %>% 
  rename(topic_name_topcounts = topic_name,
         times_topic_assigned_topcounts = times_topic_assigned,
         prob_topcounts = MeanProb
         )
ModelsCompare <- a %>% 
  left_join(b,
            by = "servicerequestid"
            )
rm(a, b)
str(ModelsCompare)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   14543 obs. of  17 variables:
 $ servicerequestid              : chr  "09-00119541" "09-00029707" "09-00053342" "09-00040528" ...
 $ topic_name_meanprob           : chr  "no_rats_found" "unknown" "rats_found" "rats_found" ...
 $ prob_meanprob                 : num  0.36 0.365 0.365 0.366 0.366 ...
 $ topic_name_topcounts          : chr  "no_rats_found" "unknown" "rats_found" "rats_found" ...
 $ times_topic_assigned_topcounts: int  3 2 2 2 2 2 2 2 2 2 ...
 $ prob_topcounts                : num  0.36 0.365 0.365 0.366 0.366 ...
 $ servicenotes                  : chr  "3-8-01  baited 2 burrows   front yard    3-30-01   baited 2 rat burrows  RH/GB    4-6-01    baited 1 burrow in front gb rh" "3350 C ST SE, NO RAT BURROWS WERE FOUND ON THE PROPERTY. AT 3346 & 3348 C ST SE BAITED REAR GARAGE. RICKS/DD" "THE VECTOR CONTROL BRANCH BAITED 2 BURROWS, RIGHT FRONT STEPS. FOLLOW UP BAITING WILL TAKE PLACE SOON; RICKS/PRICE;  DD" "THE VECTOR CONTROL BRANCH BAITED THE REAR YARD; 10 RAT BURROWS WERE FOUND. FOLLOW-UP BAITING WILL TAKE PLACE SHORTLY.HERRINGTON"| __truncated__ ...
 $ servicenotes_nonums_nopunc    : chr  "   baited  burrows front yard    baited  rat burrows rh gb    baited  burrow front gb rh" " st se no rat burrows found property   st se baited rear garage ricks dd" "vector control branch baited  burrows front steps follow baiting ricks price dd" "vector control branch baited rear yard  rat burrows found follow baiting shortlyherrington brown dd" ...
 $ serviceorderdate              : POSIXct, format: "2001-02-22 12:25:00" "1999-10-28 16:11:00" ...
 $ serviceorder_date             : Date, format: "2001-02-22" "1999-10-28" ...
 $ serviceorder_yr               : num  2001 1999 2000 2000 2000 ...
 $ serviceorder_yr_posix         : POSIXct, format: "2001-01-01" "1999-01-01" ...
 $ serviceorder_mth              : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 2 10 3 1 1 4 4 5 5 5 ...
 $ serviceorder_yrmth            : chr  "2001-02" "1999-10" "2000-03" "2000-01" ...
 $ serviceorder_yrmth_posix      : POSIXct, format: "2001-02-01" "1999-10-01" ...
 $ serviceorder_day              : int  22 28 22 13 21 17 1 13 16 24 ...
 $ serviceorder_wkday            : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 5 5 4 5 6 2 4 5 1 2 ...
ModelsCompare
View(ModelsCompare)
When comparing the two models, they coincide nearly 99% of the time...but the disagreements still raise questions.
filter(ModelsCompare,
       topic_name_meanprob != topic_name_topcounts
       ) %>% 
  nrow() / nrow(ModelsCompare) * 100
[1] 1.155195
filter(ModelsCompare,
       topic_name_meanprob == topic_name_topcounts
       ) %>% 
  nrow() / nrow(ModelsCompare) * 100
[1] 98.84481
# View(filter(ModelsCompare,
#             topic_name_meanprob != topic_name_topcounts
#             )
#      )
View(filter(ModelsCompare,
            topic_name_meanprob != topic_name_topcounts
            ) %>% 
       select(servicerequestid,
              topic_name_meanprob,
              topic_name_topcounts,
              servicenotes,
              servicenotes_nonums_nopunc
              )
     )
As the models assign topics almost identically, let's use the `topcounts` model to do some quick inspections about how often each of the topics were assigned, when, changes over time, etc.
Counts_AllYears <- ModelsCompare %>% 
  mutate(topic_name = factor(topic_name_topcounts,
                             levels = c("unknown",
                                        "no_rats_found",
                                        "rats_found"
                                        )
                             )
         ) %>% 
group_by(topic_name) %>% 
  count() %>% 
  rename(counts = n)
Counts_AcrossYears <- ModelsCompare %>% 
  mutate(topic_name = factor(topic_name_topcounts,
                             levels = c("unknown",
                                        "no_rats_found",
                                        "rats_found"
                                        )
                             )
         ) %>% 
group_by(topic_name,
         serviceorder_yr
         ) %>% 
  count() %>% 
  rename(counts = n)
ggplot(data = Counts_AllYears,
       aes(x = topic_name,
           y = counts,
           fill = topic_name
           )
       ) +
  geom_col() +
  geom_text(aes(label = counts),
            nudge_y = -200,
            size = 3
            ) +
  # theme_minimal() +
  theme(legend.position = "none") +
  coord_flip()
 ggsave(paste0(wd,
               "/Viz/",
               # "New_",
               "Topics_CountModel_Counts_AllYears.png"
                  ),
        scale = 4,
        width = 6,
        height = 6,
        units = "cm"
        )

ggplot(data = Counts_AcrossYears,
       aes(x = topic_name,
           y = counts,
           fill = topic_name
           )
       ) +
  geom_col() +
  geom_text(aes(label = counts),
            nudge_y = 100,
            size = 2.5
            ) +
  scale_y_continuous(limits = c(0, 1800),
                     breaks = seq(0, 1800, 300)
                     ) +
  facet_wrap(~serviceorder_yr) +
  theme(legend.position = "none") +
  coord_flip()
 ggsave(paste0(wd,
               "/Viz/",
               # "New_",
               "Topics_CountModel_Counts_AcrossYears.png"
                  ),
        scale = 4,
        width = 8,
        height = 6,
        units = "cm"
        )

---
title: "Example of Using Topic Modeling (via Latent Dirichlet allocation) to Pull Out Features from 311 Service Notes"
output: html_notebook
---
  
    
    Load the relevant libraries.
```{r, message=FALSE, warning=FALSE}

# rm(list = ls())


library("tidyverse")          # data manipulation
library("magrittr")           # data manipulation (pipeing data)
library("stringr")            # string manipulation
library("lubridate")          # date manipulation
library("tidytext")           # text manipulation
library("topicmodels")        # topic modeling
library("ggplot2")            # viz
library("doParallel")         # parallel processing
library("ldatuning")          # estimating the proper number of topics
# library("scales")
# library("cowplot")            # grid plotting

```
  
    
    Session Info.
```{r}

sessionInfo()

```
  
    
    Setup the root directory.
```{r "setup", include = FALSE}

require("knitr")

opts_knit$set(root.dir = "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon")

```
  
    
    Setting `wd` as the working directory.
```{r}

wd <- getwd()

wd

```
  
    
    Get the raw data. Because of trouble maintaining a connection to Dropbox via R, I first downloaded the raw data from [https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0](https://www.dropbox.com/sh/4j7q53lltasez3h/AABQrwrQj3aenqG42wbW7-qra/years_combined/dc_311-2017-01-16.csv?dl=0) and saved the file locally.
```{r, warning=FALSE}

Raw311Data <- read_csv(paste0(wd,
                              "/Data_Raw/dc_311-2017-01-16.csv"
                              # "/Data_Raw/dc_311-2017-10-07.csv"
                              ),
                       progress = FALSE
                       )

# saving is done to avoid having to download all the data again
saveRDS(Raw311Data,
        paste0(wd,
               "/Data_Processed/",
               "Raw311Data.Rds"
               )
        )

str(Raw311Data)
tail(Raw311Data, 500)

```
  
    
    Un-comment the chunk below to load the saved raw data (to avoid having to download the raw data again).
```{r}

# Raw311Data <- readRDS(paste0(wd,
#                              "/Data_Processed/",
#                              "Raw311Data.Rds"
#                              )
#                       )
# 
# str(Raw311Data)
# tail(Raw311Data, 500)
# View(tail(Raw311Data, 1000))

```
  
    
    Selecting those variables that may be useful to test breakdowns of topic modeling.  For example, running a topic model separately for the different levels of `servicecode`.
```{r}

SelectedVars <- select(Raw311Data,
                       SERVICEREQUESTID,
                       SERVICEPRIORITY,
                       SERVICECODE,
                       SERVICECODEDESCRIPTION,
                       SERVICETYPECODE,
                       SERVICETYPECODEDESCRIPTION,
                       SERVICEORDERDATE,
                       SERVICENOTES
                       )

names(SelectedVars) <- tolower(names(SelectedVars))

rm(Raw311Data)
str(SelectedVars)

```
  
    
    Quick visual inspection of filtering the data to only service calls with notes (i.e., removing NA values), and only those that are rat-related (`servicecode == "S0311`).  
      
    Removing NA values takes us from 5,093,887 rows to 3,018,411 rows.  
    
    Looking at only rat-related service calls takes us from 3,018,411 rows to 15,581 rows.
```{r}

NoNAServiceNotes <- filter(SelectedVars,
                           !is.na(servicenotes)
                           )

# message("SelectedVars")
nrow(SelectedVars)
# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)

rm(SelectedVars)
str(NoNAServiceNotes)

View(head(NoNAServiceNotes, 1000))



RatCalls <- filter(NoNAServiceNotes,
                   servicecode == "S0311"
                   )

# message("NoNAServiceNotes")
nrow(NoNAServiceNotes)
# message("RatCalls")
nrow(RatCalls)

rm(NoNAServiceNotes)
View(RatCalls)

```
  
    
    Add in time-related variables.
```{r}

RatCalls_Time <- RatCalls %>%
  mutate(serviceorder_date = as_date(serviceorderdate),
         serviceorder_yr = year(serviceorderdate),
         serviceorder_yr_posix = floor_date(serviceorderdate, "year"),
         serviceorder_mth = month(serviceorderdate, label = TRUE),
         serviceorder_yrmth = as.character(serviceorder_date) %>% 
           substr(1, 7),
         serviceorder_yrmth_posix = floor_date(serviceorderdate, "month"),
         serviceorder_day = day(serviceorderdate),
         serviceorder_wkday = wday(serviceorderdate, label = TRUE)
         )

rm(RatCalls)
str(RatCalls_Time)
tail(RatCalls_Time, 500)
View(tail(RatCalls_Time, 1000))

```
  
    
    Next we need to clean up the text of the `servicenotes` variable - this will be done in multiple steps.  
      
    As the first step, we'll remove common "stopwords" (e.g., is, the, and, etc.) as they won't be very useful in finding topics in the `servicenotes` text. Although it's a stopwords, we specifically do not remove the word "no" as it is often used to distinguish between "rats found" and "no rats found".
```{r}

NoStopWords_Unnest <- 
  RatCalls_Time %>% 
  select(servicerequestid,
         servicenotes
         ) %>% 
  unnest_tokens(word,
                servicenotes
                ) %>% 
  anti_join(filter(stop_words,
                   word != "no" # we don't remove the word "no" as it is often used to distinguish between "rats found" and "no rats found"
                   ),
            by = "word"
            )

Servicenotes_NoStopWrds <- NoStopWords_Unnest %>% 
  nest(word) %>% 
  mutate(servicenotes_nostop = map(data,
                                   unlist
                                   ),
         servicenotes_nostop = map_chr(servicenotes_nostop,
                                       paste,
                                       collapse = " "
                                       )
         ) %>% 
  select(-data)


Remove_StopWrds <- RatCalls_Time %>% 
  left_join(Servicenotes_NoStopWrds,
            by = "servicerequestid"
            )

dim(RatCalls_Time)
dim(Remove_StopWrds)

rm(NoStopWords_Unnest, Servicenotes_NoStopWrds)

head(Remove_StopWrds, 100)
View(head(Remove_StopWrds, 100))

```
  
    
    
```{r}

# str(Remove_StopWrds)

# View(Remove_StopWrds %>% 
#        filter(str_detect(servicenotes_nostop,
#                          "washington dc"
#                          )
#               ) %>% 
#        select(servicenotes,
#               servicenotes_nostop
#               )
#      )

```
  
    
    Then, we'll remove any numeric characters from 'servicenotes' to avoid distinctions not needed at this level (e.g., "baited 3 rat borrows" vs. "baited 6 rat burrows"). We'll also remove punctuation.
```{r}

ServiceNotesCleaned <- Remove_StopWrds %>% 
  mutate(servicenotes_nonums_nopunc = str_replace_all(servicenotes_nostop,
                                                      "[[:digit:]]",
                                                      ""
                                                      ) %>% 
           str_replace_all("[[:punct:]]",
                           ""
                           )
         ) %>% 
  select(-servicenotes_nostop)

dim(RatCalls_Time)
dim(Remove_StopWrds)
dim(ServiceNotesCleaned)

# View(select(ServiceNotesCleaned,
#             servicerequestid,
#             servicenotes,
#             servicenotes_nonums_nopunc
#             ) %>% 
#        filter(servicerequestid %in% nomatch$servicerequestid)
#      )

rm(RatCalls_Time, Remove_StopWrds)

head(ServiceNotesCleaned, 100)
View(head(ServiceNotesCleaned, 100))

```
  
    
    Now, we can inspect the frequencies of rat-related service requests.
```{r}

summary(ServiceNotesCleaned)
# summary(RatCalls_Time$serviceorderdate)
# library("psych")
# describe(RatCalls_Time$serviceorderdate)


ggplot_theme_basic <-
  theme(panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        # axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_line(size = 1, colour = "black")
        )

# ggplot(data = RatCalls_Time,
#        aes(x = serviceorder_date)
#        ) +
#   geom_histogram() +
#   ggplot_theme_basic

yr_counts <- ServiceNotesCleaned %>% 
  group_by(serviceorder_yr_posix) %>% 
  summarise(Cnt = n()
            ) %>% 
  arrange(serviceorder_yr_posix)

ggplot(data = yr_counts,
       aes(x = serviceorder_yr_posix,
           y = Cnt
           )
       ) +
  geom_col(fill = "light blue") + #, alpha = 0.1) +
  geom_text(aes(label = Cnt),
            nudge_y = 50,
            size = 3
            ) +
  labs(title = "Counts of non-NA ServiceNotes",
       # subtitle = "by year",
       x = "Year",
       y = "Count"
       ) +
  ggplot_theme_basic +
  theme(axis.text.x = element_text(angle = 90)
        ) +
  scale_x_datetime(date_breaks = "1 year")



yrmth_counts <- ServiceNotesCleaned %>% 
  group_by(serviceorder_yrmth_posix) %>% 
  summarise(Cnt = n()
            ) %>% 
  arrange(serviceorder_yrmth_posix)

ggplot(data = yrmth_counts,
       aes(x = serviceorder_yrmth_posix,
           y = Cnt
           )
       ) +
  geom_col(fill = "light blue") + #, alpha = 0.1) +
  # geom_text(aes(label = Cnt),
  #           nudge_y = 50,
  #           size = 3
  #           ) +
  labs(title = "Counts of non-NA ServiceNotes",
       x = "Year-Month",
       y = "Count"
       ) +
  ggplot_theme_basic +
  theme(axis.text.x = element_text(angle = 90)
        ) +
  coord_cartesian(xlim = c(as.POSIXct("1998-12-01"),
                           as.POSIXct("2017-12-01")
                           ),
                  expand = TRUE
                  ) +
  scale_x_datetime(date_breaks = "6 months")

```
  
    
    Based on the frequencies of when we actually have 'servicenotes' data, let's try limiting the dataset to service calls from 2010 or later. This reduces the dataset further, from 15,581 rows to 11,037 rows. 
```{r}

rm(yr_counts, yrmth_counts)


ServiceNotesCleanedAfter2010 <- ServiceNotesCleaned %>%
  filter(serviceorderdate >= as_date("2010-01-01")
         )

nrow(ServiceNotesCleaned)
nrow(ServiceNotesCleanedAfter2010)

summary(ServiceNotesCleanedAfter2010)

```
  
    
    Then, transform the `servicenotes` field into one row per n-gram. Because we don't know what level of 'n' to use, we'll cycle through the possibilities from n = 1 to n = 5.
```{r}

ngram_list <- 1:5

Rat_Ngram_list <- list()

Rat_Ngram_list <- lapply(ngram_list,
                         function(i) {
                           # x <- paste0("0", i, "_gram")
                           ServiceNotesCleaned %>% 
                             unnest_tokens(n_gram,
                                           servicenotes_nonums_nopunc,
                                           token = "ngrams",
                                           n = i
                                           )
                           }
                         )

# rm(ngram_list)
Rat_Ngram_list
# str(Rat_Ngram_list[[1]])

```
  
    
    Counting the n-grams in each `servicerequestid`.
```{r}

word_counts_list <- list()

word_counts_list <- lapply(ngram_list,
                           function(i) {
                             Rat_Ngram_list[[i]] %>% 
                               count(servicerequestid,
                                     n_gram,
                                     sort = TRUE
                                     )
                             }
                           )

word_counts_list

```
  
    
    Transforming the dataframe into a document term matrix - i.e., documents (`servicerequestid`s) are the rows and n-grams are the columns.
```{r}

dtm_list <- list()

dtm_list <- lapply(ngram_list,
                   function(i) {
                     word_counts_list[[i]] %>% 
                       cast_dtm(document = servicerequestid,
                                term = n_gram,
                                value = n,
                                # weighting = tm::weightTfIdf,
                                # using term frequency inverse document frequency (TfIdf) weighting is another, possibly more accurate measure, but topicmodels::LDA (used below) only accepts document term matrices with term-frequency weighting
                                weighting = tm::weightTf
                                )
                     }
                   )

dtm_list

```
  
    
    To determine the "proper" number of topics, here I try using the `ldatuning::FindTopicsNumber` function. The code chunk is based on the vignette here:  [https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html](https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html).  
    
    This analyses was done separately for each n-gram level (n = 1:5), and the overall results were inconclusive - the "proper" number of topics fluctuated between the highest level tried (12 topics) and one of the lowest levels tried (2, 3, or 4 topics).
```{r}

detectCores(logical = TRUE) - 1


tunes_list <- dtm_list %>% 
  map(~ FindTopicsNumber(.x,
                         topics = c(2:12),
                         metrics = c("Griffiths2004",
                                     "CaoJuan2009",
                                     "Arun2010",
                                     "Deveaud2014"
                                     ),
                         method = "Gibbs",
                         control = list(seed = 123456789),
                         mc.cores = 3L,
                         verbose = TRUE
                         )
      )


# str(tunes_list[[5]])

topic_plots <-
  tunes_list %>% 
  map(~ FindTopicsNumber_plot(.x)
      )

topic_plots

```
  
    
    As an alternative, here I try to determine the "proper" number of topics using `topicmodels::perplexity`. Perplexity measures how well a probability model predicts a sample, and I use it here via 10-folder cross validation. For computational purposes, I'm only trying this for the 5-gram model.  
    
    This is based on the method used here:
    [http://ellisp.github.io/blog/2017/01/05/topic-model-cv](http://ellisp.github.io/blog/2017/01/05/topic-model-cv)
    
    As with the `ldatuning::FindTopicsNumber` function used previously, `topicmodels::perplexity` is also inconclusive as there is no clear "elbow" in the perplexity plot.
```{r}

full_data <- dtm_list[[5]]

n <- nrow(full_data)
seed <- 123456789
topic_guess <- 12
folds <- 10
burnin <- 1000
iter <-1000
keep <-50

#----------------10-fold cross-validation, different numbers of topics----------------
cluster <- makeCluster(detectCores(logical = TRUE) - 1
                       ) # leave one CPU spare...
registerDoParallel(cluster)

clusterEvalQ(cluster, {
   library(topicmodels)
})

folds <- folds
splitfolds <- sample(1:folds, n, replace = TRUE)
candidate_k <- c(2:topic_guess) # candidates for how many topics
clusterExport(cluster,
              c("full_data", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k")
              )

# we parallelize by the different number of topics.  A processor is allocated a value
# of k, and does the cross-validation serially.  This is because it is assumed there
# are more candidate values of k than there are cross-validation folds, hence it
# will be more efficient to parallelise
system.time({
results <- foreach(j = 1:length(candidate_k),
                   .combine = rbind
                   ) %dopar%{
   k <- candidate_k[j]
   
   results_1k <- matrix(0,
                        nrow = folds,
                        ncol = 2
                        )
   
   colnames(results_1k) <- c("k", "perplexity")
   
   for(i in 1:folds){
      train_set <- full_data[splitfolds != i , ]
      valid_set <- full_data[splitfolds == i, ]
      
      fitted <- LDA(train_set,
                    k = k,
                    method = "Gibbs",
                    control = list(seed = seed,
                                   verbose = 1,
                                   burnin = burnin,
                                   iter = iter,
                                   keep = keep
                                   )
                    )
      
      results_1k[i, ] <- c(k, perplexity(fitted, newdata = valid_set)
                           )
   }
   
   return(results_1k)
}
})

stopCluster(cluster)

results_df <- as.data.frame(results)


# ggplot(data = results_df,
#        aes(x = k,
#            y = perplexity)
#        ) +
#   geom_point() +
#   geom_smooth(se = FALSE) +
#   coord_cartesian(xlim = c(0, 12)
#                   ) +
#   scale_x_continuous(breaks = seq(0, 12, 2)
#                      ) +
#   ggplot_theme_basic +
#   ggtitle(label = "10-fold cross-validation of topic modelling",
#           subtitle = "(i.e., 10 different models fit for each potential number of topics)"
#           ) +
#   labs(x = "Potential Number of Topics",
#        y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
#        )


ggplot(data = results_df,
       aes(x = k,
           y = perplexity)
       ) +
  geom_point() +
  geom_smooth(se = TRUE) +
  coord_cartesian(xlim = c(0, 12),
                  ylim = c(0, 6000)
                  ) +
  scale_x_continuous(breaks = seq(0, 12, 2)
                     ) +
  # ggplot_theme_basic +
  ggtitle(label = "10-fold cross-validation of topic modelling",
          subtitle = "(i.e., 10 different models fit for each potential number of topics)"
          ) +
  labs(x = "Potential Number of Topics",
       y = "Perplexity When Fitting the Trained Model to the Hold-Out Set"
       )

```
  
    
    Remove the no-longer-needed files.
```{r}

rm(cluster, full_data, results, topic_plots, tunes_list, burnin, candidate_k, folds, iter, keep, n, seed, splitfolds, topic_guess)

```
  
    
    Here I use Latent Dirichlet allocation for topic modeling. As determining the "proper" number of topics was inconclusive, I'm cycling through every combination of ngrams (1:5) and topics (2:12).
```{r}

topic_guess <- 2:12

lda_list <- list()


cluster <- makeCluster(detectCores(logical = TRUE) - 1
                       ) # leave one CPU spare...
registerDoParallel(cluster)

for(i in ngram_list) {
  for(j in topic_guess) {
    x <- LDA(dtm_list[[i]],
             k = j,
             control = list(seed = 123456789,
                            verbose = 1
                            )
             )
    
    ifelse((i == min(ngram_list) &
              j == min(topic_guess)
            ),
           countx <- 1,
           countx <- countx + 1
           )
    
    lda_list[[countx]] <- list(ngram = i,
                               topic = j,
                               lda_model = x
                               )
    }
  }

stopCluster(cluster)

rm(ngram_list, topic_guess, i, j, x, countx, cluster)

lda_list

```
  
    
    Creating a dataframe with `beta` - the per-topic-per-ngram probability (i.e., the probability that each ngram is in each topic).
```{r}

PerTopicPerNgram <- list()

for(i in 1:length(lda_list)
    ) {
  x <- tidy(lda_list[[i]]$lda_model,
            matrix = "beta"
            ) %>% 
    arrange(term,
            desc(beta)
            )
  
  PerTopicPerNgram[[i]] <- list(ngram = lda_list[[i]]$ngram,
                                topic = lda_list[[i]]$topic,
                                PerTopicPerNgram = x
                                )
  }

rm(i, x)


str(PerTopicPerNgram[[55]]$PerTopicPerNgram)

# rm(serv_req_id_lda)
head(PerTopicPerNgram[[55]]$PerTopicPerNgram, 500)

```
  
    
    Creating a dataframe with just the top ten terms (ranked by beta) in each topic.
```{r}

top_terms <- list()

for(i in 1:length(PerTopicPerNgram)
    ) {
  x <- PerTopicPerNgram[[i]]$PerTopicPerNgram %>% 
    group_by(topic) %>% 
    top_n(10,
          beta
          ) %>% 
    ungroup() %>% 
    arrange(topic,
            -beta
            )
  
  top_terms[[i]] <- list(ngram = PerTopicPerNgram[[i]]$ngram,
                         topic = PerTopicPerNgram[[i]]$topic,
                         top_terms = x
                         )
  }

rm(i, x)


top_terms[[55]]$top_terms
View(top_terms[[55]]$top_terms)

```
  
    
    Now we can plot the top 10 n-grams in each topic to visually inspect if the topic classifications "make sense" based on the n-gram text.  
    
    Here, we're just creating and saving the plots themselves.
```{r}

TopNgrams_ByTopic_BarGraphs <-
  top_terms %>%
  # to_graph %>%
  map(function(x) 
    x$top_terms %>%
      mutate(term = reorder(term,
                            beta
                            ),
             topic = paste0("Topic ",
                            str_pad(as.character(topic),
                                    width = 2,
                                    side = "left",
                                    pad = "0"
                                    )
                            )
             ) %>% 
      ggplot(aes(x = term,
                 y = beta,
                 fill = factor(topic)
                 )
             ) +
      geom_col(show.legend = FALSE) +
      facet_wrap(~ topic,
                 scales = "free",
                 ncol = 2
                 ) +
      ggplot_theme_basic +
      theme(plot.title = element_text(size = 11),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 9)
            ) +
      labs(title = "Most Common Terms Per Topic",
           subtitle = paste0("(",
                             str_pad(x$ngram,
                                     width = 2,
                                     side = "left",
                                     pad = "0"
                                     ),
                             "gram",
                             str_pad(x$topic,
                                     width = 2,
                                     side = "left",
                                     pad = "0"
                             ),
                             "topic)"
                             ),
           x = paste0(str_pad(x$ngram,
                              width = 2,
                              side = "left",
                              pad = "0"
                              ),
                      "gram"
                      ),
           y = paste0("probability of the ",
                      str_pad(x$ngram,
                              width = 2,
                              side = "left",
                              pad = "0"
                              ),
                      "gram in the topic"
                      )
           ) +
      coord_flip()
    )

# TopNgrams_ByTopic_BarGraphs
TopNgrams_ByTopic_BarGraphs[[24]] # ngram = 3 & topics = 3
TopNgrams_ByTopic_BarGraphs[[25]] # ngram = 3 & topics = 4
TopNgrams_ByTopic_BarGraphs[[35]] # ngram = 4 & topics = 3
TopNgrams_ByTopic_BarGraphs[[36]] # ngram = 4 & topics = 4
TopNgrams_ByTopic_BarGraphs[[46]] # ngram = 5 & topics = 3
TopNgrams_ByTopic_BarGraphs[[47]] # ngram = 5 & topics = 4


TopNgrams_ByTopic_BarGraphs %>% 
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  substr(x$labels$subtitle,
                         2,
                         (nchar(x$labels$subtitle) - 1)
                  ),
                  "_Top10Terms_facet.png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 6,
           units = "cm"
           )
    )

```
  
    
    Creating a dataframe with `gamma` - the per-document-per-topic probability (i.e., the probability that each document (serv_req_id) is in each topic).  
    
    I chose to do this for six different combinations of ngrams and topics (ngram = 3 & topic = 3, 3 & 4, 4 & 3, 4 & 4, 5 & 3, 5 & 4). This was chosen in part becasue after looking at the graphs produced above, these models seemed (by visual inspection) to perform better. It was also done in part becasue a portion of the analyses below requries defining the topics as `unknown`, `no_rats_found`, or `rats_found` by visual inspection of the graphs produced above. Six also seemed to be a good medium between too few and too many visual inspections to do.
```{r}

rm(TopNgrams_ByTopic_BarGraphs)


top_terms[[55]] #lda model with ngram = 5 & topics = 12
top_terms[[2]] #lda model with ngram = 1 & topics = 3
top_terms[[3]] #lda model with ngram = 1 & topics = 4

top_terms[[24]] #lda model with ngram = 3 & topics = 3
top_terms[[25]] #lda model with ngram = 3 & topics = 4
top_terms[[35]] #lda model with ngram = 4 & topics = 3
top_terms[[36]] #lda model with ngram = 4 & topics = 4
top_terms[[46]] #lda model with ngram = 5 & topics = 3
top_terms[[47]] #lda model with ngram = 5 & topics = 4


ProbDocInTopic_ngram03_topic03 <-
  list(ngram = lda_list[[24]]$ngram,
       topic = lda_list[[24]]$topic,
       data = tidy(lda_list[[24]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
                                       topic %in% c(2) ~ "no_rats_found",
                                       topic %in% c(3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[24]]$ngram,
                model_topic = lda_list[[24]]$topic
                )
       )


ProbDocInTopic_ngram03_topic04 <-
  list(ngram = lda_list[[25]]$ngram,
       topic = lda_list[[25]]$topic,
       data = tidy(lda_list[[25]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(1) ~ "unknown",
                                       topic %in% c(2) ~ "no_rats_found",
                                       topic %in% c(3, 4) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[25]]$ngram,
                model_topic = lda_list[[25]]$topic
                )
       )


ProbDocInTopic_ngram04_topic03 <-
  list(ngram = lda_list[[35]]$ngram,
       topic = lda_list[[35]]$topic,
       data = tidy(lda_list[[35]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(#topic %in% c() ~ "unknown",
                                       topic %in% c(1) ~ "no_rats_found",
                                       topic %in% c(2, 3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[35]]$ngram,
                model_topic = lda_list[[35]]$topic
                )
       )


ProbDocInTopic_ngram04_topic04 <-
  list(ngram = lda_list[[36]]$ngram,
       topic = lda_list[[36]]$topic,
       data = tidy(lda_list[[36]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
                                       topic %in% c(1) ~ "no_rats_found",
                                       topic %in% c(2, 3) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[36]]$ngram,
                model_topic = lda_list[[36]]$topic
                )
       )


ProbDocInTopic_ngram05_topic03 <-
  list(ngram = lda_list[[46]]$ngram,
       topic = lda_list[[46]]$topic,
       data = tidy(lda_list[[46]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(2) ~ "unknown",
                                       topic %in% c(3) ~ "no_rats_found",
                                       topic %in% c(1) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[46]]$ngram,
                model_topic = lda_list[[46]]$topic
                )
       )


ProbDocInTopic_ngram05_topic04 <-
  list(ngram = lda_list[[47]]$ngram,
       topic = lda_list[[47]]$topic,
       data = tidy(lda_list[[47]]$lda_model,
                   matrix = "gamma"
                   ) %>% 
         arrange(document,
                 desc(gamma)
                 ) %>% 
         mutate(topic_name = case_when(topic %in% c(4) ~ "unknown",
                                       topic %in% c(3) ~ "no_rats_found",
                                       topic %in% c(1, 2) ~ "rats_found"
                                       ),
                model_ngram = lda_list[[47]]$ngram,
                model_topic = lda_list[[47]]$topic
                )
       )

```
  
    
    Here, we put the six individual ProbDocInTopic models together, and add in some of the original information (e.g., the original `servicenotes`) for context.
```{r}

ProbDocInTopic_AllModels <-
  bind_rows(ProbDocInTopic_ngram03_topic03[[3]],
            ProbDocInTopic_ngram03_topic04[[3]],
            ProbDocInTopic_ngram04_topic03[[3]],
            ProbDocInTopic_ngram04_topic04[[3]],
            ProbDocInTopic_ngram05_topic03[[3]],
            ProbDocInTopic_ngram05_topic04[[3]]
            ) %>% 
  arrange(document,
          model_ngram,
          model_topic,
          gamma
          ) %>% 
  rename("servicerequestid" = "document") %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            )

rm(list = ls(pattern = "ProbDocInTopic_ngram"))

str(ProbDocInTopic_AllModels)
head(ProbDocInTopic_AllModels, 100)
View(head(ProbDocInTopic_AllModels, 1000))

```
  
    
    Remove the datafiles that are no longer needed.
```{r}

rm(list = ls(pattern = "_list"))
rm(PerTopicPerNgram, top_terms)

```
  
    
    Next, for each model (e.g., 3-gram 4-topic), we sum the probabilities given for each numeric topic, into the "rats topics" (e.g., `rats_found`) which were defined above via visual inspection of the graphs on the Top 10 ngrams in each numeric topic.  
    
    I also pull out the 5-gram 4-topic model because it appeared (visually) to be the most accurate individual model.
```{r}

ProbDocInTopic_ProbsSummed_ByModel <-
  ProbDocInTopic_AllModels %>% 
  group_by(servicerequestid,
           model_ngram,
           model_topic,
           topic_name
           ) %>% 
  summarise(prob_ = sum(gamma)
            ) %>% 
  ungroup() %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            )

str(ProbDocInTopic_ProbsSummed_ByModel)
head(ProbDocInTopic_ProbsSummed_ByModel, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel, 1000))



ProbDocInTopic_ProbsSummed_05gram04topic <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  filter(model_ngram == 5 &
           model_topic == 4)

str(ProbDocInTopic_ProbsSummed_05gram04topic)
head(ProbDocInTopic_ProbsSummed_05gram04topic, 100)
View(head(ProbDocInTopic_ProbsSummed_05gram04topic, 1000))

```
  
    
    Next, for each ngram-topic combination, I create histograms of the probabilities assigned to each topic.  This is done to help visualy determine if the topic assignments are clearly separating documents (serv_request_id values).  
    
    A log10 transformation of the probability is done to help more clearly see any differences.
```{r}

# str(ProbDocInTopic_ProbsSummed_ByModel)

ProbDocInTopic_ProbsSummed_ByModel_Details <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  mutate(model = paste0("0",
                        model_ngram,
                        "gram_",
                        "0",
                        model_topic,
                        "topic"
                        ),
         serviceorder_yr = year(serviceorderdate),
         model_and_yr = paste0(model,
                               "_",
                               as.character(serviceorder_yr)
                               )
         )

head(ProbDocInTopic_ProbsSummed_ByModel_Details, 100)
View(head(ProbDocInTopic_ProbsSummed_ByModel_Details, 1000))



TopicDistro_MainModels_Histogram_ByModel <-
  ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name)
      )



TopicDistro_MainModels_Histogram_ByModelYr <-
  ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model_and_yr) %>% 
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() + # this transformation is used to help more clearly see any differences in the probability values
        ggtitle(label = paste0(.x$model_and_yr,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name)
      )

```
  
    
    Saving the histograms.
```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}

TopicDistro_MainModels_Histogram_ByModel %>%
  map(~ ggsave(paste0(wd,
                      "/Viz/",
                      # "New_",
                      "TopicDistro_MainModels_ByModel_",
                      .x$labels$title[[1]],
                      ".png"
                      ),
               .x,
               scale = 4,
               width = 6,
               height = 4,
               units = "cm"
               )
      )



TopicDistro_MainModels_Histogram_ByModelYr %>% 
  map(~ ggsave(paste0(wd,
                      "/Viz/",
                      # "New_",
                      "TopicDistro_MainModels_ByModelYr_",
                      .x$labels$title[[1]],
                      ".png"
                      ),
               .x,
               scale = 4,
               width = 6,
               height = 4,
               units = "cm"
               )
      )

```
  
    
    Removing unneeded files.
```{r}

rm(list = ls(pattern = "TopicDistro_"))

```
  
    
    Comparing how the histograms look with a regular y-scale vs a log10 y-scale.
```{r}

ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        scale_y_log10() +
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "log10 of counts"
             ) +
        facet_wrap(~topic_name,
                   scales = "fixed"
                   )
      )


ProbDocInTopic_ProbsSummed_ByModel_Details %>% 
  split(.$model) %>%
  map(~ ggplot(data = .x,
               aes(x = prob_,
                   fill = topic_name
                   )
               ) +
        geom_histogram(binwidth = 0.05) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        # scale_y_log10() +
        ggtitle(label = paste0(.x$model,
                               "_Histogram"
                               )
                ) +
        theme(legend.position = "none") +
        labs(x = "Prob of ServiceRequestId in the Topic",
             y = "counts"
             ) +
        coord_cartesian(ylim = c(0, 5000)
                        ) +
        facet_wrap(~topic_name,
                   scales = "fixed"
                   )
      )

```
  
    
    Then, for each `servicerequestid` and `topic_name` we calculate the mean topic probability across all the models. NOTE: This step could be modified more as my instinct is that more weight should probably be given to the models with larger ngrams and topics (e.g., the 5-gram & 4-topic model). However, using larger values of n-grams will not be able to analyze those records that do not have at least `n` words. Meaning that smaller values of n-grams can analyze more documents, but possibly less accurately.
```{r}

ProbDocInTopic_MeanProb_BySrvcRqstId <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  group_by(servicerequestid,
           topic_name
           ) %>% 
  summarise(MeanProb = mean(prob_, na.rm = TRUE)
            ) %>% 
  left_join(select(ServiceNotesCleaned,
                   servicerequestid,
                   servicenotes,
                   serviceorderdate
                   ),
            by = c("servicerequestid" = "servicerequestid")
            ) %>% 
  arrange(servicerequestid,
          desc(MeanProb)
          )

str(ProbDocInTopic_MeanProb_BySrvcRqstId)
tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 100)
View(head(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))
View(tail(ProbDocInTopic_MeanProb_BySrvcRqstId, 1000))

```
  
    
    Here, we create a dataset suitable for graphing and analyzing, by simply selecting the highest topic probability (the `MeanProb` value) assigned to each topic. We also do this for the single 5-gram 4-topic model.
```{r}

# ProbDocInTopic_AllModels
# ProbDocInTopic_ProbsSummed_ByModel
# ProbDocInTopic_MeanProb_BySrvcRqstId
# 
# 
# ProbDocInTopic_AllModels %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_ProbsSummed_ByModel %>% select(servicerequestid) %>% distinct() %>% nrow
# ProbDocInTopic_MeanProb_BySrvcRqstId %>% select(servicerequestid) %>% distinct() %>% nrow


TopProb_BySrvcRqstId <-
  ProbDocInTopic_MeanProb_BySrvcRqstId %>% 
  mutate(serviceorder_yr = year(serviceorderdate),
         # serviceorder_yr2 = as.factor(serviceorder_yr),
         yr_group = paste0(as.character(serviceorder_yr),
                           "_",
                           as.character(topic_name)
                           ),
         model = "AllModels_MeanProb"
         ) %>% 
  rename("prob" = "MeanProb") %>% 
  group_by(servicerequestid) %>% 
  top_n(1,
        prob
        ) %>% 
  ungroup() %>% 
  arrange(prob)

str(TopProb_BySrvcRqstId)
TopProb_BySrvcRqstId
View(TopProb_BySrvcRqstId)



TopProb_BySrvcRqstId_05gram04topic <- 
  ProbDocInTopic_ProbsSummed_05gram04topic %>% 
  mutate(serviceorder_yr = year(serviceorderdate),
         # serviceorder_yr2 = as.factor(serviceorder_yr),
         yr_group = paste0(as.character(serviceorder_yr),
                           "_",
                           as.character(topic_name)
                           ),
         model = "5gram4topic"
         ) %>% 
  rename("prob" = "prob_") %>% 
  group_by(servicerequestid) %>% 
  top_n(1,
        prob
        ) %>% 
  ungroup() %>% 
  arrange(prob)

str(TopProb_BySrvcRqstId_05gram04topic)
TopProb_BySrvcRqstId_05gram04topic
View(TopProb_BySrvcRqstId_05gram04topic)
  
```
  
    
    Now, we can simply create the freqpoly plots and density plots for each year-topic combination to investigate how the topic assignment did over time, and then save them.
```{r}

# str(TopProb_BySrvcRqstId)
# str(TopProb_BySrvcRqstId_05gram04topic)


TopicDistro_AllModelsMean_Freqpoly <-
  TopProb_BySrvcRqstId %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_freqpoly(binwidth = 0.05,
                      alpha = 0.6
                      ) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Freqpoly_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )

TopicDistro_AllModelsMean_Density <-
  TopProb_BySrvcRqstId %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_density() +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Density_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )



TopicDistro_05gram04topic_Freqpoly <-
  TopProb_BySrvcRqstId_05gram04topic %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_freqpoly(binwidth = 0.05,
                      alpha = 0.6
                      ) +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Freqpoly_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )

TopicDistro_05gram04topic_Density <-
  TopProb_BySrvcRqstId_05gram04topic %>%
  split(.$serviceorder_yr) %>%
  map(~ ggplot(data = .x,
               aes(x = prob,
                   colour = topic_name
                   )
               ) +
        geom_density() +
        scale_x_continuous(limits = c(0, 1)
                           ) +
        ggtitle(label = paste0("TopicDistro_",
                               .x$model,
                               "_Density_",
                               as.character(.x$serviceorder_yr)
                               )
                ) +
        labs(x = "Prob of ServiceRequestId in the Topic")
      )



# str(TopicDistro_AllModelsMean_Freqpoly[[18]])
# str(TopicDistro_AllModelsMean_Density[[18]])
# str(TopicDistro_05gram04topic_Freqpoly[[18]])
# str(TopicDistro_05gram04topic_Density[[18]])


TopicDistro_AllModelsMean_Freqpoly[[18]]
TopicDistro_AllModelsMean_Density[[18]]
TopicDistro_05gram04topic_Freqpoly[[18]]
TopicDistro_05gram04topic_Density[[18]]

```
  
    
    Saving the visuals created above.
```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}

TopicDistro_AllModelsMean_Freqpoly %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )



TopicDistro_AllModelsMean_Density %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )



TopicDistro_05gram04topic_Freqpoly %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )



TopicDistro_05gram04topic_Density %>%
  map(function(x)
    ggsave(paste0(wd,
                  "/Viz/",
                  # "New_",
                  x$labels$title[[1]],
                  ".png"
                  ),
           x,
           scale = 4,
           width = 6,
           height = 4,
           units = "cm"
           )
    )

```
  
    
    Removing unneeded files.
```{r}

rm(list = ls(pattern = "TopicDistro_"))

```
  
    
    As another method to determine the "correct" topic assignment, here I simply count the number of times a topic assignment was given to each document (servicerequestid), and assign the "correct" topic to the topic with the most assignments. In the case of ties (e.g., all three topics, each assigned twice), I use the `MeanProb` calculated above in the `ProbDocInTopic_MeanProb_BySrvcRqstId` dataframe previously.
```{r}

str(ProbDocInTopic_ProbsSummed_ByModel)
View(ProbDocInTopic_ProbsSummed_ByModel)
# ProbDocInTopic_ProbsSummed_ByModel %>% select(model_ngram, model_topic) %>% distinct()

str(ProbDocInTopic_MeanProb_BySrvcRqstId)
View(ProbDocInTopic_MeanProb_BySrvcRqstId)


TopicAssigned_ByCounts_ByMeanProb <-
  ProbDocInTopic_ProbsSummed_ByModel %>% 
  group_by(servicerequestid,
           model_ngram,
           model_topic
           ) %>% 
  top_n(1,
        prob_
        ) %>% 
  ungroup() %>% 
  count(servicerequestid,
        topic_name
        ) %>% 
  left_join(ProbDocInTopic_MeanProb_BySrvcRqstId,
            by = c("servicerequestid" = "servicerequestid",
                   "topic_name" = "topic_name"
                   )
            ) %>% 
  select(servicerequestid,
         topic_name,
         n,
         MeanProb
         ) %>% 
  group_by(servicerequestid) %>%
  arrange(servicerequestid,
          desc(n),
          desc(MeanProb)
          ) %>% 
  ungroup() %>% 
  group_by(servicerequestid) %>%
  mutate(RowNum = row_number()
         ) %>% 
  ungroup() %>% 
  filter(RowNum == 1) %>% 
  rename(times_topic_assigned = n) %>% 
  left_join(ServiceNotesCleaned,
            by = "servicerequestid"
            ) %>% 
  select(servicerequestid,
         topic_name,
         times_topic_assigned,
         MeanProb,
         servicenotes,
         servicenotes_nonums_nopunc,
         serviceorderdate,
         serviceorder_date,
         serviceorder_yr,
         serviceorder_yr_posix,
         serviceorder_mth,
         serviceorder_yrmth,
         serviceorder_yrmth_posix,
         serviceorder_day,
         serviceorder_wkday
         )

str(ServiceNotesCleaned)
str(TopicAssigned_ByCounts_ByMeanProb)
head(TopicAssigned_ByCounts_ByMeanProb, 1000)
View(TopicAssigned_ByCounts_ByMeanProb)

```
  
    
    So now, we can compare two different models, each being slight variations of six LDA models.  
    
    1) `TopProb_BySrvcRqstId` assigns the topic by taking the mean topic probability score across all six models.
    2) `TopicAssigned_ByCounts_ByMeanProb` assigns the topic by taking the most frequently assigned topic across all six models.
```{r}

str(TopicAssigned_ByCounts_ByMeanProb)
str(TopProb_BySrvcRqstId)
str(TopProb_BySrvcRqstId_05gram04topic)


dim(TopicAssigned_ByCounts_ByMeanProb)
dim(TopProb_BySrvcRqstId)
dim(TopProb_BySrvcRqstId_05gram04topic)


a <- TopProb_BySrvcRqstId %>% 
  select(servicerequestid,
         topic_name,
         prob
         ) %>% 
  rename(topic_name_meanprob = topic_name,
         prob_meanprob = prob
         )


b <- TopicAssigned_ByCounts_ByMeanProb %>% 
  rename(topic_name_topcounts = topic_name,
         times_topic_assigned_topcounts = times_topic_assigned,
         prob_topcounts = MeanProb
         )


ModelsCompare <- a %>% 
  left_join(b,
            by = "servicerequestid"
            )

rm(a, b)
str(ModelsCompare)
ModelsCompare
View(ModelsCompare)

```
  
    
    When comparing the two models, they coincide nearly 99% of the time...but the disagreements still raise questions.
```{r}

filter(ModelsCompare,
       topic_name_meanprob != topic_name_topcounts
       ) %>% 
  nrow() / nrow(ModelsCompare) * 100

filter(ModelsCompare,
       topic_name_meanprob == topic_name_topcounts
       ) %>% 
  nrow() / nrow(ModelsCompare) * 100


# View(filter(ModelsCompare,
#             topic_name_meanprob != topic_name_topcounts
#             )
#      )

View(filter(ModelsCompare,
            topic_name_meanprob != topic_name_topcounts
            ) %>% 
       select(servicerequestid,
              topic_name_meanprob,
              topic_name_topcounts,
              servicenotes,
              servicenotes_nonums_nopunc
              )
     )

```
  
    
    As the models assign topics almost identically, let's use the `topcounts` model to do some quick inspections about how often each of the topics were assigned, when, changes over time, etc.
```{r}


Counts_AllYears <- ModelsCompare %>% 
  mutate(topic_name = factor(topic_name_topcounts,
                             levels = c("unknown",
                                        "no_rats_found",
                                        "rats_found"
                                        )
                             )
         ) %>% 
group_by(topic_name) %>% 
  count() %>% 
  rename(counts = n)



Counts_AcrossYears <- ModelsCompare %>% 
  mutate(topic_name = factor(topic_name_topcounts,
                             levels = c("unknown",
                                        "no_rats_found",
                                        "rats_found"
                                        )
                             )
         ) %>% 
group_by(topic_name,
         serviceorder_yr
         ) %>% 
  count() %>% 
  rename(counts = n)



ggplot(data = Counts_AllYears,
       aes(x = topic_name,
           y = counts,
           fill = topic_name
           )
       ) +
  geom_col() +
  geom_text(aes(label = counts),
            nudge_y = -200,
            size = 3
            ) +
  # theme_minimal() +
  theme(legend.position = "none") +
  coord_flip()

 ggsave(paste0(wd,
               "/Viz/",
               # "New_",
               "Topics_CountModel_Counts_AllYears.png"
                  ),
        scale = 4,
        width = 6,
        height = 6,
        units = "cm"
        )


ggplot(data = Counts_AcrossYears,
       aes(x = topic_name,
           y = counts,
           fill = topic_name
           )
       ) +
  geom_col() +
  geom_text(aes(label = counts),
            nudge_y = 100,
            size = 2.5
            ) +
  scale_y_continuous(limits = c(0, 1800),
                     breaks = seq(0, 1800, 300)
                     ) +
  facet_wrap(~serviceorder_yr) +
  theme(legend.position = "none") +
  coord_flip()

 ggsave(paste0(wd,
               "/Viz/",
               # "New_",
               "Topics_CountModel_Counts_AcrossYears.png"
                  ),
        scale = 4,
        width = 8,
        height = 6,
        units = "cm"
        )

```



