Comparing Two Versions of RegEx Models to Pull Out Rat Features

Load the relevant libraries.

# rm(list = ls())
library("tidyverse")          # data manipulation
library("magrittr")           # data manipulation (pipeing data)
library("stringr")            # string manipulation
library("tidytext")           # text manipulation
library("ggplot2")            # viz
library("caret")              # measuring model performance

Session Info.

sessionInfo()
R version 3.3.3 (2017-03-06)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS  10.13.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bindrcpp_0.2    knitr_1.17      caret_6.0-77    lattice_0.20-35 tidytext_0.1.5 
 [6] magrittr_1.5    forcats_0.2.0   stringr_1.2.0   dplyr_0.7.4     purrr_0.2.4    
[11] readr_1.1.1     tidyr_0.7.2     tibble_1.3.4    ggplot2_2.2.1   tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] httr_1.3.1         ddalpha_1.3.1      sfsmisc_1.1-1      jsonlite_1.5      
 [5] splines_3.3.3      foreach_1.4.3      prodlim_1.6.1      modelr_0.1.1      
 [9] assertthat_0.2.0   highr_0.6          stats4_3.3.3       DRR_0.0.2         
[13] cellranger_1.1.0   yaml_2.1.15        robustbase_0.92-8  ipred_0.9-6       
[17] backports_1.1.1    glue_1.2.0         digest_0.6.12      rvest_0.3.2       
[21] colorspace_1.3-2   recipes_0.1.1      htmltools_0.3.6    Matrix_1.2-12     
[25] plyr_1.8.4         psych_1.7.8        timeDate_3042.101  pkgconfig_2.0.1   
[29] CVST_0.2-1         broom_0.4.3        haven_1.1.0        scales_0.5.0      
[33] gower_0.1.2        lava_1.5.1         withr_2.1.0        nnet_7.3-12       
[37] lazyeval_0.2.1     cli_1.0.0          mnormt_1.5-5       survival_2.41-3   
[41] crayon_1.3.4       readxl_1.0.0       evaluate_0.10.1    tokenizers_0.1.4  
[45] janeaustenr_0.1.5  nlme_3.1-131       SnowballC_0.5.1    MASS_7.3-47       
[49] xml2_1.1.1         dimRed_0.1.0       foreign_0.8-69     class_7.3-14      
[53] rsconnect_0.8.5    tools_3.3.3        hms_0.4.0          kernlab_0.9-25    
[57] munsell_0.4.3      e1071_1.6-8        RcppRoll_0.2.2     rlang_0.1.4       
[61] grid_3.3.3         iterators_1.0.8    rstudioapi_0.7     base64enc_0.1-3   
[65] rmarkdown_1.8      gtable_0.2.0       ModelMetrics_1.1.0 codetools_0.2-15  
[69] curl_3.0           reshape2_1.4.2     R6_2.2.2           lubridate_1.7.1   
[73] rprojroot_1.2      bindr_0.1          stringi_1.1.6      parallel_3.3.3    
[77] Rcpp_0.12.14       rpart_4.1-11       DEoptimR_1.0-8     tidyselect_0.2.3  

Set up the root directory.

require("knitr")
opts_knit$set(root.dir = "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon")

Setting wd as the working directory.

wd <- getwd()
wd
[1] "/Users/mdturse/Desktop/Analytics/dc_doh_hackathon"

From the prior analyses, we’ll first load the ServiceNotesCleaned2 dataset. This dataset has had some basic text cleaning applied (e.g., removing stop words, removing numerics, removing punctuation, etc.).

Note that this dataset has both the original text field (servicenotes) as well as the cleaned text field (servicenotes_cleaned).

Cleaned_Data <- readRDS(paste0(wd,
                               "/Data_Processed/",
                               "ServiceNotesCleaned2.Rds"
                               )
                        )
str(Cleaned_Data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   26302 obs. of  18 variables:
 $ servicerequestid          : chr  "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
 $ servicepriority           : chr  "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
 $ servicecode               : chr  "S0311" "S0311" "S0311" "S0311" ...
 $ servicecodedescription    : chr  "Rat Abatement" "Rat Abatement" "Rat Abatement" "Rat Abatement" ...
 $ servicetypecode           : chr  "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" "DEPAHEAL" ...
 $ servicetypecodedescription: chr  "DOH" "DOH" "DOH" "DOH" ...
 $ serviceorderdate          : POSIXct, format: "1999-04-27 12:59:00" "1999-04-30 19:59:00" ...
 $ servicenotes              : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the        alley.  there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
 $ serviceorder_date         : Date, format: "1999-04-27" "1999-04-30" ...
 $ serviceorder_yr           : num  1999 1999 1999 1999 1999 ...
 $ serviceorder_yr_posix     : POSIXct, format: "1999-01-01" "1999-01-01" ...
 $ serviceorder_mth          : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 5 5 5 5 5 6 6 ...
 $ serviceorder_yrmth        : chr  "1999-04" "1999-04" "1999-05" "1999-05" ...
 $ serviceorder_yrmth_posix  : POSIXct, format: "1999-04-01" "1999-04-01" ...
 $ serviceorder_day          : int  27 30 6 14 19 21 26 28 3 8 ...
 $ serviceorder_wkday        : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 3 6 5 6 4 6 4 6 5 3 ...
 $ servicenotes_nonums_nopunc: chr  "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited  perry st ne rats   " ...
 $ servicenotes_cleaned      : chr  "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st rats " ...
Cleaned_Data
# View(head(Cleaned_Data, 1000))

Prior analyses created two versions of regex-based models to determine whether the result of the rat inspection was rats_found, no_rats_found, or unknown. Here, we’ll specify the regex patterns for those models before comparing how they agree/disagree.

From the prior analyses, for regex_1_ratsfound, I added a negative look-behind so that “no rat(s) burrows found” is not counted as rats found. Also, for regex_1_noratsfound, I added a pattern for “no activity”.

regex_1_ratsfound <- "(a){0,1}ba(i){0,1}ted|blocks epa( ){0,1}|ditrac|( ){0,1}epa( ){0,1}|(?<!no )rat(s){0,1} burrows found|reveal rat burrows|rat burrows (n|r)ear property|soft bait"
regex_1_noratsfound <- "no rat(s){0,1}|no rodent|no action|no (active ){0,1}burrow(s){0,1}|no activity|no(t){0,1} eviden(ce){0,1}(ts){0,1}|no sign(s){0,1} rat(s){0,1}|no sign(s){0,1}|no(t){0,1} find"
regex_2_ratsfound <- "((baited|found) *([\\d]*|one|two|three|four|five|six|seven|eight|nine|ten) *(rat|burrow))|( baited )"
regex_2_noratsfound <- "found no (rat|activity|rodent|evidence)"
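
As a quick sanity check of that look-behind (the strings below are made-up examples, not rows from the data), the first string triggers the rats_found pattern while the second does not:

str_detect(str_to_lower("Rat burrows found in the rear yard"), regex_1_ratsfound)      # TRUE
str_detect(str_to_lower("No rat burrows found on the property"), regex_1_ratsfound)    # FALSE - blocked by the look-behind
str_detect(str_to_lower("No rat burrows found on the property"), regex_1_noratsfound)  # TRUE  - caught by "no rat(s)"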

Now we'll create the variables we need to do the comparisons.

Regex_Model_Compare <- Cleaned_Data %>% 
  select(servicerequestid,
         serviceorder_date,
         serviceorder_yr_posix,
         serviceorder_yrmth_posix,
         servicenotes,
         servicenotes_cleaned
         ) %>%
  mutate(sn_1_ratsfound = str_detect(str_to_lower(servicenotes),
                                     regex_1_ratsfound
                                     ),
         sn_1_noratsfound = str_detect(str_to_lower(servicenotes),
                                       regex_1_noratsfound
                                       ),
         sn_2_ratsfound = str_detect(str_to_lower(servicenotes),
                                     regex_2_ratsfound
                                     ),
         sn_2_noratsfound = str_detect(str_to_lower(servicenotes),
                                       regex_2_noratsfound
                                       ),
         snc_1_ratsfound = str_detect(servicenotes_cleaned,
                                      regex_1_ratsfound
                                      ),
         snc_1_noratsfound = str_detect(servicenotes_cleaned,
                                        regex_1_noratsfound
                                        ),
         snc_2_ratsfound = str_detect(servicenotes_cleaned,
                                      regex_2_ratsfound
                                      ),
         snc_2_noratsfound = str_detect(servicenotes_cleaned,
                                        regex_2_noratsfound
                                        ),
         outcome_sn_1 = case_when(sn_1_ratsfound == TRUE &
                                    sn_1_noratsfound == FALSE ~ "rats_found",
                                  sn_1_ratsfound == FALSE &
                                    sn_1_noratsfound == TRUE ~ "no_rats_found",
                                  TRUE ~ "unknown"
                                  ),
         outcome_sn_2 = case_when(sn_2_ratsfound == TRUE &
                                    sn_2_noratsfound == FALSE ~ "rats_found",
                                  sn_2_ratsfound == FALSE &
                                    sn_2_noratsfound == TRUE ~ "no_rats_found",
                                  TRUE ~ "unknown"
                                  ),
         outcome_snc_1 = case_when(snc_1_ratsfound == TRUE &
                                     snc_1_noratsfound == FALSE ~ "rats_found",
                                   snc_1_ratsfound == FALSE &
                                     snc_1_noratsfound == TRUE ~ "no_rats_found",
                                   TRUE ~ "unknown"
                                   ),
         outcome_snc_2 = case_when(snc_2_ratsfound == TRUE &
                                     snc_2_noratsfound == FALSE ~ "rats_found",
                                   snc_2_ratsfound == FALSE &
                                     snc_2_noratsfound == TRUE ~ "no_rats_found",
                                   TRUE ~ "unknown"
                                   ),
         regex_compare_snsnc_1 = case_when(outcome_sn_1 == "rats_found" &
                                             outcome_snc_1 == "rats_found" ~ "match_rats",
                                           outcome_sn_1 == "no_rats_found" &
                                             outcome_snc_1 == "no_rats_found" ~ "match_norats",
                                           outcome_sn_1 == "unknown" &
                                             outcome_snc_1 == "unknown" ~ "match_unknown",
                                           TRUE ~ "mismatch"
                                           ),
         regex_compare_snsnc_2 = case_when(outcome_sn_2 == "rats_found" &
                                             outcome_snc_2 == "rats_found" ~ "match_rats",
                                           outcome_sn_2 == "no_rats_found" &
                                             outcome_snc_2 == "no_rats_found" ~ "match_norats",
                                           outcome_sn_2 == "unknown" &
                                             outcome_snc_2 == "unknown" ~ "match_unknown",
                                           TRUE ~ "mismatch"
                                           ),
         regex_compare_sn_12 = case_when(outcome_sn_1 == "rats_found" &
                                           outcome_sn_2 == "rats_found" ~ "match_rats",
                                         outcome_sn_1 == "no_rats_found" &
                                           outcome_sn_2 == "no_rats_found" ~ "match_norats",
                                         outcome_sn_1 == "unknown" &
                                           outcome_sn_2 == "unknown" ~ "match_unknown",
                                         TRUE ~ "mismatch"
                                         ),
         regex_compare_snc_12 = case_when(outcome_snc_1 == "rats_found" &
                                            outcome_snc_2 == "rats_found" ~ "match_rats",
                                          outcome_snc_1 == "no_rats_found" &
                                            outcome_snc_2 == "no_rats_found" ~ "match_norats",
                                          outcome_snc_1 == "unknown" &
                                            outcome_snc_2 == "unknown" ~ "match_unknown",
                                          TRUE ~ "mismatch"
                                          )
         )
str(Regex_Model_Compare)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   26302 obs. of  22 variables:
 $ servicerequestid        : chr  "09-00001211" "09-00001323" "09-00001410" "09-00001865" ...
 $ serviceorder_date       : Date, format: "1999-04-27" "1999-04-30" ...
 $ serviceorder_yr_posix   : POSIXct, format: "1999-01-01" "1999-01-01" ...
 $ serviceorder_yrmth_posix: POSIXct, format: "1999-04-01" "1999-04-01" ...
 $ servicenotes            : chr  "CUSTOMER WAS CALLED BY VECTOR CONTROL. CONTROL NO.: 1382" "rats in the alley behind house" "the rat are coming from an apartment building adjacent to the        alley.  there is alot of trash pilled up behind the apartm"| __truncated__ "The vector control branch baited at 2874 Perry St. NE for rats on 5-25-99." ...
 $ servicenotes_cleaned    : chr  "customer called vector control control no " "rats alley house" "rat coming apartment building adjacent alley alot trash pilled apartment building" "vector control branch baited perry st rats " ...
 $ sn_1_ratsfound          : logi  FALSE FALSE FALSE TRUE TRUE FALSE ...
 $ sn_1_noratsfound        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ sn_2_ratsfound          : logi  FALSE FALSE FALSE TRUE TRUE FALSE ...
 $ sn_2_noratsfound        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ snc_1_ratsfound         : logi  FALSE FALSE FALSE TRUE TRUE FALSE ...
 $ snc_1_noratsfound       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ snc_2_ratsfound         : logi  FALSE FALSE FALSE TRUE TRUE FALSE ...
 $ snc_2_noratsfound       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ outcome_sn_1            : chr  "unknown" "unknown" "unknown" "rats_found" ...
 $ outcome_sn_2            : chr  "unknown" "unknown" "unknown" "rats_found" ...
 $ outcome_snc_1           : chr  "unknown" "unknown" "unknown" "rats_found" ...
 $ outcome_snc_2           : chr  "unknown" "unknown" "unknown" "rats_found" ...
 $ regex_compare_snsnc_1   : chr  "match_unknown" "match_unknown" "match_unknown" "match_rats" ...
 $ regex_compare_snsnc_2   : chr  "match_unknown" "match_unknown" "match_unknown" "match_rats" ...
 $ regex_compare_sn_12     : chr  "match_unknown" "match_unknown" "match_unknown" "match_rats" ...
 $ regex_compare_snc_12    : chr  "match_unknown" "match_unknown" "match_unknown" "match_rats" ...
head(Regex_Model_Compare, 100)
# View(head(Regex_Model_Compare, 1000))

As a first check, let’s see whether the two regex models give the same result when applied to the unmodified servicenotes field versus the servicenotes_cleaned field. To do this, we can use the newly created regex_compare_snsnc_1 and regex_compare_snsnc_2 fields.

First, let’s create a basic ggplot theme.

ggplot_theme_basic <-
  theme(panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_line(size = 1, colour = "black")
        )

Now for the plotting/inspection.

It looks like the bulk of inspections (although not all) are classified the same way regardless of whether the regex is run on the raw text (servicenotes) or the cleaned text (servicenotes_cleaned). Also, when comparing the number of mismatches between raw text and cleaned text, Model 2 has fewer mismatches than Model 1.

Snsnc_Counts_1 <- Regex_Model_Compare %>% 
  count(regex_compare_snsnc_1) %>% 
  mutate(n_pct = n / sum(n)
         )
Snsnc_Counts_2 <- Regex_Model_Compare %>% 
  count(regex_compare_snsnc_2) %>% 
  mutate(n_pct = n / sum(n)
         )
Counts_Combined <- Snsnc_Counts_1 %>% 
  rename(match_status = regex_compare_snsnc_1) %>% 
  mutate(model_type = "regex_1") %>% 
  bind_rows(Snsnc_Counts_2 %>% 
              rename(match_status = regex_compare_snsnc_2) %>% 
              mutate(model_type = "regex_2")
            )
rm(list = ls(pattern = "Snsnc_Counts_"))
ggplot(data = Counts_Combined,
       aes(x = match_status,
           y = n_pct,
           fill = match_status
           )
       ) +
  geom_col() +
  geom_text(aes(label = round(n_pct, 3)
                ),
            nudge_y = .07,
            size = 3
            ) +
  ggplot_theme_basic +
  theme(legend.position = "none") +
  labs(title = "Comparing Text Type Across Regex Models",
       subtitle = "text type is servicenotes or servicenotes_cleaned",
       x = "Text Type Match Status",
       y = "Counts (pct)"
       ) +
  scale_y_continuous(limits = c(0, 1),
                     breaks = seq(0, 1, .2)
                     ) +
  facet_wrap(~ model_type) +
  coord_flip()
ggsave(paste0(wd,
              "/Viz/",
              "Compare_TextType_Across_Models.png"
              ),
       scale = 4,
       width = 6,
       height = 6,
       units = "cm"
       )

rm(Counts_Combined)

As the number of mismatches between running the regex models on raw text versus cleaned text is minimal, and as the more valuable comparison is how accurate Model 1 and Model 2 are, let’s move on to that now.

As a first step, we can say that the two models agree ~80% of the time (regardless of whether servicenotes or servicenotes_cleaned is used).

Sn_Counts <- Regex_Model_Compare %>% 
  count(regex_compare_sn_12) %>% 
  mutate(n_pct = n / sum(n)
         )
Snc_Counts <- Regex_Model_Compare %>% 
  count(regex_compare_snc_12) %>% 
  mutate(n_pct = n / sum(n)
         )
Counts_Combined <- Sn_Counts %>% 
  rename(match_status = regex_compare_sn_12) %>% 
  mutate(text_type = "servicenotes_raw") %>% 
  bind_rows(Snc_Counts %>% 
              rename(match_status = regex_compare_snc_12) %>% 
              mutate(text_type = "servicenotes_cleaned")
            )
rm(list = ls(pattern = "_Counts"))
ggplot(data = Counts_Combined,
       aes(x = match_status,
           y = n_pct,
           fill = match_status
           )
       ) +
  geom_col() +
  geom_text(aes(label = round(n_pct, 3)
                ),
            nudge_y = .07,
            size = 3
            ) +
  ggplot_theme_basic +
  theme(legend.position = "none") +
  labs(title = "Comparing Regex Models Across Text Types",
       subtitle = "regex type is regex_1 or regex_2",
       x = "Regex Type Match Status",
       y = "Counts (pct)"
       ) +
  scale_y_continuous(limits = c(0, 1),
                     breaks = seq(0, 1, .2)
                     ) +
  facet_wrap(~ text_type) +
  coord_flip()
ggsave(paste0(wd,
              "/Viz/",
              "Compare_Models_Across_TextType.png"
              ),
       scale = 4,
       width = 6,
       height = 6,
       units = "cm"
       )

rm(Counts_Combined)

So now let’s dig into some mismatches.

It looks like the bulk of the mismatches (~98%) occur when Model 2 is unknown and Model 1 is either rats_found or no_rats_found.

Regex_Model_Compare %>% 
  filter(regex_compare_snc_12 == "mismatch") %>% 
  mutate(outcome_1__2 = paste0(outcome_snc_1,
                                 "__",
                                 outcome_snc_2)
         ) %>% 
  count(outcome_1__2) %>% 
  mutate(n_pct = n/sum(n)
         ) %>% 
  arrange(desc(n_pct)
          )
# View(
  Regex_Model_Compare %>% 
    filter(regex_compare_snc_12 == "mismatch" &
             outcome_snc_2 == "unknown"
           ) %>% 
    select(servicerequestid,
           serviceorder_date,
           servicenotes,
           servicenotes_cleaned,
           outcome_snc_1,
           outcome_snc_2) %>% 
    sample_n(15) %>% 
    arrange(outcome_snc_1,
            outcome_snc_2
            )#)

There are some mismatches (~1.5%) where Model 2 is rats_found or no_rats_found and Model 1 is unknown. This appears to happen when the text triggers BOTH the rats_found regex and the no_rats_found regex for Model 1.

This tends to happen either when the text describes the inspector baiting one residence, but not finding evidence of rats at a neighboring residence inspected at the same time (e.g., “On 8/18/15 @ 9:57am C. Redman found no rat burrows found on the property, Rat burrows found in front of 2363 Champlain Street., N.W.”).

This also happens when the text describes no rats being found during the inspection, but a pest controller having done abatement (e.g., “On 7/1/15 @ 10:58AM R. Herrington found no rat holes in the alley, pest controller has abated area”).
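
To make the “both patterns fire” case concrete, here is a trimmed-down version of the first quoted note run through the Model 1 patterns (a sketch outside the main pipeline):

note <- str_to_lower("found no rat burrows found on the property, Rat burrows found in front of 2363 Champlain Street")
str_detect(note, regex_1_ratsfound)    # TRUE  - the second "rat burrows found" is not preceded by "no "
str_detect(note, regex_1_noratsfound)  # TRUE  - "no rat" matches
# With both TRUE, the case_when above falls through to "unknown" for Model 1, while Model 2's
# regex_2_noratsfound ("found no rat") still classifies this note as no_rats_found.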

# View(
  Regex_Model_Compare %>% 
    filter(regex_compare_snc_12 == "mismatch" &
             outcome_snc_2 == "rats_found"
           ) %>% 
    sample_n(7) %>% 
    arrange(outcome_snc_1) %>% 
    bind_rows(Regex_Model_Compare %>% 
                filter(regex_compare_snc_12 == "mismatch" &
                         outcome_snc_2 == "no_rats_found"
                       ) %>% 
                sample_n(7) %>% 
                arrange(outcome_snc_1)
              ) %>% 
    select(servicerequestid,
           serviceorder_date,
           servicenotes,
           servicenotes_cleaned,
           outcome_snc_1,
           outcome_snc_2
           )#)

Manually verified data were created as part of GitHub Issue #27 (https://github.com/eclee25/the-rat-hack/pull/29), with the resulting dataset available in that repository (the URL passed to read_csv() below). Here, we simply pull in those data.

Manually_Verified_Data <- read_csv("https://raw.githubusercontent.com/austinbrian/the-rat-hack/ca51f6275e34b7a7c6ccf3d1912160f7f86d9d72/scripts/feature_engineering/supervised_data.csv")
Parsed with column specification:
cols(
  original_index = col_integer(),
  rats_y_n = col_character(),
  SERVICENOTES = col_character()
)
str(Manually_Verified_Data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   400 obs. of  3 variables:
 $ original_index: int  3398190 3676265 5012016 5113346 4898668 4294619 100314 3393476 2995352 3084891 ...
 $ rats_y_n      : chr  "n" "y" "y" "y" ...
 $ SERVICENOTES  : chr  "On 6/7/2012 @ 1 pm there was no rat burrows found in the area." "rat holes located along walkway to deck" "On 11/21/16@11:59 am R Herrington baited 1 rat burrow in the rear yd. Treatment will continue until rodent activity ceases. Dit"| __truncated__ "On 1/23/17@1:30 pm G Cornes baited 1 rat burrow in the front yd Treatment will continue until rodent activity ceases. Ditrac/po"| __truncated__ ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 3
  .. ..$ original_index: list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ rats_y_n      : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ SERVICENOTES  : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
Manually_Verified_Data
# View(Manually_Verified_Data)

A quick visual inspection of the Manually_Verified_Data dataset shows that some of the text in the SERVICENOTES field appears to be mis-categorized when that text came from the resident rather than the inspector. For example, “rats are everywhere” is classified as yes/rats_found. However, this is most likely a note from the resident requesting the service, not from the inspector.

To quickly remove this discrepancy, below I build a quick regex to look for dates, as a date seems to be a common indicator that the note came from the inspector rather than the resident. I then keep only the rows of Manually_Verified_Data whose notes are from the inspector.

Note that this crude regex will also remove some instances where the SERVICENOTES text does appear to be from the inspector, but the inspector simply did not include a date. However, this was the quickest way to get a viable dataset for validation. This process reduced the “ground truth” data from 400 rows to 273 rows.

regex_inspector_note <- "\\d{1,2}(/|-)\\d{1,2}(/|-)\\d{2,4}"
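# Two illustrative strings (the first adapted from a note quoted earlier) showing what the date regex keys on:
str_detect("On 8/18/15 @ 9:57am C. Redman found no rat burrows", regex_inspector_note)  # TRUE  - contains a date, inspector-style
str_detect("rats are everywhere", regex_inspector_note)                                 # FALSE - resident-style request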
Mark_Inspector_Notes <- Manually_Verified_Data %>% 
  mutate(inspector_note = str_detect(SERVICENOTES,
                                     regex_inspector_note
                                     )
         )
# View(
  Mark_Inspector_Notes#)
Mark_Inspector_Notes %>% 
  count(inspector_note)
# View(
  Mark_Inspector_Notes %>% 
       filter(inspector_note == FALSE |
                is.na(inspector_note)
              ) %>% 
       arrange(inspector_note)#)
# View(
  Mark_Inspector_Notes %>% 
       filter(inspector_note == TRUE &
                rats_y_n == "y"
              ) %>% 
       sample_n(5) %>% 
       bind_rows(Mark_Inspector_Notes %>% 
                   filter(inspector_note == TRUE &
                            rats_y_n == "n"
                          ) %>% 
                   sample_n(5)
                 ) %>% 
       bind_rows(Mark_Inspector_Notes %>% 
                   filter(inspector_note == TRUE &
                            rats_y_n == "u"
                          ) %>% 
                   sample_n(5)
                 )#)
  
Verified_Inspector_Notes <- Mark_Inspector_Notes %>% 
  filter(inspector_note == TRUE)
nrow(Mark_Inspector_Notes)
[1] 400
nrow(Verified_Inspector_Notes)
[1] 273
# View(
Verified_Inspector_Notes#)

After left joining Verified_Inspector_Notes and Regex_Model_Compare, we can see that there are 384 rows as opposed to the 273 rows in Verified_Inspector_Notes.

Manual_Plus_ModelCompare <- Verified_Inspector_Notes %>% 
  left_join(Regex_Model_Compare,
            by = c("SERVICENOTES" = "servicenotes")
            ) %>% 
  mutate(outcome_manual = case_when(rats_y_n == "y" ~ "rats_found",
                                    rats_y_n == "n" ~ "no_rats_found",
                                    rats_y_n == "u" ~ "unknown",
                                    TRUE ~ "XXXX"
                                    )
         ) 
nrow(Verified_Inspector_Notes)
[1] 273
nrow(Manual_Plus_ModelCompare)
[1] 384
Manual_Plus_ModelCompare

After some inspection, we can see that both Verified_Inspector_Notes and Regex_Model_Compare have the same exact text entered multiple times.

Verified_Inspector_Notes %>%
  count(original_index) %>%
  arrange(desc(n)
          )
Verified_Inspector_Notes %>%
  count(SERVICENOTES) %>%
  arrange(desc(n)
          )
Regex_Model_Compare %>% 
  count(servicerequestid) %>% 
  arrange(desc(n)
          )
Regex_Model_Compare %>%
  count(servicenotes) %>%
  arrange(desc(n)
          )
Regex_Model_Compare %>%
  count(servicenotes_cleaned) %>%
  arrange(desc(n)
          )
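
Duplicate keys like these are what inflate the left join above from 273 rows to 384. A minimal, made-up illustration of that join fan-out (table and column contents here are hypothetical):

left_tbl  <- tibble(SERVICENOTES = c("note a", "note b"))
right_tbl <- tibble(servicenotes = c("note a", "note a", "note b"), servicerequestid = c("1", "2", "3"))
left_tbl %>% 
  left_join(right_tbl, by = c("SERVICENOTES" = "servicenotes"))  # 3 rows, not 2 - "note a" matches twice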

Although this is an issue for the data overall, for the simple task of comparing the two regex models, we can get around this issue by simply selecting the distinct SERVICENOTES values.

Manual_Plus_ModelCompare_Distinct <-
  Manual_Plus_ModelCompare %>% 
  select(SERVICENOTES,
         servicenotes_cleaned,
         outcome_manual,
         outcome_sn_1,
         outcome_sn_2,
         outcome_snc_1,
         outcome_snc_2
         ) %>% 
  distinct()
Manual_Plus_ModelCompare_Distinct
# View(Manual_Plus_ModelCompare_Distinct)

So now we can get the accuracy statistics using caret::confusionMatrix.

CM_Sn1 <- confusionMatrix(Manual_Plus_ModelCompare_Distinct$outcome_sn_1,
                          Manual_Plus_ModelCompare_Distinct$outcome_manual,
                          positive = "rats_found",
                          mode = "everything"
                          )
CM_Sn2 <- confusionMatrix(Manual_Plus_ModelCompare_Distinct$outcome_sn_2,
                          Manual_Plus_ModelCompare_Distinct$outcome_manual,
                          positive = "rats_found",
                          mode = "everything"
                          )
CM_Snc1 <- confusionMatrix(Manual_Plus_ModelCompare_Distinct$outcome_snc_1,
                           Manual_Plus_ModelCompare_Distinct$outcome_manual,
                           positive = "rats_found",
                           mode = "everything"
                           )
CM_Snc2 <- confusionMatrix(Manual_Plus_ModelCompare_Distinct$outcome_snc_2,
                           Manual_Plus_ModelCompare_Distinct$outcome_manual,
                           positive = "rats_found",
                           mode = "everything"
                           )
CM_AllModels <- list(CM_Sn1, CM_Sn2, CM_Snc1, CM_Snc2)
rm(list = ls(pattern = "CM_S"))
names(CM_AllModels) <- c("CM_Sn1", "CM_Sn2", "CM_Snc1", "CM_Snc2")
CM_AllModels
$CM_Sn1
Confusion Matrix and Statistics

               Reference
Prediction      no_rats_found rats_found unknown
  no_rats_found           117          1       0
  rats_found                1        111       0
  unknown                   9          3       6

Overall Statistics
                                          
               Accuracy : 0.9435          
                 95% CI : (0.9071, 0.9688)
    No Information Rate : 0.5121          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8965          
 Mcnemar's Test P-Value : 0.007383        

Statistics by Class:

                     Class: no_rats_found Class: rats_found Class: unknown
Sensitivity                        0.9213            0.9652        1.00000
Specificity                        0.9917            0.9925        0.95041
Pos Pred Value                     0.9915            0.9911        0.33333
Neg Pred Value                     0.9231            0.9706        1.00000
Precision                          0.9915            0.9911        0.33333
Recall                             0.9213            0.9652        1.00000
F1                                 0.9551            0.9780        0.50000
Prevalence                         0.5121            0.4637        0.02419
Detection Rate                     0.4718            0.4476        0.02419
Detection Prevalence               0.4758            0.4516        0.07258
Balanced Accuracy                  0.9565            0.9788        0.97521

$CM_Sn2
Confusion Matrix and Statistics

               Reference
Prediction      no_rats_found rats_found unknown
  no_rats_found            93          0       0
  rats_found                1         82       0
  unknown                  33         33       6

Overall Statistics
                                        
               Accuracy : 0.7298        
                 95% CI : (0.67, 0.7841)
    No Information Rate : 0.5121        
    P-Value [Acc > NIR] : 2.054e-12     
                                        
                  Kappa : 0.5816        
 Mcnemar's Test P-Value : 1.873e-14     

Statistics by Class:

                     Class: no_rats_found Class: rats_found Class: unknown
Sensitivity                        0.7323            0.7130        1.00000
Specificity                        1.0000            0.9925        0.72727
Pos Pred Value                     1.0000            0.9880        0.08333
Neg Pred Value                     0.7806            0.8000        1.00000
Precision                          1.0000            0.9880        0.08333
Recall                             0.7323            0.7130        1.00000
F1                                 0.8455            0.8283        0.15385
Prevalence                         0.5121            0.4637        0.02419
Detection Rate                     0.3750            0.3306        0.02419
Detection Prevalence               0.3750            0.3347        0.29032
Balanced Accuracy                  0.8661            0.8528        0.86364

$CM_Snc1
Confusion Matrix and Statistics

               Reference
Prediction      no_rats_found rats_found unknown
  no_rats_found           120          1       0
  rats_found                1        111       0
  unknown                   6          3       6

Overall Statistics
                                         
               Accuracy : 0.9556         
                 95% CI : (0.922, 0.9777)
    No Information Rate : 0.5121         
    P-Value [Acc > NIR] : < 2e-16        
                                         
                  Kappa : 0.9177         
 Mcnemar's Test P-Value : 0.02929        

Statistics by Class:

                     Class: no_rats_found Class: rats_found Class: unknown
Sensitivity                        0.9449            0.9652        1.00000
Specificity                        0.9917            0.9925        0.96281
Pos Pred Value                     0.9917            0.9911        0.40000
Neg Pred Value                     0.9449            0.9706        1.00000
Precision                          0.9917            0.9911        0.40000
Recall                             0.9449            0.9652        1.00000
F1                                 0.9677            0.9780        0.57143
Prevalence                         0.5121            0.4637        0.02419
Detection Rate                     0.4839            0.4476        0.02419
Detection Prevalence               0.4879            0.4516        0.06048
Balanced Accuracy                  0.9683            0.9788        0.98140

$CM_Snc2
Confusion Matrix and Statistics

               Reference
Prediction      no_rats_found rats_found unknown
  no_rats_found            94          0       0
  rats_found                1         82       0
  unknown                  32         33       6

Overall Statistics
                                          
               Accuracy : 0.7339          
                 95% CI : (0.6743, 0.7878)
    No Information Rate : 0.5121          
    P-Value [Acc > NIR] : 7.841e-13       
                                          
                  Kappa : 0.5866          
 Mcnemar's Test P-Value : 3.065e-14       

Statistics by Class:

                     Class: no_rats_found Class: rats_found Class: unknown
Sensitivity                        0.7402            0.7130        1.00000
Specificity                        1.0000            0.9925        0.73140
Pos Pred Value                     1.0000            0.9880        0.08451
Neg Pred Value                     0.7857            0.8000        1.00000
Precision                          1.0000            0.9880        0.08451
Recall                             0.7402            0.7130        1.00000
F1                                 0.8507            0.8283        0.15584
Prevalence                         0.5121            0.4637        0.02419
Detection Rate                     0.3790            0.3306        0.02419
Detection Prevalence               0.3790            0.3347        0.28629
Balanced Accuracy                  0.8701            0.8528        0.86570

All of the confusion matrix statistics are provided above for review. Looking at two common metrics, both the F1 and the Kappa values show the highest scores when Regex Model 1 is used on the servicenotes_cleaned text.

# byClass is a (3 classes x 11 statistics) matrix per model; elements 19:21 are the F1 values for the three classes
CM_AllModels %>% 
  map_df(~ .x["byClass"][[1]][19:21]) %>% 
  rename(servicenotes_regex1 = CM_Sn1,
         servicenotes_regex2 = CM_Sn2,
         servicenotescleaned_regex1 = CM_Snc1,
         servicenotescleaned_regex2 = CM_Snc2
         ) %>% 
  t() %>% 
  set_colnames(c("f1_no_rats_found",
                 "f1_rats_found",
                 "f1_unknown"
                 )
               ) %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  separate(col = rowname,
           into = c("text_type", "regex_model"),
           sep = "_"
           )
# overall[[2]] is the Kappa statistic for each model
CM_AllModels %>% 
  map_df(~ .["overall"][[1]][[2]]) %>% 
  rename(servicenotes_regex1 = CM_Sn1,
         servicenotes_regex2 = CM_Sn2,
         servicenotescleaned_regex1 = CM_Snc1,
         servicenotescleaned_regex2 = CM_Snc2
         ) %>% 
  t() %>% 
  set_colnames("kappa") %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  separate(col = rowname,
           into = c("text_type", "regex_model"),
           sep = "_"
           )