Data Summary

# Compact column-by-column preview of the permit data.
permit_decision %>% glimpse()
## Rows: 971,486
## Columns: 15
## $ status           <fct> Permit Expired, Permit Finaled, Permit Finaled, Permi…
## $ permitType       <fct> Plumbing, Plumbing, Plumbing, Plumbing, Electrical, P…
## $ permitSubtype    <fct> 1 or 2 Family Dwelling, 1 or 2 Family Dwelling, Comme…
## $ permitCategory   <fct> No Plan Check, No Plan Check, No Plan Check, No Plan …
## $ initiatingOffice <fct> INTERNET, INTERNET, INTERNET, INTERNET, INTERNET, INT…
## $ ZIP              <fct> 90046, 90004, 90021, 90029, 90039, 90039, 91406, 9004…
## $ valuation        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ floorArea        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ numberUnits      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ stories          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ contractorState  <fct> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",…
## $ licenseType      <fct> C36, C36, C36, C36, C10, C36, C10, C10, C20, C36, C36…
## $ zone             <fct> "R1-1", "R2-1", "M2-2D", "R1-1-HPOZ", "R1-1", "R1-1VL…
## $ year             <fct> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
## $ month            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Internal structure: column classes, factor level counts, and attributes.
permit_decision %>% str()
## spc_tbl_ [971,486 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ status          : Factor w/ 25 levels "Permit Expired",..: 1 2 2 2 2 2 2 2 2 2 ...
##  $ permitType      : Factor w/ 18 levels "Plumbing","Electrical",..: 1 1 1 1 2 1 2 2 3 1 ...
##  $ permitSubtype   : Factor w/ 7 levels "1 or 2 Family Dwelling",..: 1 1 2 1 1 1 1 1 2 1 ...
##  $ permitCategory  : Factor w/ 2 levels "No Plan Check",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ initiatingOffice: Factor w/ 7 levels "INTERNET","WEST LA",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZIP             : Factor w/ 139 levels "90046","90004",..: 1 2 3 4 5 5 6 7 8 9 ...
##  $ valuation       : num [1:971486] NA NA NA NA NA NA NA NA NA NA ...
##  $ floorArea       : num [1:971486] NA NA NA NA NA NA NA NA NA NA ...
##  $ numberUnits     : num [1:971486] NA NA NA NA NA NA NA NA NA NA ...
##  $ stories         : num [1:971486] NA NA NA NA NA NA NA NA NA NA ...
##  $ contractorState : Factor w/ 57 levels "CA","AZ","MI",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ licenseType     : Factor w/ 105 levels "C36","C10","C20",..: 1 1 1 1 2 1 2 2 3 1 ...
##  $ zone            : Factor w/ 1723 levels "R1-1","R2-1",..: 1 2 3 4 1 5 1 1 6 1 ...
##  $ year            : Factor w/ 7 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ month           : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   status = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   permitType = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   permitSubtype = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   permitCategory = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   initiatingOffice = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   ZIP = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   valuation = col_number(),
##   ..   floorArea = col_number(),
##   ..   numberUnits = col_number(),
##   ..   stories = col_number(),
##   ..   contractorState = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   licenseType = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   zone = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   year = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   month = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary: level counts for factors, quantiles for numerics.
permit_decision %>% summary()
##             status                   permitType    
##  Permit Finaled:644876   Electrical       :274356  
##  Issued        :196696   Bldg-Alter/Repair:222644  
##  Permit Expired: 54706   Plumbing         :185189  
##  CofO Issued   : 43917   HVAC             : 96490  
##  Permit Closed : 12832   Fire Sprinkler   : 38404  
##  (Other)       : 18419   (Other)          :154363  
##  NA's          :    40   NA's             :    40  
##                 permitSubtype          permitCategory   initiatingOffice 
##  1 or 2 Family Dwelling:542641   No Plan Check:646957   METRO   :289327  
##  Commercial            :248659   Plan Check   :324489   VAN NUYS:283862  
##  Apartment             :161264   NA's         :    40   INTERNET:251721  
##  Onsite                : 12536                          WEST LA : 76451  
##  Special Equipment     :  5299                          SOUTH LA: 37615  
##  (Other)               :  1047                          (Other) : 32470  
##  NA's                  :    40                          NA's    :    40  
##       ZIP           valuation           floorArea        numberUnits    
##  90045  : 25362   Min.   :        0   Min.   :-154151   Min.   :-147.0  
##  90049  : 21111   1st Qu.:     2100   1st Qu.:     32   1st Qu.:   0.0  
##  91331  : 17270   Median :     8000   Median :    500   Median :   0.0  
##  91367  : 16631   Mean   :   153474   Mean   :   3869   Mean   :   1.8  
##  90026  : 16109   3rd Qu.:    30000   3rd Qu.:   2180   3rd Qu.:   1.0  
##  (Other):874902   Max.   :525000000   Max.   :1788210   Max.   : 910.0  
##  NA's   :   101   NA's   :602487      NA's   :888698    NA's   :927409  
##     stories       contractorState   licenseType          zone       
##  Min.   :  -3.0   CA     :809934   B      :327643   R1-1   :179475  
##  1st Qu.:   0.0   TN     :  3670   C10    :175364   R3-1   : 51635  
##  Median :   1.0   GA     :  3666   C36    :125550   RS-1   : 41478  
##  Mean   :   1.6   WA     :  3597   C20    : 73022   R2-1   : 26992  
##  3rd Qu.:   2.0   FL     :  3236   C16    : 37949   RA-1   : 25430  
##  Max.   :4654.0   (Other): 13663   (Other): 98788   (Other):644096  
##  NA's   :891769   NA's   :133720   NA's   :133170   NA's   :  2380  
##       year            month       
##  2018   :175912   4      : 92875  
##  2017   :169791   3      : 91715  
##  2016   :156165   8      : 84622  
##  2015   :148824   10     : 83117  
##  2014   :132524   1      : 82425  
##  (Other):188230   (Other):536692  
##  NA's   :    40   NA's   :    40
# Total number of missing cells across the whole table.
permit_decision %>%
  is.na() %>%
  sum()
## [1] 3580014

Replace values below 1 (negative numbers and zeros) with NA, and also replace story counts above 73 with NA

# Recode implausible measurements as missing.
#
# Every value below 1 (negatives, zeros, and fractions) in the four numeric
# columns becomes NA. Existing NAs are left as they are: which() drops NA
# comparisons, so only positions where the test is TRUE are overwritten.
#
# replace() is used instead of ifelse(cond, NA, x) because it keeps each column
# numeric even when every element is replaced; ifelse() would return a logical
# vector in that case.
numeric_cols <- c("numberUnits", "floorArea", "valuation", "stories")
permit_decision[numeric_cols] <- lapply(
  permit_decision[numeric_cols],
  function(x) replace(x, which(x < 1), NA)
)

# Story counts above 73 are also recoded to NA.
# NOTE(review): 73 is presumably the tallest plausible building in the
# data's jurisdiction -- confirm the source of this cutoff.
permit_decision$stories <- replace(
  permit_decision$stories,
  which(permit_decision$stories > 73),
  NA
)

Selecting some variables from the data

# Keep the three categorical predictors plus the response (permitCategory).
permit_decision_select <- select(
  permit_decision,
  permitType,
  permitSubtype,
  initiatingOffice,
  permitCategory
)

Splitting the data

# Reproducible 80/20 split: draw 80% of the row indices without replacement
# for training; the remaining rows form the test set.
set.seed(1234)
permit_sample_set <- sample(
  x = nrow(permit_decision_select),
  size = round(nrow(permit_decision_select) * 0.80),
  replace = FALSE
)

permit_train <- permit_decision_select[permit_sample_set, ]
permit_test <- permit_decision_select[-permit_sample_set, ]

Checking for class imbalance

# Share of each permitCategory level in the full selected data set.
permit_decision_select %>%
  select(permitCategory) %>%
  table() %>%
  prop.table() %>%
  round(2)
## permitCategory
## No Plan Check    Plan Check 
##          0.67          0.33
# Share of each permitCategory level in the training set.
permit_train %>%
  select(permitCategory) %>%
  table() %>%
  prop.table() %>%
  round(2)
## permitCategory
## No Plan Check    Plan Check 
##          0.67          0.33
# Share of each permitCategory level in the test set.
permit_test %>%
  select(permitCategory) %>%
  table() %>%
  prop.table() %>%
  round(2)
## permitCategory
## No Plan Check    Plan Check 
##          0.67          0.33

Training the model

library(rpart)
# Fit a classification tree predicting permitCategory from all other columns
# of the training set.
permit_mod <- rpart(
  formula = permitCategory ~ .,
  data = permit_train,
  method = "class"
)

Evaluating the model

library(rpart.plot)

# Draw the fitted decision tree.
rpart.plot(x = permit_mod)

# Predict a class label for every row of the held-out test set.
permits_pred <- predict(
  object = permit_mod,
  newdata = permit_test,
  type = "class"
)

# Confusion matrix: actual category in rows, predicted category in columns.
permit_pred_table <- table(permit_test$permitCategory, permits_pred)
print(permit_pred_table)
##                permits_pred
##                 No Plan Check Plan Check
##   No Plan Check        121929       7357
##   Plan Check            19054      45949
# Overall accuracy: correctly classified rows (the diagonal) divided by the
# number of rows actually counted in the confusion matrix. table() drops rows
# whose actual category is NA, so dividing by nrow(permit_test) would count
# those unlabeled rows as errors and understate the accuracy.
sum(diag(permit_pred_table)) / sum(permit_pred_table)
## [1] 0.8640633