1 Reading in the Data

loan01 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational01.csv", header = TRUE)[, -1]
loan02 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational02.csv", header = TRUE)[, -1]
loan03 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational03.csv", header = TRUE)[, -1]
loan04 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational04.csv", header = TRUE)[, -1]
loan05 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational05.csv", header = TRUE)[, -1]
loan06 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational06.csv", header = TRUE)[, -1]
loan07 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational07.csv", header = TRUE)[, -1]
loan08 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational08.csv", header = TRUE)[, -1]
loan09 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational09.csv", header = TRUE)[, -1]
loan = rbind(loan01, loan02, loan03, loan04, loan05, loan06, loan07, loan08, loan09)

2 1. Delete all records whose MIS_Status is missing

loan <- loan[!(loan$MIS_Status %in% ""),]

loan %>% group_by(MIS_Status) %>% summarize(n=n())
## # A tibble: 2 × 2
##   MIS_Status      n
##   <chr>       <int>
## 1 CHGOFF     157558
## 2 P I F      739609

3 2. Change all currency format variables to regular numerical variables

loan$DisbursementGross <- as.numeric(gsub("[\\$,]", "", loan$DisbursementGross))
loan$BalanceGross <- as.numeric(gsub("[\\$,]", "", loan$BalanceGross))
loan$ChgOffPrinGr <- as.numeric(gsub("[\\$,]", "", loan$ChgOffPrinGr))
loan$GrAppv <- as.numeric(gsub("[\\$,]", "", loan$GrAppv))
loan$SBA_Appv <- as.numeric(gsub("[\\$,]", "", loan$SBA_Appv))

3.1 3. Combine sparse categories of categorical variables

loan %>% group_by(State) %>% summarize(n = n())
## # A tibble: 52 × 2
##    State      n
##    <chr>  <int>
##  1 ""        13
##  2 "AK"    2403
##  3 "AL"    8360
##  4 "AR"    6333
##  5 "AZ"   17624
##  6 "CA"  130488
##  7 "CO"   20598
##  8 "CT"   12127
##  9 "DC"    1613
## 10 "DE"    2193
## # ℹ 42 more rows
loan <- loan[!(loan$State %in% ""),]

loan <- loan %>% mutate(
  StateRegion = case_when(
    State %in% c("MA", "ME", "VT", "NH", "CT", "RI") ~ "Northeast",
    State %in% c("NY", "PA", "NJ") ~ "Middle Atlantic",
    State %in% c("WV", "MD", "DE", "DC", "VA", "NC", "GA", "FL", "SC") ~ "South Atlantic",
    State %in% c("KY", "TN", "AL", "MS") ~ "East South Central", 
    State %in% c("OK", "AR", "LA", "TX") ~ "West South Central",
    State %in% c("OH", "IN", "IL", "MI", "WI") ~ "East North Central",
    State %in% c("MN", "IA", "MO", "KS", "NE", "SD", "ND") ~ "West North Central",
    State %in% c("MT", "WY", "CO", "UT", "AZ", "NM","NV", "ID") ~ "Mountain",
    State %in% c("AK", "OR", "CA", "AK", "HI","WA") ~ "Pacific",
    TRUE ~ "Other"
  )
)

loan %>% group_by(StateRegion) %>% summarize(n = n())
## # A tibble: 9 × 2
##   StateRegion             n
##   <chr>               <int>
## 1 East North Central 117685
## 2 East South Central  33148
## 3 Middle Atlantic    116205
## 4 Mountain            92113
## 5 Northeast           69560
## 6 Pacific            170794
## 7 South Atlantic     116902
## 8 West North Central  84498
## 9 West South Central  96249

3.2 4. Calculate default rates by state region

statetotals <- loan %>% group_by(StateRegion) %>% summarize(n=n())
default <- loan %>% group_by(StateRegion, MIS_Status) %>% summarize(n=n())
## `summarise()` has grouped output by 'StateRegion'. You can override using the
## `.groups` argument.
chgoff <- data.frame(StateRegion=c("", "", "", "", "", "", "", "", "","","","","","","","","",""), CHGOFF=c(0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0))

for(row1 in 1:nrow(default)){
  if(default$MIS_Status[row1] == "CHGOFF"){
    chgoff$StateRegion[row1] <- default$StateRegion[row1]
    chgoff$CHGOFF[row1] <- default$n[row1]
  }
}

chgoff <- chgoff %>% filter(CHGOFF>0)

rate <- merge(statetotals, chgoff, by = "StateRegion")
rate$DefaultRate <- rate$CHGOFF/rate$n  

knitr::kable(rate)
StateRegion n CHGOFF DefaultRate
East North Central 117685 21712 0.1844925
East South Central 33148 6098 0.1839628
Middle Atlantic 116205 21368 0.1838819
Mountain 92113 15296 0.1660569
Northeast 69560 8252 0.1186314
Pacific 170794 29947 0.1753399
South Atlantic 116902 26923 0.2303040
West North Central 84498 10357 0.1225709
West South Central 96249 17603 0.1828902

3.3 5. Discretize GrAppv into 5 categories

loan <- loan %>% mutate(
  GrAppvGroup = case_when(
    GrAppv > 0 & GrAppv <= 25000 ~ "0-25000",
    GrAppv > 25000 & GrAppv <= 60000 ~ ">25000-60000",
    GrAppv > 60000 & GrAppv <= 130000 ~ ">60000-130000",
    GrAppv >130000 & GrAppv <= 300000 ~ ">130000-300000", 
    GrAppv > 300000 ~ ">300000" 
  )
)

loan %>% group_by(GrAppvGroup) %>% summarize(n=n())
## # A tibble: 5 × 2
##   GrAppvGroup         n
##   <chr>           <int>
## 1 0-25000        179287
## 2 >130000-300000 176473
## 3 >25000-60000   193665
## 4 >300000        166517
## 5 >60000-130000  181212

3.4 6. Plot density curves for SBA_Appv of all 5 sub-populations defined in (5)

plot(density(loan$SBA_Appv[which(loan$GrAppvGroup=="0-25000")]), xlim = c(0, 300000))
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">25000-60000")]), col="blue")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">60000-130000")]), col = "red")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">130000-300000")]), col="green")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">300000")]), col="yellow")