3. Combine sparse
categories of categorical variables
loan %>% group_by(State) %>% summarize(n = n())
## # A tibble: 52 × 2
## State n
## <chr> <int>
## 1 "" 13
## 2 "AK" 2403
## 3 "AL" 8360
## 4 "AR" 6333
## 5 "AZ" 17624
## 6 "CA" 130488
## 7 "CO" 20598
## 8 "CT" 12127
## 9 "DC" 1613
## 10 "DE" 2193
## # ℹ 42 more rows
loan <- loan[!(loan$State %in% ""),]
loan <- loan %>% mutate(
StateRegion = case_when(
State %in% c("MA", "ME", "VT", "NH", "CT", "RI") ~ "Northeast",
State %in% c("NY", "PA", "NJ") ~ "Middle Atlantic",
State %in% c("WV", "MD", "DE", "DC", "VA", "NC", "GA", "FL", "SC") ~ "South Atlantic",
State %in% c("KY", "TN", "AL", "MS") ~ "East South Central",
State %in% c("OK", "AR", "LA", "TX") ~ "West South Central",
State %in% c("OH", "IN", "IL", "MI", "WI") ~ "East North Central",
State %in% c("MN", "IA", "MO", "KS", "NE", "SD", "ND") ~ "West North Central",
State %in% c("MT", "WY", "CO", "UT", "AZ", "NM","NV", "ID") ~ "Mountain",
State %in% c("AK", "OR", "CA", "AK", "HI","WA") ~ "Pacific",
TRUE ~ "Other"
)
)
loan %>% group_by(StateRegion) %>% summarize(n = n())
## # A tibble: 9 × 2
## StateRegion n
## <chr> <int>
## 1 East North Central 117685
## 2 East South Central 33148
## 3 Middle Atlantic 116205
## 4 Mountain 92113
## 5 Northeast 69560
## 6 Pacific 170794
## 7 South Atlantic 116902
## 8 West North Central 84498
## 9 West South Central 96249
4. Calculate default
rates by state region
statetotals <- loan %>% group_by(StateRegion) %>% summarize(n=n())
default <- loan %>% group_by(StateRegion, MIS_Status) %>% summarize(n=n())
## `summarise()` has grouped output by 'StateRegion'. You can override using the
## `.groups` argument.
chgoff <- data.frame(StateRegion=c("", "", "", "", "", "", "", "", "","","","","","","","","",""), CHGOFF=c(0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0))
for(row1 in 1:nrow(default)){
if(default$MIS_Status[row1] == "CHGOFF"){
chgoff$StateRegion[row1] <- default$StateRegion[row1]
chgoff$CHGOFF[row1] <- default$n[row1]
}
}
chgoff <- chgoff %>% filter(CHGOFF>0)
rate <- merge(statetotals, chgoff, by = "StateRegion")
rate$DefaultRate <- rate$CHGOFF/rate$n
knitr::kable(rate)
East North Central |
117685 |
21712 |
0.1844925 |
East South Central |
33148 |
6098 |
0.1839628 |
Middle Atlantic |
116205 |
21368 |
0.1838819 |
Mountain |
92113 |
15296 |
0.1660569 |
Northeast |
69560 |
8252 |
0.1186314 |
Pacific |
170794 |
29947 |
0.1753399 |
South Atlantic |
116902 |
26923 |
0.2303040 |
West North Central |
84498 |
10357 |
0.1225709 |
West South Central |
96249 |
17603 |
0.1828902 |
5. Discretize GrAppv
into 5 categories
loan <- loan %>% mutate(
GrAppvGroup = case_when(
GrAppv > 0 & GrAppv <= 25000 ~ "0-25000",
GrAppv > 25000 & GrAppv <= 60000 ~ ">25000-60000",
GrAppv > 60000 & GrAppv <= 130000 ~ ">60000-130000",
GrAppv >130000 & GrAppv <= 300000 ~ ">130000-300000",
GrAppv > 300000 ~ ">300000"
)
)
loan %>% group_by(GrAppvGroup) %>% summarize(n=n())
## # A tibble: 5 × 2
## GrAppvGroup n
## <chr> <int>
## 1 0-25000 179287
## 2 >130000-300000 176473
## 3 >25000-60000 193665
## 4 >300000 166517
## 5 >60000-130000 181212
6. Plot density
curves for SBA_Appv of all 5 sub-populations defined in (5)
plot(density(loan$SBA_Appv[which(loan$GrAppvGroup=="0-25000")]), xlim = c(0, 300000))
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">25000-60000")]), col="blue")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">60000-130000")]), col = "red")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">130000-300000")]), col="green")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">300000")]), col="yellow")
