1 Reading in the Data

loan01 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational01.csv", header = TRUE)[, -1]
loan02 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational02.csv", header = TRUE)[, -1]
loan03 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational03.csv", header = TRUE)[, -1]
loan04 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational04.csv", header = TRUE)[, -1]
loan05 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational05.csv", header = TRUE)[, -1]
loan06 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational06.csv", header = TRUE)[, -1]
loan07 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational07.csv", header = TRUE)[, -1]
loan08 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational08.csv", header = TRUE)[, -1]
loan09 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational09.csv", header = TRUE)[, -1]
loan = rbind(loan01, loan02, loan03, loan04, loan05, loan06, loan07, loan08, loan09)

2 1. Delete all records whose MIS_Status is missing

loan <- loan[!(loan$MIS_Status %in% ""),]

loan %>% group_by(MIS_Status) %>% summarize(n=n())

## # A tibble: 2 × 2
##   MIS_Status      n
##   <chr>       <int>
## 1 CHGOFF     157558
## 2 P I F      739609

3 2. Change all currency format variables to regular numerical variables

loan$DisbursementGross <- as.numeric(gsub("[\\$,]", "", loan$DisbursementGross))
loan$BalanceGross <- as.numeric(gsub("[\\$,]", "", loan$BalanceGross))
loan$ChgOffPrinGr <- as.numeric(gsub("[\\$,]", "", loan$ChgOffPrinGr))
loan$GrAppv <- as.numeric(gsub("[\\$,]", "", loan$GrAppv))
loan$SBA_Appv <- as.numeric(gsub("[\\$,]", "", loan$SBA_Appv))

3.1 3. Combine sparse categories of categorical variables

loan %>% group_by(State) %>% summarize(n = n())

## # A tibble: 52 × 2
##    State      n
##    <chr>  <int>
##  1 ""        13
##  2 "AK"    2403
##  3 "AL"    8360
##  4 "AR"    6333
##  5 "AZ"   17624
##  6 "CA"  130488
##  7 "CO"   20598
##  8 "CT"   12127
##  9 "DC"    1613
## 10 "DE"    2193
## # ℹ 42 more rows

loan <- loan[!(loan$State %in% ""),]

loan <- loan %>% mutate(
  StateRegion = case_when(
    State %in% c("MA", "ME", "VT", "NH", "CT", "RI") ~ "Northeast",
    State %in% c("NY", "PA", "NJ") ~ "Middle Atlantic",
    State %in% c("WV", "MD", "DE", "DC", "VA", "NC", "GA", "FL", "SC") ~ "South Atlantic",
    State %in% c("KY", "TN", "AL", "MS") ~ "East South Central", 
    State %in% c("OK", "AR", "LA", "TX") ~ "West South Central",
    State %in% c("OH", "IN", "IL", "MI", "WI") ~ "East North Central",
    State %in% c("MN", "IA", "MO", "KS", "NE", "SD", "ND") ~ "West North Central",
    State %in% c("MT", "WY", "CO", "UT", "AZ", "NM","NV", "ID") ~ "Mountain",
    State %in% c("AK", "OR", "CA", "AK", "HI","WA") ~ "Pacific",
    TRUE ~ "Other"
  )
)

loan %>% group_by(StateRegion) %>% summarize(n = n())

## # A tibble: 9 × 2
##   StateRegion             n
##   <chr>               <int>
## 1 East North Central 117685
## 2 East South Central  33148
## 3 Middle Atlantic    116205
## 4 Mountain            92113
## 5 Northeast           69560
## 6 Pacific            170794
## 7 South Atlantic     116902
## 8 West North Central  84498
## 9 West South Central  96249

3.2 4. Calculate default rates by state region

statetotals <- loan %>% group_by(StateRegion) %>% summarize(n=n())
default <- loan %>% group_by(StateRegion, MIS_Status) %>% summarize(n=n())

## `summarise()` has grouped output by 'StateRegion'. You can override using the
## `.groups` argument.

chgoff <- data.frame(StateRegion=c("", "", "", "", "", "", "", "", "","","","","","","","","",""), CHGOFF=c(0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0))

for(row1 in 1:nrow(default)){
  if(default$MIS_Status[row1] == "CHGOFF"){
    chgoff$StateRegion[row1] <- default$StateRegion[row1]
    chgoff$CHGOFF[row1] <- default$n[row1]
  }
}

chgoff <- chgoff %>% filter(CHGOFF>0)

rate <- merge(statetotals, chgoff, by = "StateRegion")
rate$DefaultRate <- rate$CHGOFF/rate$n  

knitr::kable(rate)

StateRegion	n	CHGOFF	DefaultRate
East North Central	117685	21712	0.1844925
East South Central	33148	6098	0.1839628
Middle Atlantic	116205	21368	0.1838819
Mountain	92113	15296	0.1660569
Northeast	69560	8252	0.1186314
Pacific	170794	29947	0.1753399
South Atlantic	116902	26923	0.2303040
West North Central	84498	10357	0.1225709
West South Central	96249	17603	0.1828902

3.3 5. Discretize GrAppv into 5 categories

loan <- loan %>% mutate(
  GrAppvGroup = case_when(
    GrAppv > 0 & GrAppv <= 25000 ~ "0-25000",
    GrAppv > 25000 & GrAppv <= 60000 ~ ">25000-60000",
    GrAppv > 60000 & GrAppv <= 130000 ~ ">60000-130000",
    GrAppv >130000 & GrAppv <= 300000 ~ ">130000-300000", 
    GrAppv > 300000 ~ ">300000" 
  )
)

loan %>% group_by(GrAppvGroup) %>% summarize(n=n())

## # A tibble: 5 × 2
##   GrAppvGroup         n
##   <chr>           <int>
## 1 0-25000        179287
## 2 >130000-300000 176473
## 3 >25000-60000   193665
## 4 >300000        166517
## 5 >60000-130000  181212

3.4 6. Plot density curves for SBA_Appv of all 5 sub-populations defined in (5)

plot(density(loan$SBA_Appv[which(loan$GrAppvGroup=="0-25000")]), xlim = c(0, 300000))
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">25000-60000")]), col="blue")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">60000-130000")]), col = "red")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">130000-300000")]), col="green")
lines(density(loan$SBA_Appv[which(loan$GrAppvGroup==">300000")]), col="yellow")

STA 321 Assignment 6: EDA amd Applications of Combined Bank Loan Data

Alice Xiang

3/3/2024

1 Reading in the Data

2 1. Delete all records whose MIS_Status is missing

3 2. Change all currency format variables to regular numerical variables

3.1 3. Combine sparse categories of categorical variables

3.2 4. Calculate default rates by state region

3.3 5. Discretize GrAppv into 5 categories

3.4 6. Plot density curves for SBA_Appv of all 5 sub-populations defined in (5)