title: “DA 6813 Final Project” author: “Marina Allen” date: “12/2/2021” output: html_document —
library(readr)
## Warning: package 'readr' was built under R version 4.0.5
SBAnational <- read_csv("https://uofi.box.com/shared/static/vi37omgitiaa2yyplrom779qvwk1g14x.csv", col_types = cols(BalanceGross = col_number(), DisbursementGross = col_number(), GrAppv = col_number(), SBA_Appv = col_number()))
drops <- c("LoanNr_ChkDgt", "Name","City","Zip","NAICS", "ApprovalDate", "ApprovalFY","Bank", "ChgOffDate","DisbursementDate","ChgOffPrinGr")
SBAdata = SBAnational[ , !(names(SBAnational) %in% drops)]
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.0.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.0.5
SBAdata$FranchiseCode[SBAdata$FranchiseCode > 1] = 2 # all large franchise
SBAdata = sqldf('select * from SBAdata where (MIS_Status = "P I F" or MIS_Status = "CHGOFF") and (NewExist = 1 or NewExist = 2) and (FranchiseCode = 0 or FranchiseCode = 1 or FranchiseCode = 2) and (UrbanRural = 1 or UrbanRural = 2) and (RevLineCr = "Y" or RevLineCr = "N") and (LowDoc = "Y" or LowDoc = "N") and (BankState <> "AN")')
SBAdata= na.omit(SBAdata)
Copy_SBAdata = SBAdata
# Change character data to input to factor
SBAdata$MIS_Status = factor(SBAdata$MIS_Status)
SBAdata$State = factor(SBAdata$State)
SBAdata$BankState = factor(SBAdata$BankState)
SBAdata$RevLineCr = factor(SBAdata$RevLineCr)
SBAdata$LowDoc = factor(SBAdata$LowDoc)
SBAdata$FranchiseCode = factor(SBAdata$FranchiseCode)
summary(SBAdata)
## State BankState Term NoEmp
## CA : 48477 NC : 53071 Min. : 0.00 Min. : 0.000
## NY : 28837 IL : 37020 1st Qu.: 50.00 1st Qu.: 2.000
## TX : 22231 CA : 35161 Median : 84.00 Median : 3.000
## FL : 20296 RI : 33377 Mean : 82.73 Mean : 8.339
## PA : 16068 OH : 30342 3rd Qu.: 84.00 3rd Qu.: 8.000
## OH : 15623 VA : 18213 Max. :527.00 Max. :8000.000
## (Other):207788 (Other):152136
## NewExist CreateJob RetainedJob FranchiseCode
## Min. :1.000 Min. : 0.000 Min. : 0.000 0:174197
## 1st Qu.:1.000 1st Qu.: 0.000 1st Qu.: 1.000 1:173852
## Median :1.000 Median : 0.000 Median : 2.000 2: 11271
## Mean :1.269 Mean : 2.036 Mean : 5.683
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 6.000
## Max. :2.000 Max. :5085.000 Max. :7250.000
##
## UrbanRural RevLineCr LowDoc DisbursementGross BalanceGross
## Min. :1.000 N:167459 N:357759 Min. : 4000 Min. : 0.0
## 1st Qu.:1.000 Y:191861 Y: 1561 1st Qu.: 28000 1st Qu.: 0.0
## Median :1.000 Median : 63482 Median : 0.0
## Mean :1.181 Mean : 162311 Mean : 5.6
## 3rd Qu.:1.000 3rd Qu.: 160143 3rd Qu.: 0.0
## Max. :2.000 Max. :11446325 Max. :996262.0
##
## MIS_Status GrAppv SBA_Appv
## CHGOFF: 94663 Min. : 1000 Min. : 500
## P I F :264657 1st Qu.: 25000 1st Qu.: 12500
## Median : 50000 Median : 25000
## Mean : 139130 Mean : 101986
## 3rd Qu.: 120000 3rd Qu.: 75000
## Max. :5000000 Max. :4500000
##
str(SBAdata)
## 'data.frame': 359320 obs. of 16 variables:
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 28 3 5 10 19 44 1 44 5 6 ...
## $ BankState : Factor w/ 52 levels "AK","AL","AR",..: 24 26 43 10 19 43 43 43 43 15 ...
## $ Term : num 162 126 83 84 60 80 83 26 36 54 ...
## $ NoEmp : num 2 7 18 4 3 2 2 4 18 4 ...
## $ NewExist : num 2 1 2 1 1 1 1 2 1 1 ...
## $ CreateJob : num 0 0 5 0 0 4 0 1 1 8 ...
## $ RetainedJob : num 0 0 23 4 0 6 2 5 19 4 ...
## $ FranchiseCode : Factor w/ 3 levels "0","1","2": 3 2 2 2 2 2 2 2 2 2 ...
## $ UrbanRural : num 1 1 1 1 1 2 1 2 1 2 ...
## $ RevLineCr : Factor w/ 2 levels "N","Y": 1 1 2 2 1 2 2 2 2 2 ...
## $ LowDoc : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ DisbursementGross: num 253400 137300 438541 51440 50000 ...
## $ BalanceGross : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MIS_Status : Factor w/ 2 levels "CHGOFF","P I F": 2 2 2 2 2 2 2 1 2 1 ...
## $ GrAppv : num 253400 137300 100000 35000 50000 ...
## $ SBA_Appv : num 190050 116705 50000 17500 42500 ...
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
train <- sample(nrow(SBAdata), 0.7*nrow(SBAdata), replace = FALSE)
TrainSet <- SBAdata[train,]
ValidSet <- SBAdata[-train,]
model <- randomForest(MIS_Status ~ ., data = TrainSet, ntree = 100, mtry = 5, importance = TRUE)
model
##
## Call:
## randomForest(formula = MIS_Status ~ ., data = TrainSet, ntree = 100, mtry = 5, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 5.72%
## Confusion matrix:
## CHGOFF P I F class.error
## CHGOFF 57433 9050 0.13612502
## P I F 5346 179694 0.02889105
# Predicting on Validation set
predValid <- predict(model, ValidSet, type = "class")
# Checking classification accuracy
accuracy = mean(predValid == ValidSet$MIS_Status)
#confusion matrix
table(predValid,ValidSet$MIS_Status)
##
## predValid CHGOFF P I F
## CHGOFF 24373 2203
## P I F 3807 77414
print(accuracy)
## [1] 0.9442471
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
varImpPlot(model)
features = c("State","BankState","Term","FranchiseCode","DisbursementGross","MIS_Status")
temp = SBAdata[ , (names(SBAdata) %in% features)]
SBAdata = temp
total_performance = 0
best_logistic = 0
best_score = 0
best_table = 0
for (i in 1:10){
index_1 = createDataPartition(SBAdata$State, p = 0.85, list = F)
index_2 = createDataPartition(SBAdata$BankState, p = 0.85, list = F)
train = intersect(index_1, index_2)
TrainSet <- SBAdata[train,]
ValidSet <- SBAdata[-train,]
glm.fit <- glm(MIS_Status ~ State + BankState + Term + FranchiseCode + DisbursementGross, data = TrainSet, family = binomial)
#levels(ValidSet$MIS_Status)
glm.probs <- predict(glm.fit, newdata = ValidSet, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "P I F", "CHGOFF")
cur_score = mean(glm.pred == ValidSet$MIS_Status)
total_performance = total_performance + cur_score
if (cur_score > best_score){
best_score = cur_score
best_logistic = glm.fit
best_table = table(glm.pred, ValidSet$MIS_Status)
}
}
summary(best_logistic)
##
## Call:
## glm(formula = MIS_Status ~ State + BankState + Term + FranchiseCode +
## DisbursementGross, family = binomial, data = TrainSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3230 -0.5882 0.3903 0.6587 2.7821
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.192e-01 3.442e-01 -2.670 0.007577 **
## StateAL -1.134e+00 2.064e-01 -5.495 3.92e-08 ***
## StateAR -1.711e+00 2.161e-01 -7.916 2.45e-15 ***
## StateAZ -1.268e+00 1.974e-01 -6.424 1.33e-10 ***
## StateCA -1.085e+00 1.947e-01 -5.572 2.51e-08 ***
## StateCO -1.167e+00 1.974e-01 -5.914 3.34e-09 ***
## StateCT -2.555e-01 2.011e-01 -1.271 0.203826
## StateDC -5.994e-01 2.333e-01 -2.570 0.010181 *
## StateDE -9.112e-01 2.197e-01 -4.147 3.37e-05 ***
## StateFL -1.413e+00 1.955e-01 -7.230 4.84e-13 ***
## StateGA -1.595e+00 1.973e-01 -8.082 6.38e-16 ***
## StateHI -1.398e+00 2.537e-01 -5.509 3.61e-08 ***
## StateIA -7.706e-01 2.184e-01 -3.528 0.000419 ***
## StateID -7.850e-01 2.042e-01 -3.844 0.000121 ***
## StateIL -1.005e+00 1.961e-01 -5.127 2.95e-07 ***
## StateIN -8.642e-01 1.994e-01 -4.334 1.46e-05 ***
## StateKS -1.026e+00 2.106e-01 -4.872 1.11e-06 ***
## StateKY -9.483e-01 2.051e-01 -4.625 3.75e-06 ***
## StateLA -1.258e+00 2.053e-01 -6.128 8.87e-10 ***
## StateMA -5.183e-01 1.980e-01 -2.618 0.008854 **
## StateMD -9.009e-01 1.997e-01 -4.511 6.44e-06 ***
## StateME -9.557e-02 2.181e-01 -0.438 0.661217
## StateMI -1.079e+00 1.968e-01 -5.483 4.19e-08 ***
## StateMN -7.946e-01 1.997e-01 -3.979 6.91e-05 ***
## StateMO -1.064e+00 2.006e-01 -5.303 1.14e-07 ***
## StateMS -1.665e+00 2.147e-01 -7.757 8.70e-15 ***
## StateMT -3.182e-01 2.453e-01 -1.297 0.194672
## StateNC -6.952e-01 1.990e-01 -3.494 0.000477 ***
## StateND 4.452e-01 2.845e-01 1.564 0.117701
## StateNE -6.083e-01 2.284e-01 -2.663 0.007740 **
## StateNH -5.433e-01 2.028e-01 -2.680 0.007373 **
## StateNJ -8.215e-01 1.968e-01 -4.174 3.00e-05 ***
## StateNM -4.522e-01 2.192e-01 -2.063 0.039113 *
## StateNV -1.706e+00 2.012e-01 -8.481 < 2e-16 ***
## StateNY -7.496e-01 1.954e-01 -3.836 0.000125 ***
## StateOH -6.859e-01 1.962e-01 -3.496 0.000472 ***
## StateOK -1.099e+00 2.093e-01 -5.250 1.52e-07 ***
## StateOR -5.990e-01 1.999e-01 -2.997 0.002727 **
## StatePA -7.015e-01 1.970e-01 -3.562 0.000369 ***
## StateRI -2.588e-01 2.032e-01 -1.274 0.202809
## StateSC -7.256e-01 2.075e-01 -3.497 0.000471 ***
## StateSD 1.976e-01 2.342e-01 0.844 0.398875
## StateTN -1.370e+00 2.025e-01 -6.766 1.33e-11 ***
## StateTX -7.963e-01 1.956e-01 -4.070 4.70e-05 ***
## StateUT -9.121e-01 2.009e-01 -4.541 5.60e-06 ***
## StateVA -5.611e-01 1.994e-01 -2.814 0.004893 **
## StateVT -4.761e-02 2.327e-01 -0.205 0.837890
## StateWA -9.279e-01 1.977e-01 -4.694 2.68e-06 ***
## StateWI -6.197e-01 2.017e-01 -3.073 0.002122 **
## StateWV -4.992e-01 2.330e-01 -2.142 0.032160 *
## StateWY -1.336e-01 3.006e-01 -0.444 0.656875
## BankStateAL -2.556e-01 3.971e-01 -0.644 0.519882
## BankStateAR 1.183e+00 4.074e-01 2.904 0.003683 **
## BankStateAZ 1.106e+00 4.341e-01 2.547 0.010866 *
## BankStateCA -9.173e-01 3.935e-01 -2.331 0.019741 *
## BankStateCO 5.472e-01 4.174e-01 1.311 0.189845
## BankStateCT -9.542e-02 4.017e-01 -0.238 0.812249
## BankStateDC -1.004e+00 4.787e-01 -2.097 0.036006 *
## BankStateDE 8.234e-02 3.943e-01 0.209 0.834575
## BankStateFL -8.251e-01 3.958e-01 -2.085 0.037096 *
## BankStateGA 1.048e+00 3.993e-01 2.626 0.008646 **
## BankStateHI 1.811e+00 4.456e-01 4.065 4.80e-05 ***
## BankStateIA 4.598e-01 4.132e-01 1.113 0.265775
## BankStateID 5.766e-02 4.095e-01 0.141 0.888021
## BankStateIL -3.825e-01 3.936e-01 -0.972 0.331145
## BankStateIN 1.457e+00 4.046e-01 3.600 0.000318 ***
## BankStateKS 1.446e+00 4.135e-01 3.496 0.000472 ***
## BankStateKY 9.555e-01 4.136e-01 2.310 0.020866 *
## BankStateLA 1.257e+00 4.217e-01 2.982 0.002868 **
## BankStateMA 1.010e+00 3.999e-01 2.527 0.011517 *
## BankStateMD 1.365e+00 4.075e-01 3.350 0.000808 ***
## BankStateME 1.135e+00 4.258e-01 2.666 0.007673 **
## BankStateMI 1.231e+00 4.053e-01 3.037 0.002388 **
## BankStateMN 9.292e-01 3.993e-01 2.327 0.019976 *
## BankStateMO 5.731e-01 3.995e-01 1.434 0.151443
## BankStateMS 2.152e+00 4.121e-01 5.223 1.76e-07 ***
## BankStateMT 1.581e+00 4.325e-01 3.655 0.000257 ***
## BankStateNC -3.577e-01 3.935e-01 -0.909 0.363323
## BankStateND 6.146e-01 4.444e-01 1.383 0.166668
## BankStateNE 8.017e-01 4.207e-01 1.906 0.056666 .
## BankStateNH 5.894e-01 4.095e-01 1.439 0.150087
## BankStateNJ 4.634e-01 4.030e-01 1.150 0.250272
## BankStateNM 1.056e+00 4.220e-01 2.503 0.012303 *
## BankStateNV 3.623e-01 4.126e-01 0.878 0.379903
## BankStateNY 1.971e-01 3.943e-01 0.500 0.617239
## BankStateOH 1.373e-01 3.936e-01 0.349 0.727181
## BankStateOK 7.783e-01 4.096e-01 1.900 0.057411 .
## BankStateOR -7.456e-01 3.959e-01 -1.883 0.059638 .
## BankStatePA 1.263e+00 3.982e-01 3.171 0.001519 **
## BankStatePR -4.129e-01 1.149e+00 -0.359 0.719381
## BankStateRI -1.664e-01 3.939e-01 -0.422 0.672711
## BankStateSC -2.263e+00 4.141e-01 -5.464 4.65e-08 ***
## BankStateSD -5.298e-01 3.934e-01 -1.347 0.178064
## BankStateTN -4.107e-02 4.095e-01 -0.100 0.920124
## BankStateTX -7.987e-02 3.961e-01 -0.202 0.840199
## BankStateUT 1.716e-02 3.968e-01 0.043 0.965507
## BankStateVA -1.073e+00 3.936e-01 -2.726 0.006401 **
## BankStateVT 1.859e+00 4.376e-01 4.248 2.16e-05 ***
## BankStateWA 3.351e-01 4.033e-01 0.831 0.405961
## BankStateWI 3.217e-01 3.988e-01 0.807 0.419890
## BankStateWV -2.879e-01 4.590e-01 -0.627 0.530604
## BankStateWY 1.407e+00 5.383e-01 2.613 0.008978 **
## Term 4.132e-02 2.147e-04 192.401 < 2e-16 ***
## FranchiseCode1 8.517e-01 1.168e-02 72.903 < 2e-16 ***
## FranchiseCode2 -4.071e-03 3.496e-02 -0.116 0.907302
## DisbursementGross 2.210e-07 2.760e-08 8.007 1.17e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 299698 on 259710 degrees of freedom
## Residual deviance: 219502 on 259605 degrees of freedom
## AIC: 219714
##
## Number of Fisher Scoring iterations: 6
best_score
## [1] 0.8309089
library(magrittr) # needs to be run every time you start R and want to use %>%
## Warning: package 'magrittr' was built under R version 4.0.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(class)
standardized.X = data.frame(scale(SBAdata[ , -c(1, 2, 4, 6)]))
summary(standardized.X)
## Term DisbursementGross
## Min. :-1.40239 Min. :-0.55314
## 1st Qu.:-0.55483 1st Qu.:-0.46928
## Median : 0.02151 Median :-0.34531
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.02151 3rd Qu.:-0.00757
## Max. : 7.53091 Max. :39.42652
unstandardized.X = data.frame(SBAdata[, -c(3, 5)])
combined.X = cbind(unstandardized.X, standardized.X)
summary(combined.X)
## State BankState FranchiseCode MIS_Status
## CA : 48477 NC : 53071 0:174197 CHGOFF: 94663
## NY : 28837 IL : 37020 1:173852 P I F :264657
## TX : 22231 CA : 35161 2: 11271
## FL : 20296 RI : 33377
## PA : 16068 OH : 30342
## OH : 15623 VA : 18213
## (Other):207788 (Other):152136
## Term DisbursementGross
## Min. :-1.40239 Min. :-0.55314
## 1st Qu.:-0.55483 1st Qu.:-0.46928
## Median : 0.02151 Median :-0.34531
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.02151 3rd Qu.:-0.00757
## Max. : 7.53091 Max. :39.42652
##
# Convert factor predictors into numerics to be used in KNN classification
combined.X$State = as.integer(combined.X$State)
combined.X$BankState = as.integer(combined.X$BankState)
# use 20% of data for test set
test.X = combined.X[, -4] %>% slice(1:69609)
train.X = combined.X[, -4] %>% slice(69610:348049)
test.Y = combined.X$MIS_Status[1:69609]
train.Y = combined.X$MIS_Status[69610:348049]
# 1st attempt with K = 1
set.seed(1)
knn_pred = knn(train.X, test.X, train.Y, k = 1)
k_1_error_rate = mean(test.Y != knn_pred) # KNN error rate
print(k_1_error_rate)
## [1] 0.1047135
table(knn_pred, test.Y)
## test.Y
## knn_pred CHGOFF P I F
## CHGOFF 17812 3254
## P I F 4035 44508
# 2nd Attempt with K = 3
set.seed(2)
knn_pred = knn(train.X, test.X, train.Y, k = 3)
k_3_error_rate = mean(test.Y != knn_pred) # KNN error rate
print(k_3_error_rate)
## [1] 0.09639558
table(knn_pred, test.Y)
## test.Y
## knn_pred CHGOFF P I F
## CHGOFF 17929 2792
## P I F 3918 44970
# 3rd Attempt with K = 5
set.seed(3)
knn_pred = knn(train.X, test.X, train.Y, k = 5)
k_5_error_rate = mean(test.Y != knn_pred) # KNN error rate
print(k_5_error_rate)
## [1] 0.09426942
table(knn_pred, test.Y)
## test.Y
## knn_pred CHGOFF P I F
## CHGOFF 18032 2747
## P I F 3815 45015
# 4th Attempt with K = 7
set.seed(4)
knn_pred = knn(train.X, test.X, train.Y, k = 7)
k_7_error_rate = mean(test.Y != knn_pred) # KNN error rate
print(k_7_error_rate)
## [1] 0.09240184
table(knn_pred, test.Y)
## test.Y
## knn_pred CHGOFF P I F
## CHGOFF 18076 2661
## P I F 3771 45101
# 5th Attempt with K = 10
set.seed(5)
knn_pred = knn(train.X, test.X, train.Y, k = 10)
k_10_error_rate = mean(test.Y != knn_pred) # KNN error rate
print(k_10_error_rate)
## [1] 0.09781781
table(knn_pred, test.Y)
## test.Y
## knn_pred CHGOFF P I F
## CHGOFF 17790 2752
## P I F 4057 45010
# 6th Attempt with K = 100
set.seed(100)
knn_pred= knn(train.X, test.X, train.Y, k = 100)
k_100_error_rate = mean(test.Y != knn_pred) # KNN error rate
print(k_100_error_rate)
## [1] 0.127541
table(knn_pred, test.Y)
## test.Y
## knn_pred CHGOFF P I F
## CHGOFF 16234 3265
## P I F 5613 44497
X = Copy_SBAdata$State == Copy_SBAdata$BankState
same_rate = sum(X, na.rm = TRUE) / nrow(Copy_SBAdata) *100
new.df = data.frame("state" = Copy_SBAdata$BankState, "default" = Copy_SBAdata$MIS_Status)
new.df = na.omit(new.df)
new.df$dfr = new.df$default == "CHGOFF"
new.df = new.df[,-2]
my.df = aggregate.data.frame(x = new.df$dfr, by=list(new.df$state), FUN = mean)
colnames(my.df) = c("state", "DefaultRate")
usmap::plot_usmap(data = my.df, values = "DefaultRate", lines = "black")+
scale_fill_continuous(low = "white", high = "cyan4", name = "Default Rates for Banks by State")
## Warning: Ignoring unknown parameters: lines
new.df = data.frame("state" = Copy_SBAdata$State, "default" = Copy_SBAdata$MIS_Status)
new.df = na.omit(new.df)
new.df$dfr = new.df$default == "CHGOFF"
new.df = new.df[,-2]
my.df = aggregate.data.frame(x = new.df$dfr, by=list(new.df$state), FUN = mean)
colnames(my.df) = c("state", "DefaultRate")
usmap::plot_usmap(data = my.df, values = "DefaultRate", lines = "black")+
scale_fill_continuous(low = "white", high = "deeppink", name = "Default Rates by State")
## Warning: Ignoring unknown parameters: lines
SBAPIF = SBAnational[SBAnational$MIS_Status == "P I F",]
## Warning: One or more parsing issues, see `problems()` for details
SBACHGOFF = SBAnational[SBAnational$MIS_Status == "CHGOFF",]
new.df = data.frame("approvalyear" = SBAPIF$ApprovalFY, "disbursementgross" = SBAPIF$DisbursementGross)
new.df = na.omit(new.df)
my.df = aggregate.data.frame(x = new.df$disbursementgross, by=list(new.df$approvalyear), FUN = mean)
colnames(my.df) = c("Year", "AvgLoanAmt")
ggplot2::ggplot(data = my.df)+
aes(x = Year, y = AvgLoanAmt)+
geom_line()+
labs(title = "Average Loan Amount Over Time for Loans Paid in Full", x = "Year", y = "Average Disbursement Gross")
new.df = data.frame("approvalyear" = SBACHGOFF$ApprovalFY, "disbursementgross" = SBACHGOFF$DisbursementGross)
new.df = na.omit(new.df)
my.df = aggregate.data.frame(x = new.df$disbursementgross, by=list(new.df$approvalyear), FUN = mean)
colnames(my.df) = c("Year", "AvgLoanAmt")
ggplot2::ggplot(data = my.df)+
aes(x = Year, y = AvgLoanAmt)+
geom_line()+
labs(title = "Average Loan Amount Over Time for Defaulted Loans", x = "Year", y = "Average Disbursement Gross")