library(distill)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)
library(ggplot2)
library("readxl")

BACKGROUND

One of the federal loan programs available to help finance college education is the parent PLUS program. This college financing program has grown in popularity through the years. It is a loan program that lets parents of undergraduates borrow money from the federal government to bridge the gap between the cost of attendance and available financial resources such as student loans, grants, and scholarships. According to the data from over 800 institutions, there are currently over 2.73 million students associated with the $84.3B outstanding loan balance in this program. There are minimal requirements to qualify for parent PLUS loans, namely being a parent of a dependent student and not having an adverse credit history (with exemptions allowed). There is no debt-to-income ratio requirement to secure any loan amount. These minimal requirements have made the loans and college education more accessible to parents, regardless of their ability to repay the debt. Parent debt is an inter-generational social science problem. Both parents and children are saddled with loans that threaten their current and future financial security.

RESEARCH AND METHODS

This multifaceted study uses quantitative methods to examine the characteristics of parent PLUS loans, including demographic profiles, loan amounts, and types of borrowers, and to analyze trends in repayment and default rates at over 800 post-secondary schools in the US.

        METHODS:
        -Data is sourced from March 2022  US Department of Education
        College Scorecard 
        -Missing Not At Random (MNAR) data (e.g. NULL, Privacy
        Suppressed) was handled using listwise deletion. Imputation
        of MNAR is not preferred as it will give misleading results
        -Exploratory data analysis, data visualization and
        descriptive statistics.
        -Correlation Matrix to evaluate strongly correlated variables
        -Simple and multiple regression analyses 
        -Post hoc test using EMM (Estimated Marginal Means)
        -Diagnostic Plots evaluation of regression model    

Data

# Read the loan workbook, drop every row that has a missing value (listwise
# deletion), and swap the raw column codes for readable names in one pass.
loans <- read_excel("LOANS.xlsx")
Eloans <- loans %>%
  na.omit() %>%
  rename(
    State = STABBR,
    SAT = SAT_AVG,
    COST = COSTT4_A,
    Students = LPPPLUS_CNT,
    Repay5yr = DBRR5_PP_UG_RT,
    Repay10yr = DBRR10_PP_UG_RT,
    Repay20yr = DBRR20_PP_UG_RT,
    ParentLoan = PLUS_DEBT_INST_MD,
    StudentLoan = DEBT_MDN,
    Default3yr = BBRR3_PP_UG_DFLT,
    NoDegree = OMENRUP_ALL_POOLED_SUPP,
    PellBorrowerLoan = PLUS_DEBT_INST_PELL_MD,
    NoPellBorrowerLoan = PLUS_DEBT_INST_NOPELL_MD,
    Parents = BBRR1_PP_UG_N
  )
# Rows and columns remaining after deletion.
dim(Eloans)
## [1] 865  35

Sum of Total outstanding Parent PLUS Loan balance

# Total of the outstanding Parent PLUS balance column over all retained rows.
TPL <- sum(Eloans[["LPPPLUS_AMT"]])
print(paste("Total outstanding Parent PLUS Loan balance is", TPL))
## [1] "Total outstanding Parent PLUS Loan balance is 84303788470"

Sum of Number of students associated with outstanding Parent PLUS Loan balances

# Total of the student-count column (renamed from LPPPLUS_CNT) over all rows.
TSL <- sum(Eloans[["Students"]])
print(
  paste(
    "Total Number of students associated with outstanding Parent PLUS Loan balances is",
    TSL
  )
)
## [1] "Total Number of students associated with outstanding Parent PLUS Loan balances is 2730197"

LOAN DISTRIBUTION ACROSS THE US

Fig 1. Who is borrowing how much?

Method: Descriptive Statistics

# Bar chart of parent-borrower counts by state, with bars shaded by cost of
# attendance. geom_col() draws the y values as given (identity stat).
ggplot(Eloans, aes(x = State, y = Parents, fill = COST)) +
  geom_col(position = "dodge") +
  labs(
    title = "Number of Parents who took Loans vs Cost to attend
       (One Academic Year Data)",
    x = "State",
    y = "Parents with Loans"
  ) +
  theme(
    plot.title = element_text(size = 12, hjust = 0.5),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    # State labels are rotated so they do not overlap.
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
    axis.text.y = element_text(size = 12)
  )

Observation

    -One Year Data Distribution
    -Pennsylvania had highest number of parent PLUS borrowers followed by 
    Arizona,Texas and Michigan, ave cost of attendance <$40,000
    -New York has highest ave cost of attendance > $40,000
    -California has a mix of high and mid-level ave cost of attendance.
    -Mean parent PLUS loans =$15,551
    -Ave cost of attendance =$27,548

LOAN AMOUNTS AND SCHOOL SELECTIVITY

Fig 2. Where are the loans disbursed?

Method: Regression Analysis

library(hexbin)
# Hex-binned scatter of parent loan amount against average SAT score, with a
# linear fit drawn on top of the bins.
ggplot(Eloans, aes(x = SAT, y = ParentLoan)) +
  geom_hex(bins = 20, color = "white") +
  geom_smooth(method = "lm") +
  scale_fill_gradient(low = "#00AFBB", high = "#FC4E07") +
  labs(title = "Loans Distribution Across Selective Universities (p-value<0.05)") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 14)
  )
## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of parent loan amount on average SAT score.
SAT_lm <- lm(formula = ParentLoan ~ SAT, data = Eloans)
summary(SAT_lm)
## 
## Call:
## lm(formula = ParentLoan ~ SAT, data = Eloans)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21865  -6885  -1560   4992  66762 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -42600.486   3577.839  -11.91   <2e-16 ***
## SAT             58.120      3.101   18.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10420 on 863 degrees of freedom
## Multiple R-squared:  0.2893, Adjusted R-squared:  0.2885 
## F-statistic: 351.3 on 1 and 863 DF,  p-value: < 2.2e-16

Observations

      -Figure 2 shows positive correlation between parent PLUS loan amounts and average SAT scores at the institutions. 
      -Higher loan amounts are disbursed at more selective schools with higher SAT score averages 
      -Universities with mid-range SAT scores disbursed more loans < $25,000

BORROWER BEHAVIOR

Fig 3. Does Income Influence Borrower Behavior?

Method: Regression Analysis

The maximum 4-year undergraduate student loan is $27,000 at an interest rate of 3.73%, while parent loans have no limit and a higher interest rate of 6.28%. This study used regression analysis to compare borrowing behaviors between two groups, namely parents of Pell and non-Pell grant recipients. Pell grants are need-based awards that do not have to be repaid and are available only to undergraduates who display exceptional financial need. Pell grants are awarded to 90.6% of households with income < $20,000 and 86.7% of households with income of $20K-$40K. The mean debt of parents of Pell grant recipients is $18,412, versus $30,821 for parents of non-Pell grant recipients.

# Regress the overall parent loan amount on the Pell-recipient parent loan
# amount.
PPL2 <- lm(formula = ParentLoan ~ PellBorrowerLoan, data = Eloans)
summary(PPL2)
## 
## Call:
## lm(formula = ParentLoan ~ PellBorrowerLoan, data = Eloans)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20307  -2563  -1041   1338  32771 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.962e+03  3.358e+02   8.819   <2e-16 ***
## PellBorrowerLoan 1.150e+00  1.605e-02  71.623   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4689 on 863 degrees of freedom
## Multiple R-squared:  0.856,  Adjusted R-squared:  0.8558 
## F-statistic:  5130 on 1 and 863 DF,  p-value: < 2.2e-16
# Regress the overall parent loan amount on the non-Pell parent loan amount.
PPL1 <- lm(formula = ParentLoan ~ NoPellBorrowerLoan, data = Eloans)
summary(PPL1)
## 
## Call:
## lm(formula = ParentLoan ~ NoPellBorrowerLoan, data = Eloans)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14541.6  -1441.7    303.1   1717.2  21088.1 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -7.704e+02  2.656e+02    -2.9  0.00382 ** 
## NoPellBorrowerLoan  8.079e-01  7.779e-03   103.9  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3363 on 863 degrees of freedom
## Multiple R-squared:  0.9259, Adjusted R-squared:  0.9258 
## F-statistic: 1.079e+04 on 1 and 863 DF,  p-value: < 2.2e-16
# Two scatter plots of parent loan amounts against student loan amounts, one
# per borrower group, each with a linear fit, shown side by side.
# NOTE(review): the R² values quoted in the titles equal those of the
# ParentLoan regressions (PPL2 and PPL1), not of the StudentLoan relationships
# drawn here - confirm which model the figure is meant to report.
# NOTE(review): "Recipents" in the second title looks like a typo.
L1 <- ggplot(Eloans, aes(x = StudentLoan, y = PellBorrowerLoan)) +
  geom_point(color = 2) +
  geom_smooth(method = "lm") +
  labs(
    title = "Pell Grant Recipients
         (𝑅²:  0.856,p-value: < 0.05)",
    x = "Student Loan Amounts",
    y = "Parent Loan Amounts"
  )
L2 <- ggplot(Eloans, aes(x = StudentLoan, y = NoPellBorrowerLoan)) +
  geom_point(color = 2) +
  geom_smooth(method = "lm") +
  labs(
    title = "Non-Pell Grant Recipents
         (𝑅²:0.9259,p-value: < 0.05)",
    x = "Student Loan Amounts",
    y = "Parent Loan Amounts"
  )
library(see)
plots(L1, L2, n_rows = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Columns fed to the correlation matrix: the default rate plus the candidate
# explanatory variables, in the order they should appear.
Default_Cor <- Eloans %>%
  select(
    Default3yr, ParentLoan, COST,
    Repay5yr, Repay10yr, Repay20yr,
    PellBorrowerLoan, NoPellBorrowerLoan, NoDegree
  )

Observations

      -Student maximum loan amounts for both groups are $27,000
      -Low- and moderate-income parents borrowed less amount compared
      to moderate-and high-income households. 
      -However, considering the median household income, the 
      debt-to-income ratio of low- and moderate-income parents of
      Pell grant recipients is much higher than parents of Non-Pell
      grant recipients. Parents with higher debt-to-income ratio are
      more likely to have difficulty repaying loans.
      -The median 3-year default rate on loans of parents of Pell grant
      recipients is higher, at 14%, compared to 10% for parents of Non-Pell
      grant recipients.

EXAMINING DEFAULT RATES

Fig 4. What influences default rates?

Method: Correlation Matrix

The College Scorecard has a rich and very wide data on aid, repayment, income, completion, etc on an institutional level. This study is limited to exploring some variables that may be correlated to default rates on parent loans. The correlation matrix to understand default rates factors the following variables: Loan Amounts, Cost of Attendance, 5-,10-,and 20-year Repayment Rates, Pell Grant and Non-Pell Grant Loan Amounts and Degree Completion.

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Correlation matrix of the selected columns, with the coefficient printed in
# each cell and a four-step blue/white/red colour scale.
ggcorr(
  Default_Cor,
  nbreaks = 4,
  low = "steelblue",
  mid = "white",
  high = "red",
  label = TRUE,
  label_size = 5,
  hjust = 0.85,
  layout.exp = 3
)

Observation

        The two variables with the highest correlation coefficients (positive
        correlation) to default rates are:
        1.***Repay5yr, (cor 0.6)*** - Undergraduate Parent PLUS Loan 
        dollar-based 5-year repayment rate
        2.***No Degree, (cor 0.5)***- Percentage of all students that did 
        not receive an award and whose enrollment status is unknown after 
        leaving this institution within 8 years of entry

PREDICTING DEFAULT RATES

Fig 5. Can default rates be predicted?

Method: Regression Analysis, Model Evaluation, Estimated Marginal Means (EMM) Post Hoc Test. Model: lm(formula = Default3yr ~ Repay5yr + NoDegree, data = Eloans)

Predicted values of a 3-year default rate using EMM (estimated marginal means) fitted with multiple linear regression model.

# Multiple regression of the 3-year default rate on the 5-year repayment rate
# and the no-degree share.
Default1 <- lm(formula = Default3yr ~ Repay5yr + NoDegree, data = Eloans)
summary(Default1)
## 
## Call:
## lm(formula = Default3yr ~ Repay5yr + NoDegree, data = Eloans)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.102440 -0.021622 -0.005443  0.016983  0.131494 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.041400   0.005007  -8.268 5.12e-16 ***
## Repay5yr     0.139053   0.006954  19.995  < 2e-16 ***
## NoDegree     0.111405   0.010970  10.155  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03111 on 862 degrees of freedom
## Multiple R-squared:  0.4659, Adjusted R-squared:  0.4646 
## F-statistic: 375.9 on 2 and 862 DF,  p-value: < 2.2e-16
library(ggeffects)
# Estimated marginal means of the two-predictor default model, plotted as
# predicted 3-year default rates. The model is refitted inline.
plot(
  ggemmeans(
    lm(Default3yr ~ Repay5yr + NoDegree, data = Eloans),
    terms = c("Repay5yr", "NoDegree")
  )
) +
  labs(title = "EMMEANS:Predicted Values of 3-year Default Rate") +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 14)
  )
## Loading required namespace: emmeans

Observation

The mean percentage of undergraduate student Parent PLUS Loan borrowers in default after 3 years is 14.7%, with a max of 59% (Data from 2,766 institutions)

    HIGHER DEFAULT RATE FOR: 
    -Parents  who entered a repayment plan within 5 years of loan disbursement AND
    -Parents whose students did not complete a degree

MODEL EVALUATION -Diagnostic Plots

# Same two-predictor specification as Default1, refitted under the name used
# by the diagnostics that follow.
Default_lm <- lm(formula = Default3yr ~ Repay5yr + NoDegree, data = Eloans)
summary(Default_lm)
## 
## Call:
## lm(formula = Default3yr ~ Repay5yr + NoDegree, data = Eloans)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.102440 -0.021622 -0.005443  0.016983  0.131494 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.041400   0.005007  -8.268 5.12e-16 ***
## Repay5yr     0.139053   0.006954  19.995  < 2e-16 ***
## NoDegree     0.111405   0.010970  10.155  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03111 on 862 degrees of freedom
## Multiple R-squared:  0.4659, Adjusted R-squared:  0.4646 
## F-statistic: 375.9 on 2 and 862 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic panels in a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so later figures are not squeezed
# into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(Default_lm)
par(old_par)

Assumptions: Multicollinearity and Heteroskedasticity

# Multicollinearity check (VIF and tolerance per predictor) for Default_lm.
performance::check_collinearity(Default_lm)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##      Term  VIF Increased SE Tolerance
##  Repay5yr 1.15         1.07      0.87
##  NoDegree 1.15         1.07      0.87

Low Correlation, No Multicollinearity, regression model is good.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Assumption: Heteroskedasticity, use Breusch-Pagan test

# Studentized Breusch-Pagan test on the untransformed two-predictor model.
# The model is fitted inline, so the printed `data:` line shows this lm() call.
lmtest::bptest(lm(Default3yr~NoDegree+Repay5yr, data=Eloans))
## 
##  studentized Breusch-Pagan test
## 
## data:  lm(Default3yr ~ NoDegree + Repay5yr, data = Eloans)
## BP = 20.909, df = 2, p-value = 2.882e-05

Result: At p-value <0.05, homoscedasticity rejected in favor of heteroskedasticity

Transformation:

Transformed dependent variable using Box Cox Transformation and applied Breusch-Pagan test

# Estimate the Box-Cox lambda for the 3-year default rate and show the fit.
Default_BCMod <- caret::BoxCoxTrans(Eloans[["Default3yr"]])
print(Default_BCMod)
## Box-Cox Transformation
## 
## 865 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02000 0.05000 0.08000 0.08081 0.10000 0.30000 
## 
## Largest/Smallest: 15 
## Sample Skewness: 1.18 
## 
## Estimated Lambda: 0.1 
## With fudge factor, Lambda = 0 will be used for transformations
# Append the Box-Cox-transformed default rate as column `dist_new`.
NEloans <- cbind(
  Eloans,
  dist_new = predict(Default_BCMod, Eloans[["Default3yr"]])
)
# Show the first six rows.
head(NEloans, n = 6L)
##                                INSTNM State  UGDS  SAT  COST PCTPELL PCTFLOAN
## 1            Alabama A & M University    AL  5271  939 23053  0.7019   0.7361
## 2 University of Alabama at Birmingham    AL 13328 1234 24495  0.3512   0.4798
## 3 University of Alabama in Huntsville    AL  7785 1319 23917  0.2536   0.3976
## 4            Alabama State University    AL   375  946 21866  0.7627   0.8232
## 5           The University of Alabama    AL   319 1261 29872  0.1772   0.3802
## 6     Auburn University at Montgomery    AL   447 1082 19849  0.4644   0.5391
##   LPSTAFFORD_CNT LPSTAFFORD_AMT Students LPPPLUS_AMT Parents BBRR1_PP_NOPELL_N
## 1          32214      952539931     5398   124751228    1062               150
## 2          59241     1805698825     3903    95560297     806               322
## 3          20387      423083320     1444    32335435     289               112
## 4          34809      968253338     4638    98316982    1135               147
## 5          77636     1837713282    12086   676026693    2568              1377
## 6          23837      511627872     1369    18241142     300                73
##   BBRR1_PP_PELL_N  CDR3 DBRR1_PP_UG_RT  Repay5yr Repay10yr Default3yr
## 1             912 0.176       1.155044 1.2040422 0.9109411       0.20
## 2             484 0.063       1.068891 0.9748241 0.6820359       0.08
## 3             177 0.065       1.040121 0.8645943 0.4597370       0.09
## 4             988 0.180       1.140748 1.2583712 0.8876563       0.20
## 5            1191 0.054       1.065396 0.9187667 0.6487990       0.07
## 6             227 0.109       1.097795 1.1208157 0.5972791       0.14
##   ParentLoan StudentLoan PPLUS_PCT_HIGH_POOLED_SUPP Repay20yr BBRR3_PP_PELL_N
## 1      16552       15250                         30 0.3771945             683
## 2      18586       15085                         10 0.4306524             353
## 3      15354       14000                         10 0.2390932             155
## 4      18289       17500                         25 0.3885780             711
## 5      44851       17671                         15 0.3586384            1062
## 6       8000       12000                         10 0.4559485             139
##   BBRR3_PP_PELL_DFLT BBRR3_PP_NOPELL_DFLT BBRR3_PP_NOPELL_N PellBorrowerLoan
## 1               0.29                 0.10               116            15944
## 2               0.14                 0.05               249            15304
## 3               0.20                 0.20                84            13300
## 4               0.39                 0.20                87            16882
## 5               0.10                 0.03              1082            38868
## 6               0.20                 0.20                64             7507
##   NoPellBorrowerLoan PELL_RPY_3YR_RT_SUPP NOPELL_RPY_3YR_RT_SUPP OMENRUP_ALL
## 1              19004            0.2390405              0.3492958      0.3016
## 2              22178            0.4716981              0.5892070      0.1832
## 3              16956            0.4832808              0.6783920      0.1716
## 4              25046            0.2079327              0.3333333      0.2262
## 5              48411            0.5159860              0.7126540      0.0820
## 6              10000            0.3853631              0.5775034      0.2807
##   NoDegree OMENRUP_PELL_ALL OMENRUP8_FTFT  dist_new
## 1   0.2789           0.3271        0.3243 -1.609438
## 2   0.2017           0.1915        0.1549 -2.525729
## 3   0.1730           0.2063        0.1450 -2.407946
## 4   0.2481           0.2543        0.2229 -1.609438
## 5   0.0813           0.1448        0.0623 -2.659260
## 6   0.2993           0.2945        0.2898 -1.966113
# Breusch-Pagan test on the Box-Cox-transformed response. Both predictors are
# included so the test covers the same specification that was tested before
# the transformation and that is fitted as the final transformed model; a
# model with NoDegree alone would not validate that specification.
# Re-knit the report to refresh the recorded test output.
lmMod_def <- lm(dist_new ~ NoDegree + Repay5yr, data = NEloans)
bptest(lmMod_def)
## 
##  studentized Breusch-Pagan test
## 
## data:  lmMod_def
## BP = 0.047752, df = 1, p-value = 0.827

Result: New p-value = 0.827 after transformation: homoscedasticity not rejected

Evaluate New Diagnostic Plots using Box Cox Transformed Variable

# Two-predictor model refitted with the Box-Cox-transformed default rate as
# the response.
Default_lm2 <- lm(formula = dist_new ~ Repay5yr + NoDegree, data = NEloans)
summary(Default_lm2)
## 
## Call:
## lm(formula = dist_new ~ Repay5yr + NoDegree, data = NEloans)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20735 -0.25334 -0.01247  0.29037  1.17292 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.06186    0.06509 -62.402   <2e-16 ***
## Repay5yr     1.58272    0.09041  17.507   <2e-16 ***
## NoDegree     1.40450    0.14261   9.849   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4045 on 862 degrees of freedom
## Multiple R-squared:  0.4149, Adjusted R-squared:  0.4135 
## F-statistic: 305.6 on 2 and 862 DF,  p-value: < 2.2e-16
# Diagnostic panels for the transformed model in a 2 x 2 grid. The previous
# graphics settings returned by par() are restored afterwards so later
# figures use the normal single-panel layout.
old_par <- par(mfrow = c(2, 2))
plot(Default_lm2)
par(old_par)

CONCLUSION

With rising costs of education, parent PLUS loans have become increasingly popular for families. The availability of parent PLUS loans has made post-secondary education accessible to more low- and moderate-income families. The mean cost of attendance per year is $38,144 and the mean parent PLUS loan is $24,131, with a max of $91,987 at an interest rate of 6.28%. According to the US Census Bureau, the median household income in the US is $67,521. The lack of a limit on loan amounts and the absence of a debt-to-income ratio requirement have allowed parents to borrow more than they are able to repay. The comparison of borrower behavior between parents of Pell grant recipients and parents of non-Pell grant recipients is alarming. Most Pell grant recipient parents have income <$40,000 and a median loan amount of $16,000, with a max of $95,089. Parents who entered repayment within 5 years of loan disbursement and whose students did not complete the degree are most likely to default on the loan. This is likely due to personal circumstances such as family obligations or seeking employment.

At a default rate of 14.7%, further analysis of this federal loan program is needed to better understand the problem and to prevent families from being trapped in debt, which exacerbates wealth disparities across races.

LIMITATIONS OF THE STUDY

  -The College Scorecard has data on 6,694 institutions all over the US, but 
  this study is limited to only 865 schools due to missing data (e.g. NULL and Privacy Suppressed).
  -The most recent data also discontinued collecting information relating to 
  income, an important variable that could increase statistical power of this
  model.
  -The researcher used listwise deletion to handle the missing data. Due to 
  the large size of the missing not at random (MNAR)  data, multiple 
  imputation is not preferred as it may provide misleading results.
  -Regression model used to analyze default rates was limited to a couple of
  the variables with high correlation coefficients. The College Scorecard 
  data is a very rich source of information. Unfortunately, the most recent
  data have discontinued releasing information on earnings and income. 
  Further examination and analysis of underlying factors (e.g. race, 
  first-generation students, gender, family characteristics) could probably 
  increase the explanatory power of the model.

FUTURE WORK: Loan Default Prediction using Decision Tree Classifier

The model currently has low accuracy, possibly due to the variance of the continuous variables. Further work is needed.

#' Split a data frame into leading "train" rows and trailing "test" rows
#'
#' The split is positional: the first `floor(size * nrow(Eloans))` rows form
#' the training part and all remaining rows form the test part. Rows are not
#' shuffled, so the result depends on the row order of the input.
#'
#' @param Eloans A data frame (or tibble) to split.
#' @param size Fraction of rows assigned to the training part, in [0, 1].
#' @param train If `TRUE`, return the training rows; otherwise the test rows.
#' @return A data frame holding the requested subset of rows, with the same
#'   columns as `Eloans`.
Eloans_train_test <- function(Eloans, size = 0.8, train = TRUE) {
  stopifnot(is.numeric(size), length(size) == 1L, size >= 0, size <= 1)
  n_row <- nrow(Eloans)
  n_train <- floor(size * n_row)
  # A logical mask avoids two pitfalls of `1:n` indexing: `1:0` counts down to
  # c(1, 0), and negative indexing with an empty index drops every row
  # instead of none.
  in_train <- seq_len(n_row) <= n_train
  if (train) {
    Eloans[in_train, , drop = FALSE]
  } else {
    Eloans[!in_train, , drop = FALSE]
  }
}
# 80/20 positional split of the cleaned data into training and test sets.
Eloans_train <- Eloans_train_test(Eloans, size = 0.8, train = TRUE)
Eloans_test <- Eloans_train_test(Eloans, size = 0.8, train = FALSE)
# Rows and columns of the training set.
dim(Eloans_train)
## [1] 692  35
# Rows and columns of the test set.
dim(Eloans_test)
## [1] 173  35
# Share of training rows at each distinct 3-year default rate value.
prop.table(table(Eloans_train$Default3yr))
## 
##        0.02        0.03        0.04        0.05        0.06        0.07 
## 0.031791908 0.044797688 0.075144509 0.235549133 0.062138728 0.010115607 
##        0.08        0.09         0.1        0.11        0.12        0.14 
## 0.036127168 0.174855491 0.187861272 0.001445087 0.011560694 0.069364162 
##        0.16        0.17        0.18        0.19         0.2        0.22 
## 0.007225434 0.002890173 0.002890173 0.031791908 0.005780347 0.004335260 
##        0.24         0.3 
## 0.002890173 0.001445087
# Share of test rows at each distinct 3-year default rate value.
prop.table(table(Eloans_test$Default3yr))
## 
##        0.02        0.03        0.04        0.05        0.06        0.07 
## 0.046242775 0.075144509 0.075144509 0.260115607 0.069364162 0.023121387 
##        0.08        0.09         0.1        0.12        0.14        0.16 
## 0.011560694 0.104046243 0.173410405 0.023121387 0.057803468 0.017341040 
##        0.18        0.19         0.2        0.22        0.24 
## 0.005780347 0.040462428 0.005780347 0.005780347 0.005780347
library(rpart)
library(rpart.plot)
# Regression tree (method "anova") for the 3-year default rate, fitted on the
# training rows only, then drawn.
Eloansfit <- rpart(
  formula = Default3yr ~ NoDegree + Repay5yr,
  data = Eloans_train,
  method = "anova"
)
rpart.plot(Eloansfit, extra = 101)

# Fitted values of the tree for the training rows.
# NOTE(review): predictions are made on the training set, not Eloans_test -
# confirm this is intended.
predict(Eloansfit, newdata = Eloans_train, type = "vector")
##          1          2          3          4          5          6          7 
## 0.19294118 0.13200000 0.09466667 0.19294118 0.10161290 0.19294118 0.07046512 
##          8          9         10         11         12         13         14 
## 0.07745342 0.07745342 0.14884615 0.19294118 0.09466667 0.07745342 0.07745342 
##         15         16         17         18         19         20         21 
## 0.19294118 0.05383886 0.09466667 0.07046512 0.13200000 0.19294118 0.09466667 
##         22         23         24         25         26         27         28 
## 0.07046512 0.05383886 0.13200000 0.07745342 0.13200000 0.09466667 0.07745342 
##         29         30         31         32         33         34         35 
## 0.13200000 0.05383886 0.05383886 0.05383886 0.07046512 0.09466667 0.07745342 
##         36         37         38         39         40         41         42 
## 0.05383886 0.09466667 0.05383886 0.05383886 0.09466667 0.05383886 0.05383886 
##         43         44         45         46         47         48         49 
## 0.09466667 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 
##         50         51         52         53         54         55         56 
## 0.05383886 0.05383886 0.07745342 0.05383886 0.07046512 0.07745342 0.07745342 
##         57         58         59         60         61         62         63 
## 0.05383886 0.09466667 0.13200000 0.05383886 0.05383886 0.07046512 0.09466667 
##         64         65         66         67         68         69         70 
## 0.09466667 0.07046512 0.05383886 0.05383886 0.05383886 0.07745342 0.05383886 
##         71         72         73         74         75         76         77 
## 0.05383886 0.05383886 0.07046512 0.05383886 0.05383886 0.07046512 0.05383886 
##         78         79         80         81         82         83         84 
## 0.07046512 0.05383886 0.07046512 0.07745342 0.09466667 0.07745342 0.05383886 
##         85         86         87         88         89         90         91 
## 0.05383886 0.07745342 0.05383886 0.07745342 0.07745342 0.07745342 0.07745342 
##         92         93         94         95         96         97         98 
## 0.05383886 0.09466667 0.07745342 0.14884615 0.07745342 0.05383886 0.07046512 
##         99        100        101        102        103        104        105 
## 0.10161290 0.10161290 0.07046512 0.09466667 0.19294118 0.07745342 0.14884615 
##        106        107        108        109        110        111        112 
## 0.07046512 0.05383886 0.05383886 0.14884615 0.10161290 0.19294118 0.09466667 
##        113        114        115        116        117        118        119 
## 0.07745342 0.07745342 0.07046512 0.14884615 0.09466667 0.07745342 0.09466667 
##        120        121        122        123        124        125        126 
## 0.10161290 0.09466667 0.07046512 0.07046512 0.13200000 0.07046512 0.05383886 
##        127        128        129        130        131        132        133 
## 0.14884615 0.09466667 0.09466667 0.09466667 0.07046512 0.13200000 0.07745342 
##        134        135        136        137        138        139        140 
## 0.07745342 0.14884615 0.14884615 0.05383886 0.13200000 0.13200000 0.05383886 
##        141        142        143        144        145        146        147 
## 0.14884615 0.05383886 0.07046512 0.09466667 0.13200000 0.05383886 0.07046512 
##        148        149        150        151        152        153        154 
## 0.09466667 0.14884615 0.05383886 0.09466667 0.10161290 0.14884615 0.09466667 
##        155        156        157        158        159        160        161 
## 0.14884615 0.09466667 0.10161290 0.09466667 0.09466667 0.09466667 0.07046512 
##        162        163        164        165        166        167        168 
## 0.09466667 0.07745342 0.07745342 0.09466667 0.07745342 0.07745342 0.09466667 
##        169        170        171        172        173        174        175 
## 0.07046512 0.05383886 0.07745342 0.05383886 0.09466667 0.05383886 0.07745342 
##        176        177        178        179        180        181        182 
## 0.05383886 0.05383886 0.05383886 0.07046512 0.07046512 0.07745342 0.07745342 
##        183        184        185        186        187        188        189 
## 0.07745342 0.05383886 0.09466667 0.07046512 0.05383886 0.05383886 0.07745342 
##        190        191        192        193        194        195        196 
## 0.09466667 0.10161290 0.07046512 0.07745342 0.10161290 0.07745342 0.05383886 
##        197        198        199        200        201        202        203 
## 0.07046512 0.05383886 0.07745342 0.05383886 0.05383886 0.05383886 0.07046512 
##        204        205        206        207        208        209        210 
## 0.07046512 0.09466667 0.09466667 0.09466667 0.09466667 0.13200000 0.07745342 
##        211        212        213        214        215        216        217 
## 0.09466667 0.05383886 0.09466667 0.07745342 0.07745342 0.05383886 0.05383886 
##        218        219        220        221        222        223        224 
## 0.05383886 0.05383886 0.07745342 0.07046512 0.07745342 0.05383886 0.07745342 
##        225        226        227        228        229        230        231 
## 0.07046512 0.07745342 0.05383886 0.09466667 0.09466667 0.07745342 0.05383886 
##        232        233        234        235        236        237        238 
## 0.05383886 0.05383886 0.05383886 0.09466667 0.07745342 0.05383886 0.05383886 
##        239        240        241        242        243        244        245 
## 0.05383886 0.07745342 0.07745342 0.07745342 0.05383886 0.07745342 0.07745342 
##        246        247        248        249        250        251        252 
## 0.07745342 0.07745342 0.05383886 0.07745342 0.07745342 0.07745342 0.07046512 
##        253        254        255        256        257        258        259 
## 0.09466667 0.09466667 0.09466667 0.19294118 0.07046512 0.09466667 0.07745342 
##        260        261        262        263        264        265        266 
## 0.09466667 0.07745342 0.09466667 0.09466667 0.19294118 0.07046512 0.09466667 
##        267        268        269        270        271        272        273 
## 0.13200000 0.09466667 0.13200000 0.09466667 0.19294118 0.09466667 0.07745342 
##        274        275        276        277        278        279        280 
## 0.14884615 0.07745342 0.07745342 0.07046512 0.07745342 0.07745342 0.14884615 
##        281        282        283        284        285        286        287 
## 0.07046512 0.07745342 0.05383886 0.07745342 0.05383886 0.07046512 0.19294118 
##        288        289        290        291        292        293        294 
## 0.14884615 0.07046512 0.05383886 0.05383886 0.07046512 0.10161290 0.05383886 
##        295        296        297        298        299        300        301 
## 0.05383886 0.05383886 0.05383886 0.07745342 0.09466667 0.14884615 0.07046512 
##        302        303        304        305        306        307        308 
## 0.07046512 0.07745342 0.07745342 0.05383886 0.05383886 0.07046512 0.07046512 
##        309        310        311        312        313        314        315 
## 0.07745342 0.05383886 0.07046512 0.07745342 0.05383886 0.07046512 0.09466667 
##        316        317        318        319        320        321        322 
## 0.07046512 0.05383886 0.07745342 0.07046512 0.05383886 0.07046512 0.05383886 
##        323        324        325        326        327        328        329 
## 0.07745342 0.05383886 0.05383886 0.13200000 0.05383886 0.05383886 0.05383886 
##        330        331        332        333        334        335        336 
## 0.07745342 0.07046512 0.09466667 0.09466667 0.07745342 0.07745342 0.07745342 
##        337        338        339        340        341        342        343 
## 0.05383886 0.07745342 0.09466667 0.07745342 0.05383886 0.05383886 0.05383886 
##        344        345        346        347        348        349        350 
## 0.07745342 0.09466667 0.07745342 0.09466667 0.09466667 0.09466667 0.07745342 
##        351        352        353        354        355        356        357 
## 0.09466667 0.07745342 0.09466667 0.07745342 0.05383886 0.05383886 0.05383886 
##        358        359        360        361        362        363        364 
## 0.07745342 0.05383886 0.05383886 0.07745342 0.07745342 0.07745342 0.07046512 
##        365        366        367        368        369        370        371 
## 0.07745342 0.05383886 0.07745342 0.05383886 0.19294118 0.09466667 0.10161290 
##        372        373        374        375        376        377        378 
## 0.13200000 0.14884615 0.09466667 0.05383886 0.07745342 0.07745342 0.07046512 
##        379        380        381        382        383        384        385 
## 0.07745342 0.09466667 0.07745342 0.07745342 0.09466667 0.05383886 0.07745342 
##        386        387        388        389        390        391        392 
## 0.05383886 0.09466667 0.05383886 0.07745342 0.07046512 0.05383886 0.07745342 
##        393        394        395        396        397        398        399 
## 0.07745342 0.07745342 0.05383886 0.05383886 0.07745342 0.07745342 0.07745342 
##        400        401        402        403        404        405        406 
## 0.07745342 0.05383886 0.07745342 0.05383886 0.05383886 0.07745342 0.07745342 
##        407        408        409        410        411        412        413 
## 0.05383886 0.05383886 0.09466667 0.05383886 0.05383886 0.07745342 0.05383886 
##        414        415        416        417        418        419        420 
## 0.10161290 0.13200000 0.10161290 0.05383886 0.10161290 0.07046512 0.07046512 
##        421        422        423        424        425        426        427 
## 0.07046512 0.05383886 0.10161290 0.09466667 0.07046512 0.07046512 0.14884615 
##        428        429        430        431        432        433        434 
## 0.10161290 0.05383886 0.07046512 0.05383886 0.09466667 0.09466667 0.10161290 
##        435        436        437        438        439        440        441 
## 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 0.09466667 
##        442        443        444        445        446        447        448 
## 0.07046512 0.07046512 0.07745342 0.05383886 0.10161290 0.05383886 0.07046512 
##        449        450        451        452        453        454        455 
## 0.13200000 0.05383886 0.09466667 0.07745342 0.09466667 0.14884615 0.07046512 
##        456        457        458        459        460        461        462 
## 0.05383886 0.10161290 0.10161290 0.05383886 0.07046512 0.10161290 0.19294118 
##        463        464        465        466        467        468        469 
## 0.09466667 0.09466667 0.07745342 0.07745342 0.05383886 0.05383886 0.07046512 
##        470        471        472        473        474        475        476 
## 0.09466667 0.05383886 0.09466667 0.05383886 0.07745342 0.05383886 0.10161290 
##        477        478        479        480        481        482        483 
## 0.07745342 0.07745342 0.07745342 0.07745342 0.09466667 0.09466667 0.05383886 
##        484        485        486        487        488        489        490 
## 0.05383886 0.05383886 0.05383886 0.05383886 0.07745342 0.05383886 0.09466667 
##        491        492        493        494        495        496        497 
## 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 0.05383886 
##        498        499        500        501        502        503        504 
## 0.09466667 0.05383886 0.05383886 0.07046512 0.10161290 0.07745342 0.05383886 
##        505        506        507        508        509        510        511 
## 0.10161290 0.09466667 0.07046512 0.07745342 0.14884615 0.05383886 0.05383886 
##        512        513        514        515        516        517        518 
## 0.05383886 0.10161290 0.10161290 0.09466667 0.07046512 0.19294118 0.09466667 
##        519        520        521        522        523        524        525 
## 0.13200000 0.07745342 0.07046512 0.09466667 0.14884615 0.05383886 0.07745342 
##        526        527        528        529        530        531        532 
## 0.07745342 0.14884615 0.05383886 0.14884615 0.05383886 0.07046512 0.10161290 
##        533        534        535        536        537        538        539 
## 0.10161290 0.13200000 0.07745342 0.07046512 0.14884615 0.05383886 0.07745342 
##        540        541        542        543        544        545        546 
## 0.07745342 0.09466667 0.05383886 0.05383886 0.09466667 0.09466667 0.07046512 
##        547        548        549        550        551        552        553 
## 0.05383886 0.05383886 0.05383886 0.09466667 0.09466667 0.05383886 0.09466667 
##        554        555        556        557        558        559        560 
## 0.09466667 0.09466667 0.09466667 0.05383886 0.09466667 0.07745342 0.07745342 
##        561        562        563        564        565        566        567 
## 0.05383886 0.05383886 0.07046512 0.07745342 0.07745342 0.07745342 0.09466667 
##        568        569        570        571        572        573        574 
## 0.09466667 0.05383886 0.05383886 0.09466667 0.07046512 0.05383886 0.09466667 
##        575        576        577        578        579        580        581 
## 0.09466667 0.10161290 0.07046512 0.07745342 0.07745342 0.07745342 0.05383886 
##        582        583        584        585        586        587        588 
## 0.09466667 0.09466667 0.09466667 0.07745342 0.07745342 0.07745342 0.09466667 
##        589        590        591        592        593        594        595 
## 0.05383886 0.05383886 0.13200000 0.07745342 0.07745342 0.09466667 0.05383886 
##        596        597        598        599        600        601        602 
## 0.05383886 0.05383886 0.07745342 0.07745342 0.05383886 0.05383886 0.09466667 
##        603        604        605        606        607        608        609 
## 0.05383886 0.07745342 0.07745342 0.07745342 0.05383886 0.07745342 0.07046512 
##        610        611        612        613        614        615        616 
## 0.07046512 0.05383886 0.05383886 0.09466667 0.09466667 0.09466667 0.05383886 
##        617        618        619        620        621        622        623 
## 0.09466667 0.09466667 0.09466667 0.09466667 0.05383886 0.09466667 0.07745342 
##        624        625        626        627        628        629        630 
## 0.07745342 0.05383886 0.07745342 0.05383886 0.09466667 0.09466667 0.09466667 
##        631        632        633        634        635        636        637 
## 0.09466667 0.09466667 0.09466667 0.09466667 0.07745342 0.05383886 0.14884615 
##        638        639        640        641        642        643        644 
## 0.09466667 0.05383886 0.09466667 0.07046512 0.05383886 0.07745342 0.07046512 
##        645        646        647        648        649        650        651 
## 0.14884615 0.09466667 0.07046512 0.05383886 0.05383886 0.07745342 0.07745342 
##        652        653        654        655        656        657        658 
## 0.07745342 0.05383886 0.07745342 0.09466667 0.09466667 0.07745342 0.07745342 
##        659        660        661        662        663        664        665 
## 0.05383886 0.05383886 0.07745342 0.05383886 0.13200000 0.07046512 0.05383886 
##        666        667        668        669        670        671        672 
## 0.05383886 0.07046512 0.05383886 0.07745342 0.09466667 0.07046512 0.05383886 
##        673        674        675        676        677        678        679 
## 0.05383886 0.09466667 0.05383886 0.07745342 0.05383886 0.05383886 0.07745342 
##        680        681        682        683        684        685        686 
## 0.10161290 0.19294118 0.05383886 0.07046512 0.19294118 0.05383886 0.13200000 
##        687        688        689        690        691        692 
## 0.07046512 0.13200000 0.13200000 0.09466667 0.07046512 0.10161290
# Score the held-out rows with the fitted tree, asking for the plain numeric
# prediction for each row.
predict_unseen <- predict(Eloansfit, newdata = Eloans_test, type = "vector")
# Cross-tabulate observed Default3yr values (rows) against predictions (columns).
Eloans_mat <- table(Eloans_test[["Default3yr"]], predict_unseen)
Eloans_mat
##       predict_unseen
##        0.0538388625592417 0.0704651162790697 0.0774534161490683
##   0.02                  5                  0                  3
##   0.03                 10                  0                  3
##   0.04                  3                  4                  4
##   0.05                 15                  8                 17
##   0.06                  1                  2                  5
##   0.07                  0                  0                  1
##   0.08                  0                  0                  0
##   0.09                  1                  5                  3
##   0.1                   4                  4                 11
##   0.12                  0                  1                  0
##   0.14                  0                  0                  0
##   0.16                  0                  1                  0
##   0.18                  0                  0                  0
##   0.19                  0                  1                  3
##   0.2                   0                  0                  0
##   0.22                  0                  0                  0
##   0.24                  0                  0                  0
##       predict_unseen
##        0.0946666666666665 0.101612903225806 0.132 0.148846153846154
##   0.02                  0                 0     0                 0
##   0.03                  0                 0     0                 0
##   0.04                  2                 0     0                 0
##   0.05                  4                 1     0                 0
##   0.06                  3                 0     1                 0
##   0.07                  3                 0     0                 0
##   0.08                  2                 0     0                 0
##   0.09                  6                 3     0                 0
##   0.1                   9                 1     0                 1
##   0.12                  2                 0     0                 0
##   0.14                  6                 1     1                 1
##   0.16                  0                 0     0                 1
##   0.18                  0                 0     0                 1
##   0.19                  1                 1     0                 0
##   0.2                   0                 0     0                 0
##   0.22                  0                 0     0                 1
##   0.24                  0                 0     0                 0
##       predict_unseen
##        0.192941176470588
##   0.02                 0
##   0.03                 0
##   0.04                 0
##   0.05                 0
##   0.06                 0
##   0.07                 0
##   0.08                 0
##   0.09                 0
##   0.1                  0
##   0.12                 1
##   0.14                 1
##   0.16                 1
##   0.18                 0
##   0.19                 1
##   0.2                  1
##   0.22                 0
##   0.24                 1
# Default3yr is a continuous rate and the tree returns numeric predictions, so
# an exact-match "accuracy" taken from the diagonal of the cross-tab is not
# meaningful: the table is not square and its diagonal pairs unrelated values
# (e.g. observed 0.02 with predicted 0.0538). Report regression error metrics
# on the held-out rows instead.
# NOTE(review): assumes Eloans_test$Default3yr is numeric -- confirm.
test_errors <- predict_unseen - Eloans_test$Default3yr
rmse_Test <- sqrt(mean(test_errors^2, na.rm = TRUE))
mae_Test <- mean(abs(test_errors), na.rm = TRUE)
print(paste("RMSE for test", rmse_Test))
print(paste("MAE for test", mae_Test))
## [1] "Accuracy for test 0.0751445086705202"

FUTURE WORK: Loan Default Prediction using Random Forest

A random forest using mtry = 3 and 500 trees has been initiated. Work still needs to continue on running and testing the model, and on producing the confusionMatrix and importance plots.

# Remove the institution-name column before fitting the forests below.
Eloans <- Eloans %>%
  select(-INSTNM)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
library(e1071)
# Fix the RNG seed first: a random forest draws bootstrap samples and candidate
# split variables at random, so without a seed the fit differs on every run.
set.seed(120)
# Regress Default3yr on every remaining column of Eloans. Rows with missing
# values are dropped (na.omit) and variable importance is recorded.
Eloans.rf <- randomForest(
  Default3yr ~ .,
  data = Eloans,
  mtry = 3,
  importance = TRUE,
  na.action = na.omit
)
print(Eloans.rf)
## 
## Call:
##  randomForest(formula = Default3yr ~ ., data = Eloans, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.000375702
##                     % Var explained: 79.2
#' Split a data frame into sequential train / test partitions
#'
#' The first `size` share of the rows (in their existing order, no shuffling)
#' forms the training partition; the remaining rows form the test partition.
#'
#' @param Eloans A data frame (or tibble) to split.
#' @param size Share of rows assigned to the training partition; a single
#'   number in [0, 1]. Values outside that range raise an error.
#' @param train If `TRUE`, return the training rows; otherwise the test rows.
#' @return The selected rows of `Eloans`, with all columns.
Etrain_test <- function(Eloans, size = 0.8, train = TRUE) {
  stopifnot(
    is.numeric(size), length(size) == 1L, !is.na(size),
    size >= 0, size <= 1
  )
  n_row <- nrow(Eloans)
  # Whole number of training rows. The small tolerance mirrors the numeric fuzz
  # that `1:x` applies, so a product such as 0.29 * 100 still counts as 29.
  n_train <- floor(size * n_row + 1e-7)
  # Use a logical mask rather than `1:n` with negative indexing: `1:0` is
  # c(1, 0) and `x[-integer(0), ]` selects nothing, which both give wrong
  # partitions for an empty frame or a zero-sized training share.
  is_train <- seq_len(n_row) <= n_train
  if (train) {
    Eloans[is_train, ]
  } else {
    Eloans[!is_train, ]
  }
}
# Sequential 80/20 split of Eloans: the first 80% of rows go to Etrain and the
# remainder to Etest. The dimensions of each partition are then shown.
Etrain <- Etrain_test(Eloans, size = 0.8, train = TRUE)
Etest <- Etrain_test(Eloans, size = 0.8, train = FALSE)
dim(Etrain)
## [1] 692  34
dim(Etest)
## [1] 173  34
set.seed(120)  # Make the bootstrap / feature sampling reproducible
# The previous call passed y = Etrain$STABBR, but Etrain has no STABBR column,
# so y was NULL and randomForest silently fitted an *unsupervised* forest. It
# also dropped the 5th column by position rather than the response by name.
# Model Default3yr on the training rows through the formula interface, the
# same way the Eloans.rf fit above does.
classifier_RF <- randomForest(
  Default3yr ~ .,
  data = Etrain,
  ntree = 500,
  na.action = na.omit
)
## Warning: Unknown or uninitialised column: `STABBR`.
classifier_RF
## 
## Call:
##  randomForest(x = Etrain[-5], y = Etrain$STABBR, ntree = 500) 
##                Type of random forest: unsupervised
##                      Number of trees: 500
## No. of variables tried at each split: 5

REFERENCES

  1. Data Home: College Scorecard. Data Home | College Scorecard. (n.d.). Retrieved May 11, 2022, from https://collegescorecard.ed.gov/data/

  2. 2015–16 National Postsecondary Student Aid Study (NPSAS:16): Student Financial Aid Estimates for 2015–16. (n.d.). Retrieved May 12, 2022, from https://nces.ed.gov/pubs2018/2018466.pdf

  3. Federal Student Aid. (n.d.). Retrieved May 11, 2022, from https://studentaid.gov/plus-app/parent/landing

  4. Wickham, H., & Grolemund, G. (2016). R for data science: Visualize, model, transform, tidy, and import data. O'Reilly Media.

  5. R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.

  6. [SMSS] Agresti, Alan. Statistical Methods for the Social Sciences. 5th Edition. Pearson, 2018.