# Install the packages this analysis needs, but only those that are not
# already available, so re-running the script does not reinstall them.
# Package names use plain ASCII double quotes; typographic quotes are a
# syntax error in R.
required_pkgs <- c("jsonlite", "tidyverse")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

Question 1

# Load the built-in `cars` data set and report the median of its first column.
data("cars")
median(cars[[1L]])
## [1] 15

Question 2

library(jsonlite)

# CryptoCompare "histoday" endpoint; the query string requests BTC priced in
# USD with limit=99.
url <- "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=99"

# Download and parse the JSON response, then pull out the nested price table
# stored under Data -> Data of the parsed result.
btc_data <- fromJSON(txt = url)
btc_prices <- btc_data[["Data"]][["Data"]]

# Preview the first six rows of the price table.
head(btc_prices, n = 6L)
##         time     high      low     open volumefrom   volumeto    close
## 1 1763769600 85562.12 83462.76 85087.62   23440.30 1978095093 84696.98
## 2 1763856000 88106.69 84639.73 84696.98   27848.78 2411303413 86823.45
## 3 1763942400 89227.26 85242.01 86823.45   44709.41 3899541726 88288.33
## 4 1764028800 88494.81 86089.70 88288.33   32065.82 2799155557 87340.82
## 5 1764115200 90634.17 86298.47 87340.82   30484.49 2694221351 90487.28
## 6 1764201600 91934.77 90083.46 90487.28   21381.51 1949917850 91327.26
##   conversionType conversionSymbol
## 1         direct                 
## 2         direct                 
## 3         direct                 
## 4         direct                 
## 5         direct                 
## 6         direct
# Highest closing price across all rows of the downloaded price table.
max_close <- max(btc_prices[["close"]])
print(max_close)
## [1] 96945.09

Question 3

Title - Predicting Loan Default Risk Using Borrower Financial Profiles

Questions

What borrower characteristics are most associated with loan default?

Does income level significantly affect the likelihood of loan approval?

How does credit score relate to default probability?

Are certain employment types riskier than others?

Which financial variables appear most important for prediction?

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
 # Read the loan data set from the working directory and preview the first
 # six rows.
 # NOTE(review): the preview shows blank Education entries; they are read as
 # empty strings rather than NA, so NA-based cleaning will not catch them.
 # Confirm whether that is intended.
 loan_data <- read.csv(file = "loan_risk_prediction_dataset.csv")
 head(loan_data, n = 6L)
##   Age Income LoanAmount CreditScore YearsExperience Gender   Education
## 1  56  48353      31258         675              20 Female High School
## 2  69  57462      23262         586               6   Male High School
## 3  46  44219      26530         781              26   Male         PhD
## 4  32  56307      11531         549              11   Male            
## 5  60  37034      27871         500              19 Female High School
## 6  25  47886      18106         835              13   Male     Masters
##            City EmploymentType LoanApproved
## 1       Houston     Unemployed            0
## 2 San Francisco  Self-Employed            0
## 3       Houston  Self-Employed            1
## 4      New York     Unemployed            0
## 5       Chicago     Unemployed            0
## 6      New York       Salaried            1
# Compact overview of the data frame: dimensions, column types, first values.
str(object = loan_data)
## 'data.frame':    5000 obs. of  10 variables:
##  $ Age            : int  56 69 46 32 60 25 38 56 36 40 ...
##  $ Income         : num  48353 57462 44219 56307 37034 ...
##  $ LoanAmount     : num  31258 23262 26530 11531 27871 ...
##  $ CreditScore    : num  675 586 781 549 500 835 760 599 777 382 ...
##  $ YearsExperience: int  20 6 26 11 19 13 9 22 29 30 ...
##  $ Gender         : chr  "Female" "Male" "Male" "Male" ...
##  $ Education      : chr  "High School" "High School" "PhD" "" ...
##  $ City           : chr  "Houston" "San Francisco" "Houston" "New York" ...
##  $ EmploymentType : chr  "Unemployed" "Self-Employed" "Self-Employed" "Unemployed" ...
##  $ LoanApproved   : int  0 0 1 0 0 1 1 0 0 0 ...
# Per-column summary statistics, including NA counts for numeric columns.
# NOTE(review): the recorded output shows negative minima for Income and
# LoanAmount; presumably these are data-entry errors. Verify before modelling.
summary(object = loan_data)
##       Age            Income        LoanAmount      CreditScore   
##  Min.   :18.00   Min.   :-3731   Min.   :-10059   Min.   :300.0  
##  1st Qu.:31.00   1st Qu.:39608   1st Qu.: 14455   1st Qu.:433.0  
##  Median :43.00   Median :49488   Median : 19842   Median :579.0  
##  Mean   :43.58   Mean   :49738   Mean   : 19871   Mean   :575.5  
##  3rd Qu.:56.00   3rd Qu.:59917   3rd Qu.: 25327   3rd Qu.:712.0  
##  Max.   :69.00   Max.   :99146   Max.   : 48353   Max.   :849.0  
##                  NA's   :196                      NA's   :194    
##  YearsExperience    Gender           Education             City          
##  Min.   : 0.0    Length:5000        Length:5000        Length:5000       
##  1st Qu.:10.0    Class :character   Class :character   Class :character  
##  Median :20.0    Mode  :character   Mode  :character   Mode  :character  
##  Mean   :19.6                                                            
##  3rd Qu.:29.0                                                            
##  Max.   :39.0                                                            
##                                                                          
##  EmploymentType      LoanApproved   
##  Length:5000        Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.2302  
##                     3rd Qu.:0.0000  
##                     Max.   :1.0000  
## 
# Number of NA values in each column.
colSums(x = is.na(loan_data))
##             Age          Income      LoanAmount     CreditScore YearsExperience 
##               0             196               0             194               0 
##          Gender       Education            City  EmploymentType    LoanApproved 
##               0               0               0               0               0
# Drop every row that contains an NA in any column, then count the rows in
# each LoanApproved class.
loan_data <- na.omit(object = loan_data)
table(loan_data[["LoanApproved"]])
## 
##    0    1 
## 3521 1100
# Share of rows in each LoanApproved class (class balance).
prop.table(x = table(loan_data[["LoanApproved"]]))
## 
##         0         1 
## 0.7619563 0.2380437

Aggregate

# Flag which columns are numeric. vapply() guarantees a named logical vector
# even when the data frame has no columns, whereas sapply() would return an
# empty list in that case.
vapply(loan_data, is.numeric, logical(1))
##             Age          Income      LoanAmount     CreditScore YearsExperience 
##            TRUE            TRUE            TRUE            TRUE            TRUE 
##          Gender       Education            City  EmploymentType    LoanApproved 
##           FALSE           FALSE           FALSE           FALSE            TRUE
# Mean Income within each LoanApproved group.
aggregate(Income ~ LoanApproved, data = loan_data, FUN = mean)
##   LoanApproved   Income
## 1            0 47997.98
## 2            1 55072.87
# Mean CreditScore within each LoanApproved group.
aggregate(CreditScore ~ LoanApproved, data = loan_data, FUN = mean)
##   LoanApproved CreditScore
## 1            0    534.6515
## 2            1    708.7355
# Mean LoanAmount within each LoanApproved group.
aggregate(LoanAmount ~ LoanApproved, data = loan_data, FUN = mean)
##   LoanApproved LoanAmount
## 1            0   19802.15
## 2            1   20185.48

Correlation

# Keep only the numeric columns and compute their pairwise correlations.
# Single-index (list-style) subsetting always returns a data frame, so the
# result does not collapse to a bare vector when only one column is numeric;
# vapply() keeps the column mask a logical vector for any input.
numeric_data <- loan_data[vapply(loan_data, is.numeric, logical(1))]
cor(numeric_data)
##                          Age       Income   LoanAmount  CreditScore
## Age              1.000000000 -0.004988596 -0.013102997  0.002920979
## Income          -0.004988596  1.000000000  0.001155858 -0.049995094
## LoanAmount      -0.013102997  0.001155858  1.000000000  0.003068258
## CreditScore      0.002920979 -0.049995094  0.003068258  1.000000000
## YearsExperience -0.015283007 -0.015803062  0.007852464 -0.001731739
## LoanApproved    -0.011412881  0.199620003  0.020367262  0.461309176
##                 YearsExperience LoanApproved
## Age                -0.015283007 -0.011412881
## Income             -0.015803062  0.199620003
## LoanAmount          0.007852464  0.020367262
## CreditScore        -0.001731739  0.461309176
## YearsExperience     1.000000000 -0.006802128
## LoanApproved       -0.006802128  1.000000000