# load packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
# load packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
#list the files in the directory and load the datasets
#step1: list the files in the directory
list.files("/cloud/project/market_risk")
## [1] "borrowers.csv" "loans.csv" "market_risks.csv"
## [4] "questions for analysis"
#step2: load the datasets in the directory
borrowers <- read.csv("/cloud/project/market_risk/borrowers.csv")
colnames(borrowers)
## [1] "BorrowerID" "Name" "Age" "Gender"
## [5] "CreditScore" "AnnualIncome" "EmploymentStatus" "Location"
loans <- read.csv("/cloud/project/market_risk/loans.csv")
colnames(loans)
## [1] "LoanID" "BorrowerID" "LoanAmount" "InterestRate" "TermYears"
## [6] "StartDate" "EndDate" "LoanStatus" "RiskRating"
risks <- read.csv("/cloud/project/market_risk/market_risks.csv")
colnames(risks)
## [1] "RiskFactorID" "LoanID" "EconomicIndicator"
## [4] "StockIndex" "MarketVolatility" "Date"
## [7] "RiskLevel"
# What is the average loan amount borrowed by different age
# groups?
# step1: create age classification
age_classification <- cut(borrowers$Age,
breaks=c(20,35,45,55,69),
labels=c("20-35","36-45","46-55","56-69"),
include.lowest=TRUE)
# step2: create age classification column
library(dplyr) # load package
borrowers <- borrowers %>%
mutate(Age_Classification=age_classification)
# step3: join the borrowers and loans tables
borrowers_loans <- left_join(borrowers,loans,by='BorrowerID')
# step4: establish average loan by age group
loan_age <- borrowers_loans %>%
group_by(Age_Classification)%>%
summarise(avg_loan = mean(LoanAmount,na.rm=TRUE),.groups='drop')
# step5: print results
print(loan_age)
## # A tibble: 4 × 2
## Age_Classification avg_loan
## <fct> <dbl>
## 1 20-35 52298.
## 2 36-45 52341.
## 3 46-55 52955.
## 4 56-69 52758.
# How does credit score impact the interest rate offered to borrowers?
# step1:classify credit score
credit_score_classification <-cut(borrowers$CreditScore,
breaks=c(300,350,400,450,500,550,600,650,700,750,800,850,900),
labels=c("300-350","351-400","401-450","451-500","501-550","551-600","601-650","651-700","701-750","751-800","801-850","851-900"),
include.lowest = TRUE)
# step2: create credit score classification column
borrowers <- borrowers %>%
mutate(Credit_Score_Classification = credit_score_classification)
# step3: join the borrowers and loans tables
borrowers_loans <- left_join(borrowers,loans,by='BorrowerID')
# step4: calculate average interest and classify by age
interest_credit <- borrowers_loans %>%
group_by(Credit_Score_Classification)%>%
summarise(avg_interest=mean(InterestRate,na.rm=TRUE),.groups='drop')
# step5: print results
print(interest_credit)
## # A tibble: 11 × 2
## Credit_Score_Classification avg_interest
## <fct> <dbl>
## 1 300-350 4.55
## 2 351-400 4.54
## 3 401-450 4.48
## 4 451-500 4.51
## 5 501-550 4.50
## 6 551-600 4.50
## 7 601-650 4.50
## 8 651-700 4.46
## 9 701-750 4.48
## 10 751-800 4.38
## 11 801-850 4.54
# step6: visualize output
library(ggplot2)
ggplot(data=interest_credit,mapping=aes(x=Credit_Score_Classification,y=avg_interest))+
geom_point(size=4,color='red')+labs(title='Relationship between Credit Score and interest Rates')+
theme(axis.text.x = element_text(angle=45,hjust=1))

# Which employment status group has the highest delinquency?
employment_type_loan_status <- borrowers_loans %>%
filter(LoanStatus=='Delinquent')%>%
group_by(EmploymentStatus)%>%
summarise(delinquent_count=n(),.groups='drop')%>%
arrange(desc(delinquent_count))
print(employment_type_loan_status)
## # A tibble: 4 × 2
## EmploymentStatus delinquent_count
## <chr> <int>
## 1 Unemployed 649
## 2 Student 624
## 3 Employed 596
## 4 Self-Employed 584
# visualize output
ggplot(data=employment_type_loan_status,mapping=aes(x=EmploymentStatus,y=delinquent_count,fill=EmploymentStatus))+
geom_col()+labs(title='Employment Group with Highest Delinquency')+theme(axis.text.x = element_text(angle=45,hjust=1))

# Correlation between borrower’s annual income and their loan risk rating
#step1:convert risk status from string to numeric
loans <- loans %>%
mutate(NumericRiskRating = case_when(
RiskRating == 'Low' ~ 1,
RiskRating == 'Medium' ~ 2,
RiskRating == 'High' ~ 3,
TRUE ~ NA_real_ # Handle other cases or missing values
))
#step2: include new column into the borrowers loans table
borrowers_loans <- left_join(borrowers,loans,by='BorrowerID')
#step2:calculate correlation
correlation <- cor(borrowers_loans$AnnualIncome,borrowers_loans$NumericRiskRating,use='complete.obs')
print(correlation)
## [1] -0.0004545949
# What is the distribution of loan amounts across top performing locations?
top_locations <- borrowers_loans %>%
group_by(Location)%>%
summarise(total_loans=sum(LoanAmount))%>%
arrange(desc(total_loans))%>%
slice(1:10)
print(top_locations)
## # A tibble: 10 × 2
## Location total_loans
## <chr> <int>
## 1 Porterstad, TN 466854
## 2 New Mitchellshire, NH 400223
## 3 Danieltown, MA 371945
## 4 Hernandezchester, TN 359286
## 5 Port Melissa, NV 330054
## 6 Patrickville, OH 324502
## 7 West Rodney, MA 321576
## 8 West Wayne, AL 320846
## 9 South Julieton, WI 318396
## 10 West Ginabury, MO 317429
# visualize output
ggplot(data=top_locations,mapping=aes(x=Location,y=total_loans))+
geom_col(color='black',fill='lightblue')+
theme(axis.text.x = element_text(angle=45,hjust=1))+
labs(title='Distribution of Loans by Top Performing Locations')

#What are the primary factors contributing to high-risk ratings
#for loans?
#step1: join the market risk table to the loans table
risks_loans <- left_join(risks,loans,by='LoanID')
#step2: aggregate the factors leading to high risk ratings
primary_factors <- risks_loans %>%
filter(RiskRating=='High')%>%
group_by(EconomicIndicator)%>%
summarise(
indicator_count = n(),
total_volatility=sum(MarketVolatility,na.rm=TRUE)
,.groups='drop')
print(primary_factors)
## # A tibble: 4 × 3
## EconomicIndicator indicator_count total_volatility
## <chr> <int> <dbl>
## 1 GDP Growth 840 455.
## 2 Inflation Rate 843 462.
## 3 Interest Rate 799 449.
## 4 Unemployment Rate 843 462.
# visualize output
ggplot(data=primary_factors,mapping=aes(x=EconomicIndicator,y=total_volatility,fill=indicator_count))+
geom_col() + labs(title='Primary factors leading to Hight Risk Ratings')+
theme(axis.text.x = element_text(angle=45,hjust=1))

# Which loan status (e.g., Active, Delinquent, Defaulted) is most common among high-risk loans?
high_risk_loans <- loans %>%
filter(RiskRating=='High')%>%
group_by (LoanStatus)%>%
summarise(loan_status_count=n(),.groups='drop')
print(high_risk_loans)
## # A tibble: 4 × 2
## LoanStatus loan_status_count
## <chr> <int>
## 1 Active 838
## 2 Defaulted 852
## 3 Delinquent 816
## 4 Paid Off 825
# how does credit score vary by gender and employment status?
#step1: aggregate credit score and employment status
gender_score <- borrowers %>%
group_by(Gender,EmploymentStatus)%>%
summarise(
avg_score=mean(CreditScore,na.rm=TRUE),
status_count=n()
,.groups='drop')
#step2: print results
print(gender_score)
## # A tibble: 8 × 4
## Gender EmploymentStatus avg_score status_count
## <chr> <chr> <dbl> <int>
## 1 Female Employed 576. 1240
## 2 Female Self-Employed 578. 1197
## 3 Female Student 571. 1228
## 4 Female Unemployed 578. 1274
## 5 Male Employed 576. 1249
## 6 Male Self-Employed 575. 1247
## 7 Male Student 568. 1259
## 8 Male Unemployed 578. 1306
#step3: visualize output
ggplot(data=gender_score,mapping=aes(x=Gender,y=avg_score,fill=status_count))+
geom_point(color='blue',size=3) + facet_wrap(~EmploymentStatus)

# How does the risk rating distribution vary by loan term?
risk_distribution <- loans %>%
group_by(TermYears,RiskRating)%>%
summarise(risk_count=n(),.groups='drop')
print(risk_distribution)
## # A tibble: 18 × 3
## TermYears RiskRating risk_count
## <int> <chr> <int>
## 1 5 High 579
## 2 5 Low 552
## 3 5 Medium 555
## 4 7 High 530
## 5 7 Low 570
## 6 7 Medium 593
## 7 10 High 529
## 8 10 Low 533
## 9 10 Medium 551
## 10 15 High 569
## 11 15 Low 555
## 12 15 Medium 564
## 13 20 High 557
## 14 20 Low 579
## 15 20 Medium 546
## 16 30 High 567
## 17 30 Low 558
## 18 30 Medium 513
#visualize results
ggplot(risk_distribution, aes(x = as.factor(TermYears), y = risk_count, fill = RiskRating)) +
geom_bar(stat = "identity") +
labs(title = "Risk Rating Distribution by Loan Term") +
theme_minimal()

#What economic indicators are most associated with severe market volatility?
#step1:aggregate market volatility and group by economic indicators
volatile_indicators <- risks %>%
group_by (EconomicIndicator) %>%
summarise(total_volatility = sum(MarketVolatility,na.rm=TRUE),.groups='drop')%>%
arrange(desc(total_volatility))
#step2:print
print(volatile_indicators)
## # A tibble: 4 × 2
## EconomicIndicator total_volatility
## <chr> <dbl>
## 1 GDP Growth 1390.
## 2 Unemployment Rate 1389.
## 3 Inflation Rate 1367.
## 4 Interest Rate 1360.
#step3:visualize results
ggplot(data=volatile_indicators,mapping=aes(x=EconomicIndicator,y=total_volatility))+
geom_point(size=3,color ='red')

# How does market volatility affects loan status?
#step1: convert text to numeric using case function
risks_loans <- risks_loans %>%
mutate(LoanStatusNumeric = case_when(
LoanStatus == 'Active' ~ 1,
LoanStatus == 'Defaulted' ~ 2,
LoanStatus == 'Delinquent' ~ 3,
LoanStatus == 'PaidOff' ~ 4,
TRUE ~ NA_real_ # to handle missing values
))
#step2: calculate correlation
corr_data <- cor(risks_loans$LoanStatusNumeric,risks_loans$MarketVolatility,use='complete.obs')
#step3:print results
print(corr_data)
## [1] 0.007646908
#Are there particular economic indicators that are consistently associated with high-risk levels across different stock indices?
#step1:aggregate risk levels and stock indices and group by economic indicators
consistent_indicators <- risks %>%
group_by(EconomicIndicator)%>%
summarise(
risk_count = n(),
index_count = n_distinct(StockIndex)
,.groups='drop')
#step2:print results
print(consistent_indicators)
## # A tibble: 4 × 3
## EconomicIndicator risk_count index_count
## <chr> <int> <int>
## 1 GDP Growth 2533 4
## 2 Inflation Rate 2491 4
## 3 Interest Rate 2449 4
## 4 Unemployment Rate 2527 4
#Is there a seasonal pattern in market risk levels and loan defaults?
#step1: format date format to year format
library(lubridate)
risks_loans <- risks_loans %>%
mutate(Year = year(Date))
#step2: aggregate risk level and defaults
seasonal_patterns <- risks_loans %>%
filter(LoanStatus=='Defaulted')%>%
group_by(Year,RiskLevel)%>%
summarise(
risk_level_count=n(),
defaulted_count = n_distinct(LoanID)
,.groups='drop')%>%
arrange(desc(risk_level_count))
#step3: print results
print(seasonal_patterns)
## # A tibble: 24 × 4
## Year RiskLevel risk_level_count defaulted_count
## <dbl> <chr> <int> <int>
## 1 2021 Severe 134 128
## 2 2022 High 132 131
## 3 2022 Low 132 128
## 4 2020 High 131 127
## 5 2020 Low 131 128
## 6 2021 Low 128 122
## 7 2023 Low 123 122
## 8 2022 Medium 122 117
## 9 2023 Severe 122 120
## 10 2020 Severe 121 118
## # ℹ 14 more rows
#step4:visualize output
ggplot(data=seasonal_patterns,mapping=aes(x=risk_level_count,y=RiskLevel,fill=defaulted_count))+
geom_bar(stat='identity',position='stack') + facet_wrap(~Year)
