# load packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)

install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
# load packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)

install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
#list the files in the directory and load the datasets
#step1: list the files in the directory
list.files("/cloud/project/market_risk")
## [1] "borrowers.csv"          "loans.csv"              "market_risks.csv"      
## [4] "questions for analysis"
#step2: load the datasets in the directory
borrowers <- read.csv("/cloud/project/market_risk/borrowers.csv")
colnames(borrowers)
## [1] "BorrowerID"       "Name"             "Age"              "Gender"          
## [5] "CreditScore"      "AnnualIncome"     "EmploymentStatus" "Location"
loans <- read.csv("/cloud/project/market_risk/loans.csv")
colnames(loans)
## [1] "LoanID"       "BorrowerID"   "LoanAmount"   "InterestRate" "TermYears"   
## [6] "StartDate"    "EndDate"      "LoanStatus"   "RiskRating"
risks <- read.csv("/cloud/project/market_risk/market_risks.csv")
colnames(risks)
## [1] "RiskFactorID"      "LoanID"            "EconomicIndicator"
## [4] "StockIndex"        "MarketVolatility"  "Date"             
## [7] "RiskLevel"
# What is the average loan amount borrowed by different age 
# groups?

# step1: create age classification
age_classification <- cut(borrowers$Age,
                          breaks=c(20,35,45,55,69),
                          labels=c("20-35","36-45","46-55","56-69"),
                          include.lowest=TRUE)

# step2: create age classification column
library(dplyr) # load package
borrowers <- borrowers %>%
  mutate(Age_Classification=age_classification)

# step3: join the borrowers and loans tables
borrowers_loans <- left_join(borrowers,loans,by='BorrowerID')

# step4: establish average loan by age group
loan_age <- borrowers_loans %>%
  group_by(Age_Classification)%>%
  summarise(avg_loan = mean(LoanAmount,na.rm=TRUE),.groups='drop')

# step5: print results
print(loan_age)
## # A tibble: 4 × 2
##   Age_Classification avg_loan
##   <fct>                 <dbl>
## 1 20-35                52298.
## 2 36-45                52341.
## 3 46-55                52955.
## 4 56-69                52758.
# How does credit score impact the interest rate offered to borrowers?

# step1:classify credit score

credit_score_classification <-cut(borrowers$CreditScore,
                                  breaks=c(300,350,400,450,500,550,600,650,700,750,800,850,900),
                                  labels=c("300-350","351-400","401-450","451-500","501-550","551-600","601-650","651-700","701-750","751-800","801-850","851-900"),
                                  include.lowest = TRUE)

# step2: create credit score classification column
borrowers <- borrowers %>%
  mutate(Credit_Score_Classification = credit_score_classification)

# step3: join the borrowers and loans tables
borrowers_loans <- left_join(borrowers,loans,by='BorrowerID')

# step4: calculate average interest and classify by age
interest_credit <- borrowers_loans %>%
  group_by(Credit_Score_Classification)%>%
  summarise(avg_interest=mean(InterestRate,na.rm=TRUE),.groups='drop')

# step5: print results
print(interest_credit)
## # A tibble: 11 × 2
##    Credit_Score_Classification avg_interest
##    <fct>                              <dbl>
##  1 300-350                             4.55
##  2 351-400                             4.54
##  3 401-450                             4.48
##  4 451-500                             4.51
##  5 501-550                             4.50
##  6 551-600                             4.50
##  7 601-650                             4.50
##  8 651-700                             4.46
##  9 701-750                             4.48
## 10 751-800                             4.38
## 11 801-850                             4.54
# step6: visualize output
library(ggplot2)
ggplot(data=interest_credit,mapping=aes(x=Credit_Score_Classification,y=avg_interest))+
  geom_point(size=4,color='red')+labs(title='Relationship between Credit Score and interest Rates')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

# Which employment status group has the highest delinquency?
employment_type_loan_status <- borrowers_loans %>%
  filter(LoanStatus=='Delinquent')%>%
  group_by(EmploymentStatus)%>%
summarise(delinquent_count=n(),.groups='drop')%>%
  arrange(desc(delinquent_count))
print(employment_type_loan_status)
## # A tibble: 4 × 2
##   EmploymentStatus delinquent_count
##   <chr>                       <int>
## 1 Unemployed                    649
## 2 Student                       624
## 3 Employed                      596
## 4 Self-Employed                 584
# visualize output
ggplot(data=employment_type_loan_status,mapping=aes(x=EmploymentStatus,y=delinquent_count,fill=EmploymentStatus))+
  geom_col()+labs(title='Employment Group with Highest Delinquency')+theme(axis.text.x = element_text(angle=45,hjust=1))

# Correlation between borrower’s annual income and their loan risk rating
#step1:convert risk status from string to numeric
loans <- loans %>%
  mutate(NumericRiskRating = case_when(
    RiskRating == 'Low' ~ 1,
    RiskRating == 'Medium' ~ 2,
    RiskRating == 'High' ~ 3,
    TRUE ~ NA_real_ # Handle other cases or missing values
  ))
#step2: include new column into the borrowers loans table
borrowers_loans <- left_join(borrowers,loans,by='BorrowerID')

#step2:calculate correlation
correlation <- cor(borrowers_loans$AnnualIncome,borrowers_loans$NumericRiskRating,use='complete.obs')
print(correlation)
## [1] -0.0004545949
# What is the distribution of loan amounts across top performing locations?

top_locations <- borrowers_loans %>%
  group_by(Location)%>%
  summarise(total_loans=sum(LoanAmount))%>%
  arrange(desc(total_loans))%>%
  slice(1:10)
print(top_locations)
## # A tibble: 10 × 2
##    Location              total_loans
##    <chr>                       <int>
##  1 Porterstad, TN             466854
##  2 New Mitchellshire, NH      400223
##  3 Danieltown, MA             371945
##  4 Hernandezchester, TN       359286
##  5 Port Melissa, NV           330054
##  6 Patrickville, OH           324502
##  7 West Rodney, MA            321576
##  8 West Wayne, AL             320846
##  9 South Julieton, WI         318396
## 10 West Ginabury, MO          317429
# visualize output
ggplot(data=top_locations,mapping=aes(x=Location,y=total_loans))+
  geom_col(color='black',fill='lightblue')+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  labs(title='Distribution of Loans by Top Performing Locations')

#What are the primary factors contributing to high-risk ratings 
#for loans?

#step1: join the market risk table to the loans table
risks_loans <- left_join(risks,loans,by='LoanID')

#step2: aggregate the factors leading to high risk ratings
primary_factors <- risks_loans %>%
  filter(RiskRating=='High')%>%
  group_by(EconomicIndicator)%>%
  summarise(
    indicator_count = n(),
    total_volatility=sum(MarketVolatility,na.rm=TRUE)
  ,.groups='drop')
print(primary_factors)
## # A tibble: 4 × 3
##   EconomicIndicator indicator_count total_volatility
##   <chr>                       <int>            <dbl>
## 1 GDP Growth                    840             455.
## 2 Inflation Rate                843             462.
## 3 Interest Rate                 799             449.
## 4 Unemployment Rate             843             462.
# visualize output
ggplot(data=primary_factors,mapping=aes(x=EconomicIndicator,y=total_volatility,fill=indicator_count))+
  geom_col() + labs(title='Primary factors leading to Hight Risk Ratings')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

# Which loan status (e.g., Active, Delinquent, Defaulted) is most common among high-risk loans?
high_risk_loans <- loans %>%
  filter(RiskRating=='High')%>%
  group_by (LoanStatus)%>%
  summarise(loan_status_count=n(),.groups='drop')
print(high_risk_loans)
## # A tibble: 4 × 2
##   LoanStatus loan_status_count
##   <chr>                  <int>
## 1 Active                   838
## 2 Defaulted                852
## 3 Delinquent               816
## 4 Paid Off                 825
# how does credit score vary by gender and employment status?
#step1: aggregate credit score and employment status
gender_score <- borrowers %>%
  group_by(Gender,EmploymentStatus)%>%
  summarise(
    avg_score=mean(CreditScore,na.rm=TRUE),
    status_count=n()
    ,.groups='drop')
#step2: print results
print(gender_score)
## # A tibble: 8 × 4
##   Gender EmploymentStatus avg_score status_count
##   <chr>  <chr>                <dbl>        <int>
## 1 Female Employed              576.         1240
## 2 Female Self-Employed         578.         1197
## 3 Female Student               571.         1228
## 4 Female Unemployed            578.         1274
## 5 Male   Employed              576.         1249
## 6 Male   Self-Employed         575.         1247
## 7 Male   Student               568.         1259
## 8 Male   Unemployed            578.         1306
#step3: visualize output
ggplot(data=gender_score,mapping=aes(x=Gender,y=avg_score,fill=status_count))+
  geom_point(color='blue',size=3) + facet_wrap(~EmploymentStatus)

# How does the risk rating distribution vary by loan term?
risk_distribution <- loans %>%
  group_by(TermYears,RiskRating)%>%
  summarise(risk_count=n(),.groups='drop')
print(risk_distribution)
## # A tibble: 18 × 3
##    TermYears RiskRating risk_count
##        <int> <chr>           <int>
##  1         5 High              579
##  2         5 Low               552
##  3         5 Medium            555
##  4         7 High              530
##  5         7 Low               570
##  6         7 Medium            593
##  7        10 High              529
##  8        10 Low               533
##  9        10 Medium            551
## 10        15 High              569
## 11        15 Low               555
## 12        15 Medium            564
## 13        20 High              557
## 14        20 Low               579
## 15        20 Medium            546
## 16        30 High              567
## 17        30 Low               558
## 18        30 Medium            513
#visualize results
ggplot(risk_distribution, aes(x = as.factor(TermYears), y = risk_count, fill = RiskRating)) +
  geom_bar(stat = "identity") +
  labs(title = "Risk Rating Distribution by Loan Term") +
  theme_minimal()

#What economic indicators are most associated with severe market volatility?

#step1:aggregate market volatility and group by economic indicators
volatile_indicators <- risks %>%
  group_by (EconomicIndicator) %>%
  summarise(total_volatility = sum(MarketVolatility,na.rm=TRUE),.groups='drop')%>%
  arrange(desc(total_volatility))

#step2:print
print(volatile_indicators)
## # A tibble: 4 × 2
##   EconomicIndicator total_volatility
##   <chr>                        <dbl>
## 1 GDP Growth                   1390.
## 2 Unemployment Rate            1389.
## 3 Inflation Rate               1367.
## 4 Interest Rate                1360.
#step3:visualize results
ggplot(data=volatile_indicators,mapping=aes(x=EconomicIndicator,y=total_volatility))+
  geom_point(size=3,color ='red')

# How does market volatility affects loan status?

#step1: convert text to numeric using case function
risks_loans <- risks_loans %>%
  mutate(LoanStatusNumeric = case_when(
            LoanStatus == 'Active' ~ 1,
            LoanStatus == 'Defaulted' ~ 2,
            LoanStatus == 'Delinquent' ~ 3,
            LoanStatus == 'PaidOff' ~ 4,
  TRUE ~ NA_real_ # to handle missing values  
  ))
#step2: calculate correlation
corr_data <- cor(risks_loans$LoanStatusNumeric,risks_loans$MarketVolatility,use='complete.obs')

#step3:print results
print(corr_data)
## [1] 0.007646908
#Are there particular economic indicators that are consistently associated with high-risk levels across different stock indices?

#step1:aggregate risk levels and stock indices and group by economic indicators
consistent_indicators <- risks %>%
  group_by(EconomicIndicator)%>%
  summarise(
    risk_count = n(),
    index_count = n_distinct(StockIndex)
    ,.groups='drop')

#step2:print results
  print(consistent_indicators)
## # A tibble: 4 × 3
##   EconomicIndicator risk_count index_count
##   <chr>                  <int>       <int>
## 1 GDP Growth              2533           4
## 2 Inflation Rate          2491           4
## 3 Interest Rate           2449           4
## 4 Unemployment Rate       2527           4
#Is there a seasonal pattern in market risk levels and loan defaults?
#step1: format date format to year format
library(lubridate)
risks_loans <- risks_loans %>%
  mutate(Year = year(Date))

#step2: aggregate risk level and defaults
seasonal_patterns <- risks_loans %>%
  filter(LoanStatus=='Defaulted')%>%
  group_by(Year,RiskLevel)%>%
  summarise(
    risk_level_count=n(),
    defaulted_count = n_distinct(LoanID)
  ,.groups='drop')%>%
  arrange(desc(risk_level_count))
  
#step3: print results
print(seasonal_patterns)
## # A tibble: 24 × 4
##     Year RiskLevel risk_level_count defaulted_count
##    <dbl> <chr>                <int>           <int>
##  1  2021 Severe                 134             128
##  2  2022 High                   132             131
##  3  2022 Low                    132             128
##  4  2020 High                   131             127
##  5  2020 Low                    131             128
##  6  2021 Low                    128             122
##  7  2023 Low                    123             122
##  8  2022 Medium                 122             117
##  9  2023 Severe                 122             120
## 10  2020 Severe                 121             118
## # ℹ 14 more rows
#step4:visualize output
ggplot(data=seasonal_patterns,mapping=aes(x=risk_level_count,y=RiskLevel,fill=defaulted_count))+
  geom_bar(stat='identity',position='stack') + facet_wrap(~Year)