# Load the loan data set. Only the double quote is declared as a quoting
# character, so apostrophes inside values such as "Bachelor's" are read as
# ordinary text instead of opening a quoted string.
mydata <- read.table(
  file = "./loan.csv",
  sep = ",",
  quote = "\"",
  header = TRUE
)
# Show the first six rows to check that the import worked.
head(mydata, n = 6L)
## age gender occupation education_level marital_status income credit_score loan_status
## 1 32 Male Engineer Bachelor's Married 85000 720 Approved
## 2 45 Female Teacher Master's Single 62000 680 Approved
## 3 28 Male Student High School Single 25000 590 Denied
## 4 51 Female Manager Bachelor's Married 105000 780 Approved
## 5 36 Male Accountant Bachelor's Married 75000 710 Approved
## 6 24 Female Nurse Associate's Single 48000 640 Denied
Unit of observation: one loan applicant (each row describes a single individual).
Sample size: 61 observations.
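Description of the variables:
age: age of the applicant (in years)
gender: gender of the applicant (Male / Female)
occupation: occupation of the applicant (e.g. Engineer, Teacher, Nurse)
education_level: education level of the applicant (e.g. High School, Bachelor's, Master's, Doctoral)
marital_status: marital status of the applicant (e.g. Married, Single)
income: annual income of the applicant (in USD)
credit_score: credit score of the applicant
loan_status: outcome of the loan application (Approved / Denied)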
Source: Mandala, S. K. (n.d.). Simple Loan Classification Dataset. Kaggle.
Retrieved 24 March 2025, from https://www.kaggle.com/datasets/sujithmandala/simple-loan-classification-dataset
# install.packages("dplyr")
# install.packages("tidyr")
library(dplyr)
library(tidyr)
# Give every column a capitalised name (income also gets its currency as a
# suffix), then discard every row that contains a missing value.
mydata <- drop_na(
  rename(
    mydata,
    Age = age,
    Gender = gender,
    Occupation = occupation,
    Education_level = education_level,
    Marital_status = marital_status,
    Income_USD = income,
    Credit_score = credit_score,
    Loan_status = loan_status
  )
)
# Fixed exchange rate used to express the USD income in euros
# (EUR obtained for 1 USD).
USD_EUR <- 0.92
# New column: annual income converted to EUR.
mydata[["Income_EUR"]] <- USD_EUR * mydata[["Income_USD"]]
# Add a running identifier "ID", one value per row. The length is taken from
# the data itself: a hard-coded 1:61 only fits a data set with exactly 61 rows
# and breaks as soon as drop_na() removes a row or the source file changes.
mydata$ID <- seq_len(nrow(mydata))
# Move ID to the front by name, keeping all other columns in their current
# order (no reliance on ID being column 10 of 10).
mydata <- mydata[, c("ID", setdiff(names(mydata), "ID"))]
# Recode Gender as a factor with the short labels "M" and "F". The data are
# already clean, so this step is purely a demonstration of recoding.
mydata[["Gender"]] <- factor(
  mydata[["Gender"]],
  levels = c("Male", "Female"),
  labels = c("M", "F")
)
# Inspect the first six rows after all manipulations.
head(mydata, n = 6L)
## ID Age Gender Occupation Education_level Marital_status Income_USD Credit_score Loan_status Income_EUR
## 1 1 32 M Engineer Bachelor's Married 85000 720 Approved 78200
## 2 2 45 F Teacher Master's Single 62000 680 Approved 57040
## 3 3 28 M Student High School Single 25000 590 Denied 23000
## 4 4 51 F Manager Bachelor's Married 105000 780 Approved 96600
## 5 5 36 M Accountant Bachelor's Married 75000 710 Approved 69000
## 6 6 24 F Nurse Associate's Single 48000 640 Denied 44160
# Subset containing only the male applicants (Gender label "M").
mydata_M <- mydata[mydata[["Gender"]] == "M", ]
head(mydata_M, n = 6L)
## ID Age Gender Occupation Education_level Marital_status Income_USD Credit_score Loan_status Income_EUR
## 1 1 32 M Engineer Bachelor's Married 85000 720 Approved 78200
## 3 3 28 M Student High School Single 25000 590 Denied 23000
## 5 5 36 M Accountant Bachelor's Married 75000 710 Approved 69000
## 7 7 42 M Lawyer Doctoral Married 120000 790 Approved 110400
## 9 9 37 M IT Master's Married 92000 750 Approved 84640
## 11 11 55 M Consultant Master's Married 110000 770 Approved 101200
# Subset containing only applicants younger than 30.
mydata_under30 <- mydata[mydata[["Age"]] < 30, ]
head(mydata_under30, n = 6L)
## ID Age Gender Occupation Education_level Marital_status Income_USD Credit_score Loan_status Income_EUR
## 3 3 28 M Student High School Single 25000 590 Denied 23000
## 6 6 24 F Nurse Associate's Single 48000 640 Denied 44160
## 8 8 29 F Artist Bachelor's Single 38000 620 Denied 34960
## 13 13 26 M Salesman High School Single 42000 610 Denied 38640
## 16 16 27 F Designer Bachelor's Single 52000 650 Denied 47840
## 22 22 25 F Receptionist High School Single 35000 580 Denied 32200
# Summary statistics for the numeric variables only. The columns are selected
# by name rather than by negative positions, so the call keeps returning the
# same variables if columns are added, removed or reordered.
summary(mydata[, c("Age", "Income_USD", "Credit_score", "Income_EUR")])
## Age Income_USD Credit_score Income_EUR
## Min. :24.00 Min. : 25000 Min. :560.0 Min. : 23000
## 1st Qu.:30.00 1st Qu.: 52000 1st Qu.:650.0 1st Qu.: 47840
## Median :36.00 Median : 78000 Median :720.0 Median : 71760
## Mean :37.08 Mean : 78984 Mean :709.8 Mean : 72665
## 3rd Qu.:43.00 3rd Qu.: 98000 3rd Qu.:770.0 3rd Qu.: 90160
## Max. :55.00 Max. :180000 Max. :830.0 Max. :165600
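Based on the summary statistics above: the average applicant is 37.08 years old, half of the applicants earn 78,000 USD or less per year, and 75% of the applicants have a credit score of 770 or lower.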
# Six youngest applicants, showing only age, income (USD) and credit score.
# Columns are picked by name so the view does not depend on column positions.
head(mydata[order(mydata$Age), c("Age", "Income_USD", "Credit_score")])
## Age Income_USD Credit_score
## 6 24 48000 640
## 22 25 35000 580
## 58 25 32000 570
## 13 26 42000 610
## 38 26 28000 560
## 16 27 52000 650
# Compare the income of the youngest applicant with the sample mean.
avg_income <- mean(mydata$Income_USD)
# Locate the youngest applicant from the data instead of hard-coding a row
# number; which.min() returns the first row that holds the minimum age.
youngest_income <- mydata[which.min(mydata$Age), "Income_USD"]
# Relative difference to the mean, rounded to two decimals; a negative value
# means the youngest applicant earns less than the average.
difference_youngest_and_avg <- round((youngest_income / avg_income) - 1, 2)
print(difference_youngest_and_avg)
## [1] -0.39
# install.packages("psych")
library(psych)
# Descriptive statistics of annual income (USD), reported separately for each
# gender group.
describeBy(
  x = mydata[["Income_USD"]],
  group = mydata[["Gender"]]
)
##
## Descriptive statistics by group
## group: M
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 31 80677.42 26507.09 85000 81640 22239 25000 130000 105000 -0.34 -0.68 4760.81
## ------------------------------------------------------------------------------------------------------------------------------------------------------
## group: F
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 30 77233.33 40331.97 66500 71958.33 34841.1 28000 180000 152000 0.97 0.19 7363.58
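Income comparison between male and female applicants based on the given data shows that: on average, men earn slightly more than women (mean 80,677 vs. 77,233 USD), and the gap is larger at the median (85,000 vs. 66,500 USD). Women's incomes are more dispersed (standard deviation 40,332 vs. 26,507 USD) and right-skewed (skewness 0.97 vs. -0.34), driven by a few very high earners.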
library(modeest)
## Registered S3 method overwritten by 'rmutil':
## method from
## plot.residuals psych
# Most likely value (mode) of the occupation variable.
mlv(mydata[["Occupation"]])
## [1] "Engineer"
# Showing how annual incomes are distributed
# Histogram of annual income in 10,000-wide bins from 0 to 200,000.
# The USD column is plotted so that the data agree with the title; plotting
# Income_EUR under a "USD" title would mislabel the values.
hist(mydata$Income_USD,
     main = "Distribution of annual income in USD",
     xlab = "Income",
     ylab = "Frequency",
     breaks = seq(from = 0, to = 200000, by = 10000),
     col = "forestgreen")
# Histogram of applicant ages in 5-year bins covering 10 to 70.
hist(
  x = mydata[["Age"]],
  breaks = seq(10, 70, by = 5),
  col = "bisque",
  main = "Distribution of ages",
  xlab = "Age",
  ylab = "Frequency"
)
# Boxplot of annual income (USD), used to look for outliers.
boxplot(
  mydata[["Income_USD"]],
  xlab = "Annual Income (in USD)"
)
The boxplot shows two outliers at the upper end of the income distribution; they are identified and removed below.
# Six highest earners: sort by income in descending order (negated key) and
# show the top of the table.
head(mydata[order(-mydata[["Income_USD"]]), ], n = 6L)
## ID Age Gender Occupation Education_level Marital_status Income_USD Credit_score Loan_status Income_EUR
## 10 10 48 F Doctor Doctoral Married 180000 820 Approved 165600
## 48 48 48 F Doctor Doctoral Married 175000 830 Approved 161000
## 34 34 47 F Dentist Doctoral Married 140000 810 Approved 128800
## 45 45 42 M Lawyer Doctoral Married 130000 800 Approved 119600
## 18 18 41 F Pharmacist Doctoral Married 125000 800 Approved 115000
## 7 7 42 M Lawyer Doctoral Married 120000 790 Approved 110400
# Remove the income outliers. They are identified from the data with the same
# rule the boxplot uses (points beyond the whiskers, i.e. more than 1.5 times
# the interquartile range away from the box) instead of by hard-coded row
# numbers, which would silently drop the wrong rows if the data changed.
income_outliers <- boxplot.stats(mydata$Income_USD)$out
mydata_new <- mydata[!(mydata$Income_USD %in% income_outliers), ]
# Check the highest remaining incomes after the removal.
head(mydata_new[order(-mydata_new$Income_USD), ])
## ID Age Gender Occupation Education_level Marital_status Income_USD Credit_score Loan_status Income_EUR
## 34 34 47 F Dentist Doctoral Married 140000 810 Approved 128800
## 45 45 42 M Lawyer Doctoral Married 130000 800 Approved 119600
## 18 18 41 F Pharmacist Doctoral Married 125000 800 Approved 115000
## 7 7 42 M Lawyer Doctoral Married 120000 790 Approved 110400
## 56 56 50 F Professor Doctoral Married 120000 810 Approved 110400
## 23 23 46 M Banker Master's Married 115000 780 Approved 105800
# Boxplot of income (USD) after the outliers were removed.
boxplot(
  mydata_new[["Income_USD"]],
  xlab = "Income (in USD)"
)
# Boxplot of age for the full data set.
boxplot(
  mydata[["Age"]],
  xlab = "Age (in years)"
)
# install.packages("ggplot2")
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Income (USD) by loan status, with one box per gender within each status.
ggplot(
  data = mydata,
  mapping = aes(x = Loan_status, y = Income_USD, fill = Gender)
) +
  scale_fill_brewer(palette = "Spectral") +
  geom_boxplot() +
  labs(x = "Loan status", fill = "Gender")
# Used when we have 2 numerical variables and seek the relationship between them
# install.packages("car")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# Scatterplot of age against annual income, with the smoother switched off.
scatterplot(
  x = mydata[["Income_USD"]],
  y = mydata[["Age"]],
  xlab = "Annual Income (in USD)",
  ylab = "Age (in years)",
  smooth = FALSE
)
# Scatterplot of age against annual income, drawn separately for each gender.
# The grouping variable is written as a bare column name: with data = mydata
# supplied, every term in the formula is looked up in that data frame, so
# mydata$Gender is redundant and would label the legend "mydata$Gender".
scatterplot(Age ~ Income_USD | Gender,
            data = mydata,
            ylab = "Age (in years)",
            xlab = "Annual Income (in USD)",
            smooth = FALSE)
# Scatterplot matrix of age, income (USD) and credit score, with the smoother
# switched off. Columns are selected by name so the plot does not depend on
# the position of the columns in the data frame.
scatterplotMatrix(mydata[, c("Age", "Income_USD", "Credit_score")],
                  smooth = FALSE)