# Load the loan data set from the working directory.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer turns
# text columns into factors by default, while the rest of this script (the
# factor levels shown by str()/summary(), and as.numeric() on Loan_Status)
# relies on those columns being factors.
loan.df <- read.csv("loan.csv", stringsAsFactors = TRUE)
# Number of rows and columns.
dim(loan.df)
## [1] 614 13
# Per-column overview: level counts for factors, quantiles for numerics.
summary(loan.df)
## Loan_ID Gender Married Dependents Education
## LP001002: 1 Female:117 No :214 Min. :0.0000 Graduate :480
## LP001003: 1 Male :497 Yes:400 1st Qu.:0.0000 Not Graduate:134
## LP001005: 1 Median :0.0000
## LP001006: 1 Mean :0.8274
## LP001008: 1 3rd Qu.:2.0000
## LP001011: 1 Max. :4.0000
## (Other) :608
## Self_Employed ApplicantIncome CoapplicantIncome LoanAmount
## No :509 Min. : 150 Min. : 0 Min. : 9.0
## Yes:105 1st Qu.: 2878 1st Qu.: 0 1st Qu.:100.2
## Median : 3812 Median : 1188 Median :128.0
## Mean : 5403 Mean : 1621 Mean :146.1
## 3rd Qu.: 5795 3rd Qu.: 2297 3rd Qu.:164.8
## Max. :81000 Max. :41667 Max. :700.0
##
## Loan_Amount_Term Credit_History Property_Area Loan_Status
## Min. : 12.0 Min. :0.0000 Rural :179 N:192
## 1st Qu.:360.0 1st Qu.:1.0000 Semiurban:233 Y:422
## Median :360.0 Median :1.0000 Urban :202
## Mean :339.5 Mean :0.8339
## 3rd Qu.:360.0 3rd Qu.:1.0000
## Max. :480.0 Max. :1.0000
##
library(psych)
# Descriptive statistics via describe() (psych is attached just above);
# only the first nine columns of the result are kept.
describe(loan.df)[, 1:9]
## vars n mean sd median trimmed mad min
## Loan_ID* 1 614 307.50 177.39 307.5 307.50 227.58 1
## Gender* 2 614 1.81 0.39 2.0 1.89 0.00 1
## Married* 3 614 1.65 0.48 2.0 1.69 0.00 1
## Dependents 4 614 0.83 1.14 0.0 0.62 0.00 0
## Education* 5 614 1.22 0.41 1.0 1.15 0.00 1
## Self_Employed* 6 614 1.17 0.38 1.0 1.09 0.00 1
## ApplicantIncome 7 614 5403.46 6109.04 3812.5 4292.06 1822.86 150
## CoapplicantIncome 8 614 1621.25 2926.25 1188.5 1154.85 1762.07 0
## LoanAmount 9 614 146.09 84.10 128.0 133.17 44.48 9
## Loan_Amount_Term 10 614 339.49 70.69 360.0 356.34 0.00 12
## Credit_History 11 614 0.83 0.37 1.0 0.92 0.00 0
## Property_Area* 12 614 2.04 0.79 2.0 2.05 1.48 1
## Loan_Status* 13 614 1.69 0.46 2.0 1.73 0.00 1
## max
## Loan_ID* 614
## Gender* 2
## Married* 2
## Dependents 4
## Education* 2
## Self_Employed* 2
## ApplicantIncome 81000
## CoapplicantIncome 41667
## LoanAmount 700
## Loan_Amount_Term 480
## Credit_History 1
## Property_Area* 3
## Loan_Status* 2
# Column types before recoding.
str(loan.df)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
## $ Dependents : int 0 1 0 0 0 2 0 4 2 1 ...
## $ Education : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
## $ Self_Employed : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: int 0 1508 0 2358 0 4196 1516 2504 1526 10968 ...
## $ LoanAmount : int 160 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
## $ Loan_Status : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
# Recode the 0/1 Credit_History flag into a labelled factor: 1 becomes
# "Yes" and 0 becomes "NO". The work is done on a character copy inside
# local() so no helper variable is left behind in the workspace.
loan.df$Credit_History <- local({
  flag_text <- as.character(loan.df$Credit_History)
  flag_text[flag_text == "1"] <- "Yes"
  flag_text[flag_text == "0"] <- "NO"
  factor(flag_text)
})
# Column types after recoding (Credit_History is now a factor).
str(loan.df)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
## $ Dependents : int 0 1 0 0 0 2 0 4 2 1 ...
## $ Education : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
## $ Self_Employed : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: int 0 1508 0 2358 0 4196 1516 2504 1526 10968 ...
## $ LoanAmount : int 160 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : Factor w/ 2 levels "NO","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ Property_Area : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
## $ Loan_Status : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
# One-way frequency tables. For each categorical variable: the raw counts,
# the proportions, and the proportions expressed as percentages.

# Gender ----
table1 <- table(Gender = loan.df$Gender)
table1
## Gender
## Female Male
## 117 497
prop.table(table1)
## Gender
## Female Male
## 0.1905537 0.8094463
100 * prop.table(table1)
## Gender
## Female Male
## 19.05537 80.94463

# Married ----
table2 <- table(Married = loan.df$Married)
table2
## Married
## No Yes
## 214 400
prop.table(table2)
## Married
## No Yes
## 0.3485342 0.6514658
100 * prop.table(table2)
## Married
## No Yes
## 34.85342 65.14658

# Education ----
table3 <- table(Education = loan.df$Education)
table3
## Education
## Graduate Not Graduate
## 480 134
prop.table(table3)
## Education
## Graduate Not Graduate
## 0.781759 0.218241
100 * prop.table(table3)
## Education
## Graduate Not Graduate
## 78.1759 21.8241

# Self_Employed ----
table4 <- table(Self_Employed = loan.df$Self_Employed)
table4
## Self_Employed
## No Yes
## 509 105
prop.table(table4)
## Self_Employed
## No Yes
## 0.8289902 0.1710098
100 * prop.table(table4)
## Self_Employed
## No Yes
## 82.89902 17.10098

# Credit_History ----
table5 <- table(Credit_History = loan.df$Credit_History)
table5
## Credit_History
## NO Yes
## 102 512
prop.table(table5)
## Credit_History
## NO Yes
## 0.1661238 0.8338762
100 * prop.table(table5)
## Credit_History
## NO Yes
## 16.61238 83.38762

# Property_Area ----
table6 <- table(Property_Area = loan.df$Property_Area)
table6
## Property_Area
## Rural Semiurban Urban
## 179 233 202
prop.table(table6)
## Property_Area
## Rural Semiurban Urban
## 0.2915309 0.3794788 0.3289902
100 * prop.table(table6)
## Property_Area
## Rural Semiurban Urban
## 29.15309 37.94788 32.89902
# Two-way contingency tables: Loan_Status (rows) against one categorical
# predictor (columns) at a time.

# Loan_Status by Credit_History
table7 <- xtabs(
  ~ Loan_Status + Credit_History,
  data = loan.df
)
table7
## Credit_History
## Loan_Status NO Yes
## N 95 97
## Y 7 415

# Loan_Status by Property_Area
table8 <- xtabs(
  ~ Loan_Status + Property_Area,
  data = loan.df
)
table8
## Property_Area
## Loan_Status Rural Semiurban Urban
## N 69 54 69
## Y 110 179 133

# Loan_Status by Gender
table9 <- xtabs(
  ~ Loan_Status + Gender,
  data = loan.df
)
table9
## Gender
## Loan_Status Female Male
## N 38 154
## Y 79 343

# Loan_Status by Education
table10 <- xtabs(
  ~ Loan_Status + Education,
  data = loan.df
)
table10
## Education
## Loan_Status Graduate Not Graduate
## N 140 52
## Y 340 82

# Loan_Status by Self_Employed
table11 <- xtabs(
  ~ Loan_Status + Self_Employed,
  data = loan.df
)
table11
## Self_Employed
## Loan_Status No Yes
## N 166 26
## Y 343 79
# Bar chart of Loan_Status counts.
barplot(
  table(loan.df[["Loan_Status"]]),
  main = "Loan_status distribution"
)

# Bar chart of Education counts.
barplot(table(loan.df[["Education"]]))

# Side-by-side horizontal boxplots of applicant and co-applicant income.
boxplot(
  loan.df$ApplicantIncome,
  loan.df$CoapplicantIncome,
  names = c("app income", "coapp income"),
  horizontal = TRUE,
  col = "green",
  main = "Boxplot of Applicant income",
  xlab = "Income"
)

# Histogram of LoanAmount drawn on the density scale (freq = FALSE) so the
# kernel density estimate below can be overlaid on the same axes.
hist(
  loan.df$LoanAmount,
  breaks = 30,
  freq = FALSE,
  col = "lightblue",
  main = "Histogram of loan amount",
  xlab = "Loan_amount",
  ylab = "Density"
)
# Kernel density estimate of LoanAmount with a fixed bandwidth of 10.
lines(
  density(loan.df$LoanAmount, bw = 10),
  type = "l",
  col = "darkred",
  lwd = 2
)

# Bar charts of Credit_History and Self_Employed counts.
barplot(table(loan.df[["Credit_History"]]))
barplot(table(loan.df[["Self_Employed"]]))
library(corrgram)
# Correlogram of the data set: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal; order = TRUE lets corrgram
# reorder the variables.
corrgram(
  loan.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of loan dataset"
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of Loan_Status against selected predictors, with
# histograms requested for the diagonal panels.
# NOTE(review): recent car releases document `diagonal` as TRUE/FALSE or a
# list such as list(method = "histogram") -- confirm the string form is
# accepted by the installed car version.
scatterplotMatrix(
  ~ Loan_Status + Education + Self_Employed + ApplicantIncome +
    LoanAmount + Credit_History,
  data = loan.df,
  cex = 0.6,
  diagonal = "histogram",
  main = "Loan_Status vs other variables"
)
A chi-square test between Loan_Status and Gender to determine whether loan status differs significantly between males and females.
H0: Loan_Status is independent of Gender (there is no difference in loan status between males and females).
# Pearson chi-squared test of independence on the Loan_Status x Gender
# table; correct = TRUE (the default) applies Yates' continuity correction
# for this 2 x 2 table.
chisq.test(table9, correct = TRUE)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table9
## X-squared = 0.041015, df = 1, p-value = 0.8395
As the p-value (0.8395) is > 0.05, we fail to reject the null hypothesis.
A t-test between Loan_Status and Credit_History to determine whether loan status differs significantly between applicants with and without a credit history.
H0: There is no difference in mean Loan_Status between the two Credit_History groups.
Converting the Loan_Status factor to numeric:
# Replace the Loan_Status factor with its numeric level codes so it can be
# used as the response in a t-test. This overwrites the column in loan.df.
loan.df[["Loan_Status"]] <- as.numeric(loan.df[["Loan_Status"]])
# Welch two-sample t-test (unequal variances, the default) comparing mean
# Loan_Status between the two Credit_History groups.
# NOTE(review): both variables are categorical; a chi-squared test on the
# Loan_Status x Credit_History table may be the more appropriate test.
t.test(
  Loan_Status ~ Credit_History,
  data = loan.df,
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: Loan_Status by Credit_History
## t = -24.285, df = 210.32, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8021447 -0.6816942
## sample estimates:
## mean in group NO mean in group Yes
## 1.068627 1.810547
As the p-value is < 0.05, we reject the null hypothesis and conclude that loan status differs significantly between the Credit_History groups.