# Load the loan training data.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0; the str() output recorded further down shows the text columns as
# factors, so this keeps that behavior on every R version.
dftrain <- read.csv("LoanDataTrain.csv", stringsAsFactors = TRUE)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(dftrain)
}
library(psych)
# Per-column descriptive statistics (output recorded below).
describe(dftrain)
## vars n mean sd median trimmed mad min
## Loan_ID* 1 614 307.50 177.39 307.5 307.50 227.58 1
## Gender* 2 614 2.78 0.47 3.0 2.87 0.00 1
## Married* 3 614 2.64 0.49 3.0 2.68 0.00 1
## Dependents* 4 614 2.72 1.04 2.0 2.58 0.00 1
## Education* 5 614 1.22 0.41 1.0 1.15 0.00 1
## Self_Employed* 6 614 2.08 0.42 2.0 2.04 0.00 1
## ApplicantIncome 7 614 5403.46 6109.04 3812.5 4292.06 1822.86 150
## CoapplicantIncome 8 614 1621.25 2926.25 1188.5 1154.85 1762.07 0
## LoanAmount 9 592 146.41 85.59 128.0 133.14 47.44 9
## Loan_Amount_Term 10 600 342.00 65.12 360.0 358.38 0.00 12
## Credit_History 11 564 0.84 0.36 1.0 0.93 0.00 0
## Property_Area* 12 614 2.04 0.79 2.0 2.05 1.48 1
## Loan_Status* 13 614 1.69 0.46 2.0 1.73 0.00 1
## max range skew kurtosis se
## Loan_ID* 614 613 0.00 -1.21 7.16
## Gender* 3 2 -1.92 2.91 0.02
## Married* 3 2 -0.72 -1.16 0.02
## Dependents* 5 4 0.89 -0.38 0.04
## Education* 2 1 1.36 -0.15 0.02
## Self_Employed* 3 2 0.49 2.17 0.02
## ApplicantIncome 81000 80850 6.51 59.83 246.54
## CoapplicantIncome 41667 41667 7.45 83.97 118.09
## LoanAmount 700 691 2.66 10.26 3.52
## Loan_Amount_Term 480 468 -2.35 6.58 2.66
## Credit_History 1 1 -1.87 1.51 0.02
## Property_Area* 3 2 -0.07 -1.39 0.03
## Loan_Status* 2 1 -0.81 -1.35 0.02
# Number of rows and columns in the raw data.
c(nrow(dftrain), ncol(dftrain))
## [1] 614 13
# Column types plus a preview of the first values of each column.
str(object = dftrain)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
## $ Married : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
## $ Dependents : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
## $ Education : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
## $ Self_Employed : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: num 0 1508 0 2358 0 ...
## $ LoanAmount : int NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
## $ Loan_Status : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
# ---- Missing-value handling ----
# In the categorical columns a blank string marks a missing value.

# Married: blanks are folded into "No".
dftrain$Married <- factor(
  ifelse(dftrain$Married == "" | dftrain$Married == "No", "No", "Yes")
)
# Gender: blanks are folded into "Male".
dftrain$Gender <- factor(
  ifelse(dftrain$Gender == "" | dftrain$Gender == "Male", "Male", "Female")
)
# Dependents: blanks get the placeholder "5". The column is converted with
# as.character() first because ifelse() drops the factor class: handing it
# the factor directly returned the integer level codes instead of the labels,
# and the "5" placeholder then merged with the level whose code is 5.
dftrain$Dependents <- factor(
  ifelse(
    dftrain$Dependents == "",
    "5",
    as.character(dftrain$Dependents)
  )
)
# Self_Employed: blanks are folded into "No".
dftrain$Self_Employed <- factor(
  ifelse(
    dftrain$Self_Employed == "" | dftrain$Self_Employed == "No",
    "No",
    "Yes"
  )
)
# Credit_History: NA is treated as 0, then the column becomes a factor.
# NOTE(review): this puts unknown histories in the same class as 0 - confirm
# that is the intended imputation.
dftrain$Credit_History <- factor(
  ifelse(is.na(dftrain$Credit_History), 0, dftrain$Credit_History)
)

# Numeric columns: replace NA with the mean of the observed values.
dftrain$ApplicantIncome <- ifelse(
  is.na(dftrain$ApplicantIncome),
  mean(dftrain$ApplicantIncome, na.rm = TRUE),
  dftrain$ApplicantIncome
)
dftrain$LoanAmount <- ifelse(
  is.na(dftrain$LoanAmount),
  mean(dftrain$LoanAmount, na.rm = TRUE),
  dftrain$LoanAmount
)
dftrain$Loan_Amount_Term <- ifelse(
  is.na(dftrain$Loan_Amount_Term),
  mean(dftrain$Loan_Amount_Term, na.rm = TRUE),
  dftrain$Loan_Amount_Term
)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Bar chart of marital status, with each bar split by loan outcome.
# Axis label and title corrected from "Martial Satus" / "Martial Status".
ggplot(dftrain, aes(x = Married, fill = Loan_Status)) +
  geom_bar() +
  theme_bw() +
  labs(
    x = "Marital Status of Applicant",
    y = "Frequency",
    title = "Distribution of Marital Status",
    caption = "Data Source : Analytics Vidya"
  )
# Bar chart of credit history, with each bar split by loan outcome.
ggplot(
  data = dftrain,
  mapping = aes(x = Credit_History, fill = Loan_Status)
) +
  geom_bar() +
  theme_bw() +
  labs(
    x = "Credit History of Applicant",
    y = "Frequency",
    title = "Distribution of credit history",
    caption = "Data Source : Analytics Vidya"
  )

# Bar chart of property area, with each bar split by loan outcome.
ggplot(
  data = dftrain,
  mapping = aes(x = Property_Area, fill = Loan_Status)
) +
  geom_bar() +
  theme_bw() +
  labs(
    x = "Property Area of Applicant",
    y = "Frequency",
    title = "Distribution of Property Area",
    caption = "Data Source : Analytics Vidya"
  )
# Box plots of loan amount per self-employment group, filled by loan outcome.
ggplot(
  data = dftrain,
  mapping = aes(x = Self_Employed, y = LoanAmount, fill = Loan_Status)
) +
  geom_boxplot() +
  theme_bw() +
  labs(
    x = "Self Employed",
    y = "Loan amount",
    title = "Distribution of Loan Amount as Function of Self Employment",
    caption = "Data Source : Analytics Vidya"
  )

# Box plots of applicant income per loan outcome, filled by credit history.
ggplot(
  data = dftrain,
  mapping = aes(x = Loan_Status, y = ApplicantIncome, fill = Credit_History)
) +
  geom_boxplot() +
  theme_bw() +
  labs(
    x = "Loan Status",
    y = "Applicants income",
    title = "Distribution of Applicant's Income as function of Loan Status",
    caption = "Data Source : Analytics Vidya"
  )
# Box plots of co-applicant income per loan outcome, filled by credit history.
# The y label and title previously described the applicant's income (copied
# from the preceding plot); they now name the variable actually plotted.
ggplot(
  dftrain,
  aes(x = Loan_Status, y = CoapplicantIncome, fill = Credit_History)
) +
  geom_boxplot() +
  theme_bw() +
  labs(
    x = "Loan Status",
    y = "Co-applicant's income",
    title = "Distribution of Co-Applicant's Income as function of Loan Status",
    caption = "Data Source : Analytics Vidya"
  )
# Histogram of applicant income, filled by loan outcome.
# `bins` is an argument of geom_histogram(), not an aesthetic: inside aes()
# it was ignored and the plot fell back to the default of 30 bins.
ggplot(dftrain, aes(x = ApplicantIncome, fill = Loan_Status)) +
  geom_histogram(bins = 5) +
  theme_bw() +
  labs(
    x = "Applicants Income",
    y = "Frequency",
    title = "Distribution of Applicant's Income",
    caption = "Data Source : Analytics Vidya"
  )
# Histogram of co-applicant income, filled by loan outcome.
# No bin count is given, so geom_histogram() uses its default binning.
ggplot(
  data = dftrain,
  mapping = aes(x = CoapplicantIncome, fill = Loan_Status)
) +
  geom_histogram() +
  theme_bw() +
  labs(
    x = "Co Applicant's Income",
    y = "Frequency",
    title = "Distribution of Co-Applicant's Income",
    caption = "Data Source : Analytics Vidya"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Horizontal base-graphics box plot of co-applicant income.
boxplot(
  x = dftrain$CoapplicantIncome,
  xlab = "Co Applicant's income",
  main = "distribution of Co Applicant's income",
  horizontal = TRUE,
  col = "skyblue"
)
# Horizontal base-graphics box plot of applicant income.
boxplot(
  x = dftrain$ApplicantIncome,
  xlab = "Applicant's income",
  main = "Distribution of Applicant's income",
  horizontal = TRUE,
  col = "skyblue"
)
# One-way frequency tables for each categorical column; addmargins() appends
# the total. `dnn` sets the same dimension name that with() + table() derived.
table1 <- table(dftrain$Loan_Status, dnn = "Loan_Status")
addmargins(table1)
## Loan_Status
## N Y Sum
## 192 422 614
table2 <- table(dftrain$Married, dnn = "Married")
addmargins(table2)
## Married
## No Yes Sum
## 216 398 614
table3 <- table(dftrain$Gender, dnn = "Gender")
addmargins(table3)
## Gender
## Female Male Sum
## 112 502 614
table4 <- table(dftrain$Property_Area, dnn = "Property_Area")
addmargins(table4)
## Property_Area
## Rural Semiurban Urban Sum
## 179 233 202 614
table5 <- table(dftrain$Dependents, dnn = "Dependents")
addmargins(table5)
## Dependents
## 2 3 4 5 Sum
## 345 102 101 66 614
table6 <- table(dftrain$Education, dnn = "Education")
addmargins(table6)
## Education
## Graduate Not Graduate Sum
## 480 134 614
table7 <- table(dftrain$Self_Employed, dnn = "Self_Employed")
addmargins(table7)
## Self_Employed
## No Yes Sum
## 532 82 614
table8 <- table(dftrain$Credit_History, dnn = "Credit_History")
addmargins(table8)
## Credit_History
## 0 1 Sum
## 139 475 614
# Two-way tables of loan outcome (rows) against each categorical predictor
# (columns); addmargins() appends row and column totals.
table9 <- xtabs(~ Loan_Status + Education, data = dftrain)
addmargins(table9)
## Education
## Loan_Status Graduate Not Graduate Sum
## N 140 52 192
## Y 340 82 422
## Sum 480 134 614
table10 <- xtabs(~ Loan_Status + Gender, data = dftrain)
addmargins(table10)
## Gender
## Loan_Status Female Male Sum
## N 37 155 192
## Y 75 347 422
## Sum 112 502 614
table11 <- xtabs(~ Loan_Status + Self_Employed, data = dftrain)
addmargins(table11)
## Self_Employed
## Loan_Status No Yes Sum
## N 166 26 192
## Y 366 56 422
## Sum 532 82 614
table12 <- xtabs(~ Loan_Status + Property_Area, data = dftrain)
addmargins(table12)
## Property_Area
## Loan_Status Rural Semiurban Urban Sum
## N 69 54 69 192
## Y 110 179 133 422
## Sum 179 233 202 614
# NOTE(review): `table12` is assigned a second time here, replacing the
# Property_Area table built just above - looks like a numbering slip; confirm
# before renaming.
table12 <- xtabs(~ Loan_Status + Dependents, data = dftrain)
addmargins(table12)
## Dependents
## Loan_Status 2 3 4 5 Sum
## N 107 36 25 24 192
## Y 238 66 76 42 422
## Sum 345 102 101 66 614
table13 <- xtabs(~ Loan_Status + Credit_History, data = dftrain)
addmargins(table13)
## Credit_History
## Loan_Status 0 1 Sum
## N 95 97 192
## Y 44 378 422
## Sum 139 475 614
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of the four numeric columns.
# NOTE(review): `diagonal = "histogram"` is the string form of this argument;
# newer car releases may expect `diagonal = list(method = "histogram")` -
# confirm against the installed version.
scatterplotMatrix(
  formula = ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term +
    ApplicantIncome,
  data = dftrain,
  diagonal = "histogram"
)
library(corrgram)
# Correlogram of the data frame: shaded lower panel, pie-chart upper panel.
corrgram(
  dftrain,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Correlation Visualization"
)
# Correlation matrix of the numeric columns (positions 7 to 10).
numeric_cols <- c(
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
)
cor(dftrain[, numeric_cols])
## ApplicantIncome CoapplicantIncome LoanAmount
## ApplicantIncome 1.00000000 -0.11660458 0.56562046
## CoapplicantIncome -0.11660458 1.00000000 0.18782839
## LoanAmount 0.56562046 0.18782839 1.00000000
## Loan_Amount_Term -0.04524182 -0.05967534 0.03880147
## Loan_Amount_Term
## ApplicantIncome -0.04524182
## CoapplicantIncome -0.05967534
## LoanAmount 0.03880147
## Loan_Amount_Term 1.00000000
# Chi-squared tests of association between each categorical predictor and
# the loan outcome.
chisq.test(x = dftrain$Gender, y = dftrain$Loan_Status)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dftrain$Gender and dftrain$Loan_Status
## X-squared = 0.11088, df = 1, p-value = 0.7391
chisq.test(x = dftrain$Education, y = dftrain$Loan_Status)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dftrain$Education and dftrain$Loan_Status
## X-squared = 4.0915, df = 1, p-value = 0.0431
chisq.test(x = dftrain$Self_Employed, y = dftrain$Loan_Status)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dftrain$Self_Employed and dftrain$Loan_Status
## X-squared = 1.0223e-29, df = 1, p-value = 1
chisq.test(x = dftrain$Dependents, y = dftrain$Loan_Status)
##
## Pearson's Chi-squared test
##
## data: dftrain$Dependents and dftrain$Loan_Status
## X-squared = 3.572, df = 3, p-value = 0.3115
# Welch two-sample t-tests on pairs of numeric columns.
# NOTE(review): both tests compare the means of two different variables
# rather than one variable across the loan-outcome groups - confirm this is
# the intended comparison.
t.test(x = dftrain$CoapplicantIncome, y = dftrain$LoanAmount)
##
## Welch Two Sample t-test
##
## data: dftrain$CoapplicantIncome and dftrain$LoanAmount
## t = 12.484, df = 614.01, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1242.821 1706.846
## sample estimates:
## mean of x mean of y
## 1621.2458 146.4122
t.test(x = dftrain$Loan_Amount_Term, y = dftrain$LoanAmount)
##
## Welch Two Sample t-test
##
## data: dftrain$Loan_Amount_Term and dftrain$LoanAmount
## t = 45.782, df = 1148.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 187.2058 203.9699
## sample estimates:
## mean of x mean of y
## 342.0000 146.4122
# Recode the categorical columns to numeric-looking factor labels.
# The order of `levels` decides which category receives which label.

# Male -> 0, Female -> 1
dftrain$Gender <- factor(
  dftrain$Gender,
  levels = c("Male", "Female"),
  labels = c("0", "1")
)
# No -> 0, Yes -> 1
dftrain$Married <- factor(
  dftrain$Married,
  levels = c("No", "Yes"),
  labels = c("0", "1")
)
# Not Graduate -> 0, Graduate -> 1
dftrain$Education <- factor(
  dftrain$Education,
  levels = c("Not Graduate", "Graduate"),
  labels = c("0", "1")
)
# No -> 0, Yes -> 1
dftrain$Self_Employed <- factor(
  dftrain$Self_Employed,
  levels = c("No", "Yes"),
  labels = c("0", "1")
)
# Urban -> 1, Rural -> 2, Semiurban -> 3
dftrain$Property_Area <- factor(
  dftrain$Property_Area,
  levels = c("Urban", "Rural", "Semiurban"),
  labels = c("1", "2", "3")
)
# N -> 0, Y -> 1
dftrain$Loan_Status <- factor(
  dftrain$Loan_Status,
  levels = c("N", "Y"),
  labels = c("0", "1")
)
library(caTools) # Library to split data into training and test set to test the model
# Fix the RNG seed so the random split, and every metric computed from it,
# is the same on each run. Without it the confusion matrices below change
# from run to run.
set.seed(123)
# Logical vector: TRUE marks rows assigned to training (SplitRatio = 0.7).
split <- sample.split(dftrain$Loan_Status, SplitRatio = 0.7)
training <- subset(dftrain, split == TRUE)
test <- subset(dftrain, split == FALSE)
# Logistic regression (binomial family) of loan outcome on education,
# self-employment, both incomes, loan amount, loan term and credit history,
# fitted on the training rows.
classifier <- glm(
  Loan_Status ~ Education + Self_Employed + ApplicantIncome +
    CoapplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History,
  family = binomial(),
  data = training
)
# Keep the first 13 columns of the held-out rows.
test <- test[, 1:13]
# Predicted probabilities for the held-out rows; column 13 (the outcome) is
# dropped from the data handed to predict().
prob <- predict(classifier, type = "response", newdata = test[, -13])
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0.
pred <- ifelse(prob > 0.5, 1, 0)
# Confusion matrix: actual outcome in rows, predicted class in columns.
cm <- table(test[, 13], pred)
cm
## pred
## 0 1
## 0 27 31
## 1 9 118
library(e1071)
# SVM inputs: columns 7 to 10 of the training rows as predictors and
# column 13 (the loan outcome) as the response.
x <- training[, 7:10]
y <- training[, 13]
classifier1 <- svm(x, y)
# Predict on the same four columns of the held-out rows.
pred1 <- predict(classifier1, newdata = test[, 7:10])
# Confusion matrix: actual outcome in rows, SVM prediction in columns.
cm1 <- table(test[, 13], pred1)
cm1
## pred1
## 0 1
## 0 3 55
## 1 1 126