This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Reading into data frames and loading required packages
library(dplyr)
library(ggplot2)
library(rpart)
# Load the raw train/test CSVs from the project directory.
# NOTE(review): setwd() with an absolute path ties the notebook to one machine;
# it is kept because a later read.csv() call uses a path relative to it.
setwd("/home/ankit/loan_pred/")

# Empty, "NaN" and single-space cells are read as NA.
# stringsAsFactors = TRUE is spelled out because the read.csv() default became
# FALSE in R 4.0.0, while the imputation and glm() steps further down expect
# the categorical columns (including Loan_Status) to be factors.
train <- read.csv(
  "train_u6lujuX_CVtuZ9i.csv",
  na.strings = c("", "NaN", " "),
  stringsAsFactors = TRUE
)
test <- read.csv(
  "test_Y3wMUE5_7gLdaTN.csv",
  na.strings = c("", "NaN", " "),
  stringsAsFactors = TRUE
)

# Give the test frame a Loan_Status column so both frames can be stacked.
# The value is the literal string "NA" (a factor level), not a missing value,
# so is.na() does not count it.
test$Loan_Status <- as.factor("NA")

# Combining training and test set (training rows first); only columns 2
# through 13 of each frame are kept.
df.loan <- rbind(train[, 2:13], test[, 2:13])
# Missing values summary: one row per column of df.loan with the number of NA
# entries in that column, printed with the largest counts first.
Variable <- names(df.loan)
NA_count <- vapply(df.loan, function(col) sum(is.na(col)), integer(1))
miss_summ <- data.frame(Variable, NA_count, row.names = NULL)
arrange(miss_summ, desc(NA_count))
# Treatment of missing values
# Replacement values are written as plain strings rather than as.factor(...):
# a string is matched against the existing levels when the column is a factor
# and stored as-is when the column is character, whereas a factor assigned
# into a character column may be stored as its integer code instead of its
# label.
df.loan$Self_Employed[is.na(df.loan$Self_Employed)] <- "No"

# Treatment of missing values in Loan Amount Term
df.loan$Loan_Amount_Term[is.na(df.loan$Loan_Amount_Term)] <- 360

# Mean loan amount per Education x Self_Employed group (printed for reference).
# Self_Employed has already been filled above, so its NA rows are in "No".
df.loan %>%
  group_by(Education, Self_Employed) %>%
  summarise(GroupMean = mean(LoanAmount, na.rm = TRUE))

# Imputing missing loan amount using sub categories: each missing LoanAmount
# takes the mean of the observed values in its Education x Self_Employed group.
# The group means are computed from the data instead of being copied in as
# rounded constants, so they stay correct if the input files change.
group_mean <- ave(
  df.loan$LoanAmount,
  df.loan$Education,
  df.loan$Self_Employed,
  FUN = function(x) mean(x, na.rm = TRUE)
)
ind <- which(is.na(df.loan$LoanAmount))
df.loan$LoanAmount[ind] <- group_mean[ind]

# Credit History is a high impact variable, so missing entries are kept as
# their own "Not Available" category rather than being filled with an
# existing value. The column goes through character so the new label can be
# added, then back to factor.
df.loan$Credit_History <- as.character(df.loan$Credit_History)
df.loan$Credit_History[is.na(df.loan$Credit_History)] <- "Not Available"
df.loan$Credit_History <- as.factor(df.loan$Credit_History)

# Married Missing Values
df.loan$Married[is.na(df.loan$Married)] <- "Yes"

# Gender Missing Values
df.loan$Gender[is.na(df.loan$Gender)] <- "Male"

# Dependents Missing Values
df.loan$Dependents[is.na(df.loan$Dependents)] <- "0"

# Sanity check: report how many NA cells remain across the whole frame.
cat("There are total", sum(is.na(df.loan)), "missing values in the dataset")
There are total 0 missing values in the dataset
# Feature Engineering
# Combined applicant + co-applicant income, reused by both derived features.
total_income <- df.loan$ApplicantIncome + df.loan$CoapplicantIncome

# Log of the combined income.
df.loan$TotalIncome <- log(total_income)

# Log of (combined income / loan amount), scaled by the loan term relative
# to 360. This must run before LoanAmount itself is log-transformed below.
df.loan$TotalIncomeLoanRatio <- log(
  (total_income / df.loan$LoanAmount) *
    (as.numeric(df.loan$Loan_Amount_Term) / 360)
)
df.loan$LoanAmount <- log(df.loan$LoanAmount)

# The raw income columns are dropped once the derived features exist.
raw_income_cols <- c("ApplicantIncome", "CoapplicantIncome")
df.loan <- df.loan[, !(names(df.loan) %in% raw_income_cols)]

# Applying Logistic Regression Model
# df.loan was built by stacking train on top of test, so the first nrow(train)
# rows are the labelled ones. Deriving the split from nrow(train) instead of
# fixed row numbers keeps it correct if the input files change size.
is_train <- seq_len(nrow(df.loan)) <= nrow(train)
train_up <- df.loan[is_train, ]
test <- df.loan[!is_train, ]

# The response is referenced by column name so the formula is resolved
# entirely inside `data`; "." expands to every other column of train_up.
model <- glm(
  Loan_Status ~ .,
  family = binomial(link = "logit"),
  data = train_up,
  maxit = 100
)
summary(model)
Call:
glm(formula = train_up$Loan_Status ~ ., family = binomial(link = "logit"),
data = train_up, maxit = 100)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4183 -0.3598 0.5282 0.7127 2.5599
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.716037 2.579441 0.278 0.781324
GenderMale -0.059115 0.299718 -0.197 0.843644
MarriedYes 0.578602 0.253580 2.282 0.022505 *
Dependents1 -0.458598 0.294184 -1.559 0.119024
Dependents2 0.262808 0.342942 0.766 0.443479
Dependents3+ -0.040294 0.415204 -0.097 0.922691
EducationNot Graduate -0.385218 0.261564 -1.473 0.140819
Self_EmployedYes -0.003002 0.318704 -0.009 0.992484
LoanAmount 1.039985 0.939875 1.107 0.268504
Loan_Amount_Term -0.007592 0.004689 -1.619 0.105438
Credit_History1 3.951997 0.422452 9.355 < 2e-16 ***
Credit_HistoryNot Available 3.771899 0.532372 7.085 1.39e-12 ***
Property_AreaSemiurban 0.927540 0.270537 3.429 0.000607 ***
Property_AreaUrban 0.206313 0.260248 0.793 0.427921
TotalIncome -1.312866 0.936333 -1.402 0.160875
TotalIncomeLoanRatio 1.358687 0.872269 1.558 0.119317
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 762.89 on 613 degrees of freedom
Residual deviance: 558.03 on 598 degrees of freedom
AIC: 590.03
Number of Fisher Scoring iterations: 5
# Score the held-out rows with the fitted model: type = "response" returns
# predicted probabilities, which are labelled "Y" above 0.5 and "N" otherwise.
fitted_results <- ifelse(
  predict(model, newdata = test, type = "response") > 0.5,
  "Y",
  "N"
)

# The test file is read again because Loan_ID is taken from it for the
# submission.
test_up <- read.csv("test_Y3wMUE5_7gLdaTN.csv", stringsAsFactors = TRUE)

# Pair each Loan_ID with its predicted label and write the submission file.
submit <- data.frame(
  Loan_ID = test_up[["Loan_ID"]],
  Loan_Status = fitted_results
)
write.csv(
  submit,
  file = "/home/ankit/loan_pred/1405_sub_1.csv",
  row.names = FALSE
)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).