# Read the loan applications CSV into a data frame and preview its first rows.
mydata <- read.csv(file = "/Users/tarahorvat/Downloads/Loan_Data.csv")
head(mydata, n = 6)
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1 0 NA 360 1 Urban
## 2 1508 128 360 1 Rural
## 3 0 66 360 1 Urban
## 4 2358 120 360 1 Urban
## 5 0 141 360 1 Urban
## 6 4196 267 360 1 Urban
## Loan_Status
## 1 Y
## 2 N
## 3 Y
## 4 Y
## 5 Y
## 6 Y
# Convert empty strings in character columns to NA so that drop_na() treats
# them as missing. na_if() is applied column by column: passing the whole
# data frame to na_if() fails in dplyr >= 1.1.0, where `y` must be castable
# to the type of `x`.
# NOTE(review): assumes text columns were read as character (the read.csv
# default since R 4.0) -- confirm if an older R version is used.
mydata <- mydata %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))

# Drop Credit_History by name rather than by position (it is column 11), so
# this step cannot silently remove a different column if the layout changes.
mydata <- mydata[, names(mydata) != "Credit_History"]

# Keep only rows without any missing value, then preview the first ten rows.
mydata <- drop_na(mydata)
head(mydata, 10)
## Loan_ID Gender Married Dependents Education Self_Employed
## 1 LP001003 Male Yes 1 Graduate No
## 2 LP001005 Male Yes 0 Graduate Yes
## 3 LP001006 Male Yes 0 Not Graduate No
## 4 LP001008 Male No 0 Graduate No
## 5 LP001011 Male Yes 2 Graduate Yes
## 6 LP001013 Male Yes 0 Not Graduate No
## 7 LP001014 Male Yes 3+ Graduate No
## 8 LP001018 Male Yes 2 Graduate No
## 9 LP001020 Male Yes 1 Graduate No
## 10 LP001024 Male Yes 2 Graduate No
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Property_Area
## 1 4583 1508 128 360 Rural
## 2 3000 0 66 360 Urban
## 3 2583 2358 120 360 Urban
## 4 6000 0 141 360 Urban
## 5 5417 4196 267 360 Urban
## 6 2333 1516 95 360 Urban
## 7 3036 2504 158 360 Semiurban
## 8 4006 1526 168 360 Urban
## 9 12841 10968 349 360 Semiurban
## 10 3200 700 70 360 Urban
## Loan_Status
## 1 N
## 2 Y
## 3 Y
## 4 Y
## 5 Y
## 6 Y
## 7 N
## 8 Y
## 9 N
## 10 Y
# Give the income and loan-amount columns underscore-separated names, in line
# with Loan_Amount_Term; rename() takes new_name = old_name pairs.
mydata <- rename(
  mydata,
  Applicant_Income = ApplicantIncome,
  Coapplicant_Income = CoapplicantIncome,
  Loan_Amount = LoanAmount
)
head(mydata)
## Loan_ID Gender Married Dependents Education Self_Employed
## 1 LP001003 Male Yes 1 Graduate No
## 2 LP001005 Male Yes 0 Graduate Yes
## 3 LP001006 Male Yes 0 Not Graduate No
## 4 LP001008 Male No 0 Graduate No
## 5 LP001011 Male Yes 2 Graduate Yes
## 6 LP001013 Male Yes 0 Not Graduate No
## Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term
## 1 4583 1508 128 360
## 2 3000 0 66 360
## 3 2583 2358 120 360
## 4 6000 0 141 360
## 5 5417 4196 267 360
## 6 2333 1516 95 360
## Property_Area Loan_Status
## 1 Rural N
## 2 Urban Y
## 3 Urban Y
## 4 Urban Y
## 5 Urban Y
## 6 Urban Y
# Descriptive statistics for the four numeric variables (columns 7 to 10),
# rounded to two decimal places.
mydata[, c(
  "Applicant_Income",
  "Coapplicant_Income",
  "Loan_Amount",
  "Loan_Amount_Term"
)] %>%
  stat.desc() %>%
  round(digits = 2)
## Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term
## nbr.val 523.00 523.00 523.00 523.00
## nbr.null 0.00 232.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00
## min 150.00 0.00 9.00 12.00
## max 81000.00 33837.00 650.00 480.00
## range 80850.00 33837.00 641.00 468.00
## sum 2780319.00 839152.92 76089.00 178860.00
## median 3850.00 1167.00 128.00 360.00
## mean 5316.10 1604.50 145.49 341.99
## SE.mean 242.66 112.66 3.64 2.88
## CI.mean.0.95 476.72 221.33 7.15 5.65
## var 30797052.97 6638607.00 6926.18 4327.38
## std.dev 5549.51 2576.55 83.22 65.78
## coef.var 1.04 1.61 0.57 0.19
DESCRIPTIVE STATISTICS FOR THE SELECTED VARIABLES
1. MEAN represents the arithmetic average of the data. It is calculated by taking the sum of the values and dividing by the number of observations.
2. MEDIAN is the middle value of a variable when its observations are sorted from lowest to highest.
3. RANGE is the difference between the highest and the lowest value of a variable in the data.
# Histogram of loan amounts, using bins of width 25 over the range 0 to 700.
hist(
  mydata$Loan_Amount,
  breaks = seq(0, 700, by = 25),
  main = "Distribution of Loan Amount",
  xlab = "Loan Amount in thousands, in dollars",
  ylab = "Frequency"
)
We can see that the distribution of loan amount is unimodal and skewed to the right; apart from its long right tail, it is roughly bell-shaped. From the histogram, we can infer that the most frequent loan amount is around 100 thousand dollars.
# Histogram of applicant income before any outlier removal.
# The title and x label name the plotted variable (Applicant_Income); the
# number of bins is set explicitly to ggplot2's default of 30 so the plot is
# unchanged while the `stat_bin()` message is no longer emitted.
ggplot(mydata, aes(x = Applicant_Income)) +
  geom_histogram(bins = 30, color = "black", fill = "white") +
  ggtitle("Distribution of Applicant Income") +
  xlab("Monthly applicant income, in dollars")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There are some outliers, so I removed the observations with an applicant income of 30000 or more.
# Keep only the rows whose applicant income is below 30000.
mydata <- mydata[mydata$Applicant_Income < 30000, ]

# Histogram of applicant income (default binning).
AI <- ggplot(mydata, aes(x = Applicant_Income)) +
  geom_histogram(color = "black", fill = "white") +
  labs(
    title = "Distribution of Applicant Income",
    x = "Monthly applicant income, in dollars "
  )

# Histogram of loan amount (default binning).
LA <- ggplot(mydata, aes(x = Loan_Amount)) +
  geom_histogram(color = "black", fill = "white") +
  labs(
    title = "Distribution of Loan Amount",
    x = "Loan Amount in thousands, in dollars"
  )

# Histogram of loan term, with bins 50 wide.
LAT <- ggplot(mydata, aes(x = Loan_Amount_Term)) +
  geom_histogram(color = "black", fill = "white", binwidth = 50) +
  labs(
    title = "Distribution of Loan Amount Term",
    x = "Term of loan in months"
  )

# Show the three histograms together on a 2 x 2 grid.
ggarrange(AI, LA, LAT, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
As with loan amount, the distribution of applicant income is skewed to the right; apart from its long right tail, it is roughly bell-shaped. From the histogram, we can infer that the most frequent applicant income is around 2500 dollars per month. However, some applicants have a monthly income of more than 20000 dollars. The third graph shows that most loans have a term of approximately 360 months, which is 30 years. From this I can assume that the majority of applicants are applying for a housing loan.
# Scatterplot matrix of applicant income (column 7) against loan amount
# (column 9), drawn without a smoother.
scatterplotMatrix(
  mydata[, c("Applicant_Income", "Loan_Amount")],
  smooth = FALSE
)
The graphs above show the relationship between applicant income and loan amount. It is evident that there is a positive correlation between Applicant Income and Loan Amount: applicants with a higher income tend to have a higher loan amount.
# Keep only the rows whose loan amount is below 300 before drawing boxplots.
# NOTE(review): the reason for this cut-off is not stated here -- presumably
# it trims large loan amounts as outliers; confirm.
mydata <- mydata[mydata$Loan_Amount < 300, ]

# Shared theme that hides the x-axis text and ticks; each plot has a single
# box group per fill level, so the x axis carries no information.
hide_x_axis <- theme(
  axis.text.x = element_blank(),
  axis.ticks.x = element_blank()
)

# Loan amount by education level.
E <- ggplot(mydata, aes(y = Loan_Amount, fill = Education)) +
  geom_boxplot(position = position_dodge(1)) +
  labs(title = "Amount of Loan x Education") +
  hide_x_axis

# Loan amount by gender.
G <- ggplot(mydata, aes(y = Loan_Amount, fill = Gender)) +
  geom_boxplot(position = position_dodge(1)) +
  labs(title = "Amount of Loan x Gender") +
  hide_x_axis

# Loan amount by marital status.
M <- ggplot(mydata, aes(y = Loan_Amount, fill = Married)) +
  geom_boxplot(position = position_dodge(1)) +
  labs(title = "Amount of Loan x Marriage Status") +
  hide_x_axis

# Loan amount by self-employment status.
SE <- ggplot(mydata, aes(y = Loan_Amount, fill = Self_Employed)) +
  geom_boxplot(position = position_dodge(1)) +
  labs(title = "Amount of Loan x Self Employment") +
  hide_x_axis

# Show the four boxplots together on a 2 x 2 grid.
ggarrange(E, G, M, SE, nrow = 2, ncol = 2)
The graphs above show the relationship between loan amount and education, gender, marital status, and self-employment. Graduates have larger loan amounts than non-graduates. Compared to women, men have slightly larger loan amounts. Married and self-employed applicants also have larger loan amounts than unmarried applicants and applicants who are not self-employed. However, the differences are not large, and we have to take into account that there are quite a few outliers.