Reading data

# Load the loan application records from the CSV export and preview the
# first six rows.
# NOTE(review): the path is absolute and user-specific, so the script only
# runs as-is on a machine that has this exact file location.
mydata <- read.csv(file = "/Users/tarahorvat/Downloads/Loan_Data.csv")

head(mydata, n = 6)
##    Loan_ID Gender Married Dependents    Education Self_Employed ApplicantIncome
## 1 LP001002   Male      No          0     Graduate            No            5849
## 2 LP001003   Male     Yes          1     Graduate            No            4583
## 3 LP001005   Male     Yes          0     Graduate           Yes            3000
## 4 LP001006   Male     Yes          0 Not Graduate            No            2583
## 5 LP001008   Male      No          0     Graduate            No            6000
## 6 LP001011   Male     Yes          2     Graduate           Yes            5417
##   CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1                 0         NA              360              1         Urban
## 2              1508        128              360              1         Rural
## 3                 0         66              360              1         Urban
## 4              2358        120              360              1         Urban
## 5                 0        141              360              1         Urban
## 6              4196        267              360              1         Urban
##   Loan_Status
## 1           Y
## 2           N
## 3           Y
## 4           Y
## 5           Y
## 6           Y

Removing the Credit_History column and all observations with missing values

# Clean-up before the analysis:
#   1. Turn empty strings in the character columns into NA. na_if() is applied
#      column by column because passing a whole data frame to na_if() is
#      rejected by dplyr >= 1.1.0.
#   2. Drop Credit_History by name (it is the 11th column of the raw data)
#      instead of by position, so the step does not depend on column order.
#   3. Remove every row that still contains at least one NA.
# Assumes text columns were read as character (the read.csv() default since
# R 4.0) -- TODO confirm if the script must run on older R versions.
mydata <- mydata %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  select(-Credit_History) %>%
  drop_na()
head(mydata, 10)
##     Loan_ID Gender Married Dependents    Education Self_Employed
## 1  LP001003   Male     Yes          1     Graduate            No
## 2  LP001005   Male     Yes          0     Graduate           Yes
## 3  LP001006   Male     Yes          0 Not Graduate            No
## 4  LP001008   Male      No          0     Graduate            No
## 5  LP001011   Male     Yes          2     Graduate           Yes
## 6  LP001013   Male     Yes          0 Not Graduate            No
## 7  LP001014   Male     Yes         3+     Graduate            No
## 8  LP001018   Male     Yes          2     Graduate            No
## 9  LP001020   Male     Yes          1     Graduate            No
## 10 LP001024   Male     Yes          2     Graduate            No
##    ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Property_Area
## 1             4583              1508        128              360         Rural
## 2             3000                 0         66              360         Urban
## 3             2583              2358        120              360         Urban
## 4             6000                 0        141              360         Urban
## 5             5417              4196        267              360         Urban
## 6             2333              1516         95              360         Urban
## 7             3036              2504        158              360     Semiurban
## 8             4006              1526        168              360         Urban
## 9            12841             10968        349              360     Semiurban
## 10            3200               700         70              360         Urban
##    Loan_Status
## 1            N
## 2            Y
## 3            Y
## 4            Y
## 5            Y
## 6            Y
## 7            N
## 8            Y
## 9            N
## 10           Y

Data manipulations

# Rename the three run-together column names so that they use the same
# underscore-separated form as Loan_Amount_Term and Property_Area.
mydata <- rename(
  mydata,
  Applicant_Income = ApplicantIncome,
  Coapplicant_Income = CoapplicantIncome,
  Loan_Amount = LoanAmount
)

head(mydata, n = 6)
##    Loan_ID Gender Married Dependents    Education Self_Employed
## 1 LP001003   Male     Yes          1     Graduate            No
## 2 LP001005   Male     Yes          0     Graduate           Yes
## 3 LP001006   Male     Yes          0 Not Graduate            No
## 4 LP001008   Male      No          0     Graduate            No
## 5 LP001011   Male     Yes          2     Graduate           Yes
## 6 LP001013   Male     Yes          0 Not Graduate            No
##   Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term
## 1             4583               1508         128              360
## 2             3000                  0          66              360
## 3             2583               2358         120              360
## 4             6000                  0         141              360
## 5             5417               4196         267              360
## 6             2333               1516          95              360
##   Property_Area Loan_Status
## 1         Rural           N
## 2         Urban           Y
## 3         Urban           Y
## 4         Urban           Y
## 5         Urban           Y
## 6         Urban           Y

Description of variables

Descriptive Statistics

# Descriptive statistics (rounded to two decimals) for the four numeric
# variables. The columns are selected by name rather than by position so the
# summary keeps describing the same variables if the column order changes.
numeric_vars <- c(
  "Applicant_Income", "Coapplicant_Income", "Loan_Amount", "Loan_Amount_Term"
)
round(stat.desc(mydata[, numeric_vars]), 2)
##              Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term
## nbr.val                523.00             523.00      523.00           523.00
## nbr.null                 0.00             232.00        0.00             0.00
## nbr.na                   0.00               0.00        0.00             0.00
## min                    150.00               0.00        9.00            12.00
## max                  81000.00           33837.00      650.00           480.00
## range                80850.00           33837.00      641.00           468.00
## sum                2780319.00          839152.92    76089.00        178860.00
## median                3850.00            1167.00      128.00           360.00
## mean                  5316.10            1604.50      145.49           341.99
## SE.mean                242.66             112.66        3.64             2.88
## CI.mean.0.95           476.72             221.33        7.15             5.65
## var               30797052.97         6638607.00     6926.18          4327.38
## std.dev               5549.51            2576.55       83.22            65.78
## coef.var                 1.04               1.61        0.57             0.19

DESCRIPTIVE STATISTICS FOR THE SELECTED VARIABLES

1. MEAN represents the arithmetic average of the data. It is calculated by taking the sum of the values and dividing by the number of observations.

2. MEDIAN is the middle value of a variable once its observations are sorted: half of the observations lie below it and half lie above it.

3. RANGE is the difference between the highest and the lowest value of a variable.

# Base-graphics histogram of the loan amount with fixed bins of width 25
# covering the interval from 0 to 700.
hist(
  x = mydata[["Loan_Amount"]],
  breaks = seq(0, 700, by = 25),
  main = "Distribution of Loan Amount",
  xlab = "Loan Amount in thousands, in dollars",
  ylab = "Frequency"
)

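We can see that the distribution of loan amount is skewed to the right: most loans are concentrated at lower amounts, with a long tail of larger loans. Apart from this tail, the distribution is roughly bell-shaped. From that, we can infer that the most frequent loan amount is around 100 thousand dollars.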

# Histogram of applicant income using ggplot2's default binning.
# The title names the variable that is actually plotted (Applicant_Income).
ggplot(mydata, aes(x = Applicant_Income)) +
  geom_histogram(color = "black", fill = "white") +
  ggtitle("Distribution of Applicant Income")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

There are a few outliers with a very high applicant income, so I removed the observations with an applicant income of 30,000 dollars or more.

Removing outliers (applicant income of 30,000 or more)
# Keep only the rows whose applicant income is strictly below 30000.
mydata <- mydata[mydata[["Applicant_Income"]] < 30000, , drop = FALSE]

Descriptive Statistics

# Histograms of applicant income, loan amount and loan term, placed together
# in a 2 x 2 grid (the fourth cell stays empty).

# Applicant income, default binning.
AI <- ggplot(data = mydata, mapping = aes(x = Applicant_Income)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(
    title = "Distribution of Applicant Income",
    x = "Monthly applicant income, in dollars "
  )

# Loan amount, default binning.
LA <- ggplot(data = mydata, mapping = aes(x = Loan_Amount)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(
    title = "Distribution of Loan Amount",
    x = "Loan Amount in thousands, in dollars"
  )

# Loan term, with an explicit bin width of 50.
LAT <- ggplot(data = mydata, mapping = aes(x = Loan_Amount_Term)) +
  geom_histogram(colour = "black", fill = "white", binwidth = 50) +
  labs(
    title = "Distribution of Loan Amount Term",
    x = "Term of loan in months"
  )

ggarrange(AI, LA, LAT, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Like loan amount, applicant income is skewed to the right; apart from the long right tail, its distribution is roughly bell-shaped. From that, we can infer that the most frequent applicant income is around 2,500 dollars per month. However, some applicants have a monthly income of more than 20,000 dollars. The third graph shows that the typical loan term is 360 months, which is 30 years. From this I can assume that the majority are applying for a housing loan.

# Scatterplot matrix of applicant income against loan amount, without the
# smoothed fit line. The two columns are selected by name rather than by
# position so the plot does not depend on the column order of mydata.
scatterplotMatrix(
  mydata[, c("Applicant_Income", "Loan_Amount")],
  smooth = FALSE
)

The graphs above show the relationship between applicant income and loan amount. There is a positive correlation between Applicant Income and Loan Amount: applicants with a higher income tend to have a higher loan amount.

# Keep only the rows whose loan amount is strictly below 300 before drawing
# the boxplots.
mydata <- mydata[mydata[["Loan_Amount"]] < 300, , drop = FALSE]

# Boxplots of the loan amount split by four categorical variables, shown
# together in a 2 x 2 grid. The x-axis text and ticks are hidden in every
# panel; the groups are identified by the fill legend.

# Loan amount by education.
E <- ggplot(data = mydata, mapping = aes(y = Loan_Amount, fill = Education)) +
  geom_boxplot(position = position_dodge(width = 1)) +
  labs(title = "Amount of Loan x Education") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

# Loan amount by gender.
G <- ggplot(data = mydata, mapping = aes(y = Loan_Amount, fill = Gender)) +
  geom_boxplot(position = position_dodge(width = 1)) +
  labs(title = "Amount of Loan x Gender") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

# Loan amount by marital status.
M <- ggplot(data = mydata, mapping = aes(y = Loan_Amount, fill = Married)) +
  geom_boxplot(position = position_dodge(width = 1)) +
  labs(title = "Amount of Loan x Marriage Status") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

# Loan amount by self-employment.
SE <- ggplot(
  data = mydata,
  mapping = aes(y = Loan_Amount, fill = Self_Employed)
) +
  geom_boxplot(position = position_dodge(width = 1)) +
  labs(title = "Amount of Loan x Self Employment") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

ggarrange(E, G, M, SE, nrow = 2, ncol = 2)

The graphs above show the relationship between the amount of loan and education, gender, marriage status and self-employment. Graduates have larger loans than non-graduates. Compared to women, men have slightly larger loans. Married and self-employed applicants also have larger loans than unmarried and not self-employed applicants. However, the differences are small and, judging from the boxplots alone, cannot be called statistically significant; we also have to take into account that there are quite a few outliers.