METODE STATISTIKA

~ Ujian Tengah Semester ~

NIM 20205520004
Prodi Teknik Informatika
Email
RPubs https://rpubs.com/sabrinayose/
Github https://github.com/sabrinayose/

Tugas 1

Lakukan proses persiapan data dengan R dan Python, dengan beberapa langkah berikut:

1.1 Import Data

library(zoo)
# Read the loan training data from the working directory into `df`
# and display the resulting data frame.
df <- read.csv("loan-train.csv")   
df

1.2 Penanganan Data Hilang

# Count the NA entries in every column of `df`.
colSums(is.na(df))              
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                14                50                 0 
##       Loan_Status 
##                 0

1.2.1 Dengan Cara Menghapus

# Listwise deletion: keep only the rows of `df` without any NA, then
# print the per-column NA counts of the reduced data frame as a check.
df_rm <- na.omit(df)             
print(colSums(is.na(df_rm))) 
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                 0                 0                 0                 0 
##       Loan_Status 
##                 0

1.2.2 Input Mean/Modus/Median

Mean

# Mean imputation: every missing LoanAmount is replaced by the mean of the
# observed LoanAmount values; the NA counts are then shown per column.
df <- read.csv("loan-train.csv")
df$LoanAmount <- replace(
  df$LoanAmount,
  is.na(df$LoanAmount),
  mean(df$LoanAmount, na.rm = TRUE)
)
colSums(is.na(df))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                 0                14                50                 0 
##       Loan_Status 
##                 0

Mode

# Mode imputation: fill the missing Loan_Amount_Term values with the most
# frequent observed term.
# Base R's mode() returns the storage mode of an object (the string
# "numeric" here), not the statistical mode, so the most frequent value is
# computed explicitly from the tallies of the distinct observed values.
df <- read.csv("loan-train.csv")
observed_terms <- df$Loan_Amount_Term[!is.na(df$Loan_Amount_Term)]
distinct_terms <- unique(observed_terms)
term_mode <- distinct_terms[
  which.max(tabulate(match(observed_terms, distinct_terms)))
]
df$Loan_Amount_Term[is.na(df$Loan_Amount_Term)] <- term_mode
colSums(is.na(df))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                 0                50                 0 
##       Loan_Status 
##                 0

1.2.3 Interpolasi Linier

# Linear interpolation of the missing Credit_History values (zoo::na.approx).
# na.rm = FALSE keeps the result the same length as the column; with the
# default, leading or trailing NAs are dropped and the assignment back into
# the data frame would fail with a length mismatch.
df <- read.csv("loan-train.csv")
df$Credit_History <- na.approx(df$Credit_History, na.rm = FALSE)
colSums(is.na(df))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                14                 0                 0 
##       Loan_Status 
##                 0

1.2.4 Forward Filling

# Forward fill: carry the last observed Loan_Amount_Term down into each NA.
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first call.
library(tidyr)
library(dplyr)

df <- read.csv("loan-train.csv")
df <- df %>% fill(Loan_Amount_Term, .direction = "down")
colSums(is.na(df))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                 0                50                 0 
##       Loan_Status 
##                 0

1.2.5 Backward Filling

# Backward fill: copy the next observed LoanAmount up into each NA.
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first call.
library(tidyr)
library(dplyr)

df <- read.csv("loan-train.csv")
df <- df %>% fill(LoanAmount, .direction = "up")
colSums(is.na(df))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                 0                14                50                 0 
##       Loan_Status 
##                 0

1.3 Periksa Data Duplikat

df <- read.csv("loan-train.csv")

# Duplicate check on whole rows: duplicated() flags every row that repeats an
# earlier row exactly. Counting the levels of a single categorical column
# (as done previously with Self_Employed) only lists its categories and says
# nothing about duplicated records.
sum(duplicated(df))
df[duplicated(df), ]

1.4 Pemisahan Data Kategori dan Numerik

# Show the numeric columns and the character columns of `df` separately.
df[vapply(df, is.numeric, logical(1))]
df[vapply(df, is.character, logical(1))]

1.5 Penanganan Data Numerik

1.5.1 Standardisasi

# Standardisation (z-score): centre on the mean and divide by the standard
# deviation.
df <- read.csv("loan-train.csv")
# Start from the freshly read data; re-omitting the previous df_rm left the
# read above unused and made this chunk depend on earlier chunks.
df_rm <- na.omit(df)

# scale() returns a one-column matrix; as.numeric() stores a plain vector.
df_rm$LoanAmount_scaled <- as.numeric(scale(df_rm$LoanAmount))
df_rm$Loan_Amount_Term_scaled <- as.numeric(scale(df_rm$Loan_Amount_Term))

df_rm

1.5.2 Normalisasi

# Reload the raw loan data into `df` for the normalisation step.
df <- read.csv("loan-train.csv")   

# Min-max normalisation: rescale `x` linearly so that its minimum maps to 0
# and its maximum maps to 1.
#
# x      numeric vector.
# na.rm  if TRUE, NAs are ignored when finding the range (they stay NA in the
#        result); with the default FALSE any NA makes the whole result NA.
#
# Returns a numeric vector of the same length as `x`. A constant vector has
# zero range and therefore yields NaN.
normalize <- function(x, na.rm = FALSE) {
  bounds <- range(x, na.rm = na.rm)
  # Both differences need their own parentheses: without them, operator
  # precedence evaluates x - (min / max) - min instead of the ratio.
  (x - bounds[1]) / (bounds[2] - bounds[1])
}

# Drop incomplete rows from the freshly read data (not from the df_rm left
# over by an earlier chunk) and add the min-max normalised columns.
df_rm <- na.omit(df)

df_rm$LoanAmount_norm <- normalize(df_rm$LoanAmount)
df_rm$Loan_Amount_Term_norm <- normalize(df_rm$Loan_Amount_Term)

df_rm

1.5.3 Penskalaan Robust

# Reload the raw loan data into `df` for the robust-scaling step.
df <- read.csv("loan-train.csv")

# Robust scaling: shift `x` by its first quartile and divide by the
# interquartile range (Q3 - Q1), so the result is driven by the middle half
# of the data rather than by extreme values.
#
# x  numeric vector without NA (quantile() stops on missing values).
#
# Returns a numeric vector of the same length as `x`.
robust <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), names = FALSE)
  # The numerator needs its own parentheses: without them, operator
  # precedence evaluates x - (Q1 / IQR) instead of (x - Q1) / IQR.
  (x - quartiles[1]) / (quartiles[2] - quartiles[1])
}

# Drop incomplete rows from the freshly read data (not from the df_rm left
# over by an earlier chunk) and add the robust-scaled columns.
df_rm <- na.omit(df)

df_rm$LoanAmount_robust <- robust(df_rm$LoanAmount)
df_rm$Loan_Amount_Term_robust <- robust(df_rm$Loan_Amount_Term)

df_rm

1.6 Penanganan Data Pencilan

1.6.1 Metode Statistik

Distribusi Gaussian

# Reload the raw loan data into `df` for the outlier-detection step.
df <- read.csv("loan-train.csv") 

# Flag the values of `x` that lie more than `n_sd` standard deviations away
# from the mean.
#
# x     numeric vector.
# n_sd  width of the accepted band in standard deviations; the default 1
#       keeps the previous hard-coded cut-off.
#
# Returns a logical vector of the same length as `x` (TRUE = outlier).
# The comparison is vectorised with `|`, so an empty input yields logical(0)
# instead of the empty list that sapply() produced.
outliers <- function(x, n_sd = 1) {
  centre <- mean(x)
  cut_off <- sd(x) * n_sd
  x < centre - cut_off | x > centre + cut_off
}

# Drop incomplete rows from the freshly read data (not from the df_rm left
# over by an earlier chunk), then list the rows whose LoanAmount is flagged.
df_rm <- na.omit(df)

df_rm[outliers(df_rm$LoanAmount), ]

Boxplot atau Rentang Interkuartil (IQR)

# Box plot of Loan_Amount_Term; points drawn beyond the whiskers are the
# candidates for outliers.
df <- read.csv(file = "loan-train.csv")
boxplot(df[["Loan_Amount_Term"]])

1.7 Penanganan Data Kategorikal

# Read the raw data once (it was re-read before every statement), then show
# its dimensions, its first rows and the NA counts of the character columns.
df <- read.csv("loan-train.csv")
dim(df)
## [1] 614  13
head(df, 5)
df_char <- Filter(is.character, df)
# is.na() only detects NA; empty strings in character columns are not counted.
colSums(is.na(df_char))
##       Loan_ID        Gender       Married    Dependents     Education 
##             0             0             0             0             0 
## Self_Employed Property_Area   Loan_Status 
##             0             0             0

1.7.1 Pelabelan

df <- read.csv("loan-train.csv")
library(superml)
df_label <- LabelEncoder$new()

# Label encoding turns category labels into integer codes, so it is applied
# to categorical columns only. LoanAmount is a numeric measurement and is
# left untouched; Gender is encoded in its place.
df$Gender <- df_label$fit_transform(df$Gender)

df$Married <- df_label$fit_transform(df$Married)

df$Education <- df_label$fit_transform(df$Education)

df

1.7.2 Pemetaan Kustom

df <- read.csv("loan-train.csv")

# Custom mapping of Loan_Status: "Y" -> 1, "N" -> 0.
# Assigning 1/0 into the character column stores the strings "1"/"0"; the
# named lookup vector produces a genuine integer column instead. Any value
# other than "Y"/"N" becomes NA.
status_codes <- c(Y = 1L, N = 0L)
df$Loan_Status <- unname(status_codes[df$Loan_Status])

df

1.7.3 Variabel Dummy

library(fastDummies)
df <- read.csv("loan-train.csv")

# One-hot encode only the genuine categorical columns. Without
# select_columns, dummy_cols() expands every character column, which
# includes the identifier Loan_ID and creates one dummy per loan.
categorical_cols <- c(
  "Gender", "Married", "Dependents", "Education",
  "Self_Employed", "Property_Area", "Loan_Status"
)
df <- dummy_cols(df, select_columns = categorical_cols)
df

1.7.4 K-fold Cross-Validation

library(tidyverse)
library(caret)
library(ISLR)
library(lattice)

# 10-fold cross-validation of a linear model for Credit_History.
df <- read.csv("loan-train.csv")

df$Loan_Status[df$Loan_Status=="Y"] <- 1
df$Loan_Status[df$Loan_Status=="N"] <- 0

df1 <- na.omit(df)
set.seed(100)   # fixed seed so the fold assignment is reproducible
dataset <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# Loan_ID is a per-row identifier: as a predictor it contributes one dummy
# per loan and carries no information about held-out rows, so it is excluded
# from the model data.
# NOTE: re-knit after this change to refresh the results printed below.
nb_fit <- train(
  Credit_History ~ .,
  data = df1[, setdiff(names(df1), "Loan_ID")],
  method = "lm",
  trControl = dataset,
  tuneLength = 14
)
nb_fit
## Linear Regression 
## 
## 529 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 476, 476, 476, 476, 476, 477, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.4396691  NaN       0.2230044
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Tugas 2

Lakukan Proses Visualisasi Data dengan menggunakan R dan Python dengan beberapa langkah berikut:

2.1 Visualisasi Univariabel

2.1.1 Categorical

Bar Chart

library(ggplot2)                                      
# Bar chart: number of loans per property area.
df <- read.csv(file = "loan-train.csv")

ggplot(data = df, mapping = aes(x = Property_Area)) +
  geom_bar(color = "azure4", fill = "cornflowerblue") +
  labs(
    title = "Loan Train by Property Area",
    x = "Property Area",
    y = "Frequency"
  ) +
  theme_minimal()

Pie Chart

library(dplyr)                                  
library(ggplot2)                                 
library(scales)                                      
# Data preparation: percentage share of each property area, plus the
# cumulative mid-point of each slice where its label is drawn.
plotdata <- df %>%
  count(Property_Area) %>%
  arrange(desc(Property_Area)) %>%
  mutate(
    prop = round(100 * n / sum(n), 1),
    lab.ypos = cumsum(prop) - prop / 2
  )

# Pie chart: a single stacked bar bent into a circle with polar coordinates.
mycols <- c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF")
ggplot(data = plotdata, mapping = aes(x = "", y = prop, fill = Property_Area)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  geom_text(mapping = aes(y = lab.ypos, label = prop), color = "white") +
  scale_fill_manual(values = mycols) +
  theme_void() +
  labs(title = "Loan Train by Property Area")

Tree Map

library(ggplot2)
library(treemapify)                             
library(scales)                                      
# Tree map: one tile per property area, tile area proportional to its count.
plotdata <- count(df, Property_Area)
ggplot(data = plotdata, mapping = aes(area = n, fill = Property_Area)) +
  geom_treemap() +
  labs(title = "Loan Train by Property Area")

2.1.2 Continuous

Histogram

library(ggplot2)
# Histogram of LoanAmount in 20 bins. The title now names the plotted
# variable; it previously repeated the Property Area title of other charts.
ggplot(df, aes(x = LoanAmount)) +
  geom_histogram(fill = "#C04343", color = "white", bins = 20) +
  theme_minimal() +
  labs(title = "Distribution of Loan Amount", x = "Loan Amount")

Kernel Density Plot

library(ggplot2)
# Kernel density estimate of LoanAmount. A density curve needs a continuous
# variable; Property_Area is a character column, so the continuous
# LoanAmount is plotted in this "Continuous" section instead.
ggplot(df, aes(x = LoanAmount)) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(title = "Density of Loan Amount", x = "Loan Amount")

Dot Chart

library(ggplot2)         
# Dot plot of LoanAmount: each dot is one observation, stacked within bins.
# The previous version mapped Dependents (a character column) while the
# title and axis label announced "Loan Status"; the variable and the labels
# now agree and the variable is continuous, as this section requires.
ggplot(df, aes(x = LoanAmount)) +
  geom_dotplot(fill = "gold",
               color = "azure4") +
  theme_minimal() +
  labs(title = "Loan Train by Loan Amount",
       y = "Frequency",
       x = "Loan Amount")

2.2 Visualisasi Bivariabel

2.2.1 Categorical vs. Categorical

Grouped Bar Chart

library(ggplot2)                                     # for visualization
# Grouped bar chart: counts per Education level, one bar per Property_Area
# placed side by side; preserve = "single" keeps all bars the same width.
ggplot(data = df, mapping = aes(x = Education, fill = Property_Area)) +
  geom_bar(position = position_dodge(preserve = "single")) +
  theme_minimal()

2.2.2 Continuous vs. Continuous

Scatterplot Fit Lines

library(ggplot2)                                  
# Scatter plot of LoanAmount against ApplicantIncome with a fitted
# linear-regression line. The y axis is now labelled; it was left blank
# although the x axis and the title were labelled.
ggplot(df,
       aes(x = ApplicantIncome,
           y = LoanAmount)) +
  geom_point(color = "cornflowerblue") +
  geom_smooth(method = "lm", color = "brown1") +
  theme_minimal() +
  labs(x = "Applicant Income",
       y = "Loan Amount",
       title = "Applicant Income vs. Loan Amount")

2.2.3 Categorical vs. Continuous

Grouped Kernel Density Plots

# Overlaid kernel density curves of ApplicantIncome, one per Education
# level; alpha = 0.4 makes the fills translucent so both stay visible.
ggplot(data = df, mapping = aes(x = ApplicantIncome, fill = Education)) +
  geom_density(alpha = 0.4) +
  labs(title = "Applicant Income distribution by Education") +
  theme_minimal()

2.3 Visualisasi Multivariabel

2.3.1 Grouping

library(carData)                                     # for dataset
library(ggplot2)                                     # for visualization
# The data(df, package = "carData") call was removed: `df` is the loan data
# read from loan-train.csv, not a dataset shipped with carData.
# Scatter plot of LoanAmount against ApplicantIncome, coloured by Education.
ggplot(df, aes(x = ApplicantIncome,
               y = LoanAmount,
               color = Education)) +
  geom_point() +
  theme_minimal() +
  labs(title = "Loan Amount by Applicant Income and Education")

2.3.2 Faceting

library(carData)  
library(ggplot2)      
# LoanAmount histograms, one panel per Education level, stacked in a
# single column.
ggplot(data = df, mapping = aes(x = LoanAmount)) +
  geom_histogram(color = "white", fill = "cornflowerblue") +
  facet_wrap(facets = ~Education, ncol = 1) +
  labs(title = "Loan Amount histograms by Education") +
  theme_minimal()

Tugas 3

Lakukan proses analisa data secara deskriptif menggunakan R dan Python dengan beberapa langkah berikut:

3.1 Kualitatif

3.1.1 Kategori Univariat

# Read the data and list, for every column, the row positions holding NA.
df <- read.csv(file = "loan-train.csv")
apply(X = is.na(df), MARGIN = 2, FUN = which)
## $Loan_ID
## integer(0)
## 
## $Gender
## integer(0)
## 
## $Married
## integer(0)
## 
## $Dependents
## integer(0)
## 
## $Education
## integer(0)
## 
## $Self_Employed
## integer(0)
## 
## $ApplicantIncome
## integer(0)
## 
## $CoapplicantIncome
## integer(0)
## 
## $LoanAmount
##  [1]   1  36  64  82  96 103 104 114 128 203 285 306 323 339 388 436 438 480 525
## [20] 551 552 606
## 
## $Loan_Amount_Term
##  [1]  20  37  45  46  74 113 166 198 224 233 336 368 422 424
## 
## $Credit_History
##  [1]  17  25  31  43  80  84  87  96 118 126 130 131 157 182 188 199 220 237 238
## [20] 260 261 280 310 314 318 319 324 349 364 378 393 396 412 445 450 452 461 474
## [39] 491 492 498 504 507 531 534 545 557 566 584 601
## 
## $Property_Area
## integer(0)
## 
## $Loan_Status
## integer(0)
# Keep the complete rows only, preview the first three of them, and tabulate
# how often each Gender value occurs.
df <- na.omit(df)
head(df, n = 3)
Cat1 <- table(df[["Gender"]])
Cat1
## 
##        Female   Male 
##     12     95    422
# Relative frequency (share of rows) of each Gender value.
prop.table(table(df[["Gender"]]))
## 
##                Female       Male 
## 0.02268431 0.17958412 0.79773157

3.1.2 Kategori Bivariat

library(readr)           
library(dplyr)             
library(magrittr)                                

# Two-way contingency table: Gender (rows) by Education (columns).
Cat2 <- table(df[, c("Gender", "Education")])
Cat2
##         Education
## Gender   Graduate Not Graduate
##                11            1
##   Female       80           15
##   Male        330           92

3.1.3 Kategori Multivariat

# Flat contingency table across Gender, Education and Married.
Cat3 <- ftable(df[, c("Gender", "Education", "Married")])
Cat3
##                     Married      No Yes
## Gender Education                       
##        Graduate               0   3   8
##        Not Graduate           0   0   1
## Female Graduate               0  57  23
##        Not Graduate           0  11   4
## Male   Graduate               2  91 237
##        Not Graduate           0  26  66

3.2 Kuantitatif

3.2.1 Univariat numerik

3.2.1.1 Measures of Central Tendency

# Keep only the numeric columns for the quantitative summaries.
# select(where(...)) replaces the superseded select_if().
Quan <- df %>%
  select(where(is.numeric))
names(Quan)
## [1] "ApplicantIncome"   "CoapplicantIncome" "LoanAmount"       
## [4] "Loan_Amount_Term"  "Credit_History"

Mean

# Arithmetic mean of LoanAmount.
mean(Quan[["LoanAmount"]])
## [1] 145.8526

Quantile

# Minimum, quartiles and maximum of LoanAmount.
quantile(Quan[["LoanAmount"]])
##   0%  25%  50%  75% 100% 
##    9  100  128  167  700

Median

# Median of LoanAmount.
median(Quan[["LoanAmount"]])
## [1] 128

Mode

# Statistical mode of LoanAmount: the most frequent value (the first one on
# ties). Base R's mode() reports the storage mode of the vector instead, so
# the value is computed from the tallies of the distinct values.
loan_values <- unique(Quan$LoanAmount)
loan_values[which.max(tabulate(match(Quan$LoanAmount, loan_values)))]
## (re-run to refresh this output; the earlier result, "numeric", was the
## storage mode of the column, not its most frequent value)

Summary

# Summary statistics (min, quartiles, median, mean, max) of each column
# in Quan.
summary(Quan)  
##  ApplicantIncome CoapplicantIncome   LoanAmount    Loan_Amount_Term
##  Min.   :  150   Min.   :    0     Min.   :  9.0   Min.   : 36.0   
##  1st Qu.: 2900   1st Qu.:    0     1st Qu.:100.0   1st Qu.:360.0   
##  Median : 3816   Median : 1086     Median :128.0   Median :360.0   
##  Mean   : 5508   Mean   : 1542     Mean   :145.9   Mean   :342.4   
##  3rd Qu.: 5815   3rd Qu.: 2232     3rd Qu.:167.0   3rd Qu.:360.0   
##  Max.   :81000   Max.   :33837     Max.   :700.0   Max.   :480.0   
##  Credit_History  
##  Min.   :0.0000  
##  1st Qu.:1.0000  
##  Median :1.0000  
##  Mean   :0.8507  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

3.2.1.2 Scale

Variance

# Sample variance of LoanAmount.
var(Quan[["LoanAmount"]])
## [1] 7074.224

Standard Deviation

# Sample standard deviation of LoanAmount.
sd(Quan[["LoanAmount"]])
## [1] 84.10841

Median Absolute Deviation

# Median absolute deviation of LoanAmount.
mad(Quan[["LoanAmount"]])
## [1] 45.9606

Interquartile Range

# Interquartile range (Q3 - Q1) of LoanAmount.
IQR(Quan[["LoanAmount"]])
## [1] 67

3.2.1.3 Skewness

library(e1071)
# Skewness of LoanAmount.
skewness(Quan[["LoanAmount"]])
## [1] 2.593174

3.2.1.4 Kurtosis

# Kurtosis of LoanAmount.
kurtosis(Quan[["LoanAmount"]])
## [1] 9.936842

3.2.2 Bivariat numerik

Covariance

# Sample covariance between LoanAmount and Loan_Amount_Term.
cov(Quan[["LoanAmount"]], Quan[["Loan_Amount_Term"]])
## [1] 126.7792

Pearson’s Correlation Coefficient

# Pearson correlation between LoanAmount and Loan_Amount_Term.
cor(Quan[["LoanAmount"]], Quan[["Loan_Amount_Term"]])
## [1] 0.02323917

Z-Score

# Z-score of every LoanAmount: distance from the mean in units of the
# sample standard deviation.
zscore <- with(Quan, (LoanAmount - mean(LoanAmount)) / sd(LoanAmount))

3.2.3 Multivariat numerik

Sample Covariance Matrix

# Sample covariance matrix of all columns in Quan.
cov(Quan)    
##                   ApplicantIncome CoapplicantIncome    LoanAmount
## ApplicantIncome      4.101291e+07     -1.982437e+06  3.074072e+05
## CoapplicantIncome   -1.982437e+06      6.372069e+06  3.379027e+04
## LoanAmount           3.074072e+05      3.379027e+04  7.074224e+03
## Loan_Amount_Term    -2.611139e+04     -4.740321e+01  1.267792e+02
## Credit_History      -5.432772e+01     -9.768561e+00 -5.447886e-01
##                   Loan_Amount_Term Credit_History
## ApplicantIncome      -2.611139e+04    -54.3277231
## CoapplicantIncome    -4.740321e+01     -9.7685610
## LoanAmount            1.267792e+02     -0.5447886
## Loan_Amount_Term      4.207035e+03      0.2003351
## Credit_History        2.003351e-01      0.1272770

Sample Correlation Matrix

# Sample correlation matrix of all columns in Quan.
cor(Quan) 
##                   ApplicantIncome CoapplicantIncome  LoanAmount
## ApplicantIncome        1.00000000     -0.1226305807  0.57070849
## CoapplicantIncome     -0.12263058      1.0000000000  0.15915197
## LoanAmount             0.57070849      0.1591519703  1.00000000
## Loan_Amount_Term      -0.06286105     -0.0002895206  0.02323917
## Credit_History        -0.02377860     -0.0108471425 -0.01815573
##                   Loan_Amount_Term Credit_History
## ApplicantIncome      -0.0628610527    -0.02377860
## CoapplicantIncome    -0.0002895206    -0.01084714
## LoanAmount            0.0232391675    -0.01815573
## Loan_Amount_Term      1.0000000000     0.00865753
## Credit_History        0.0086575296     1.00000000

3.3 EDA dengan cara Malas

library(funModeling) 
library(tidyverse) 
library(Hmisc)
library(skimr)
# Run a battery of quick exploratory summaries on a data frame.
#
# data  data frame to explore.
#
# Calls, in order: glimpse(), skim(), df_status(), freq(), profiling_num(),
# plot_num() and describe(). The value of describe(data), being the last
# expression, is the function's return value.
basic_eda <- function(data) {
  glimpse(data)
  # Values are auto-printed only at top level; without print() the skim()
  # summary is computed and then silently discarded inside the function.
  print(skim(data))
  df_status(data)
  freq(data)
  profiling_num(data)
  plot_num(data)
  describe(data)
}
# Run the exploratory summaries on `df`.
basic_eda(df)
## Rows: 529
## Columns: 13
## $ Loan_ID           <chr> "LP001003", "LP001005", "LP001006", "LP001008", "LP0~
## $ Gender            <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Mal~
## $ Married           <chr> "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes~
## $ Dependents        <chr> "1", "0", "0", "0", "2", "0", "3+", "2", "1", "2", "~
## $ Education         <chr> "Graduate", "Graduate", "Not Graduate", "Graduate", ~
## $ Self_Employed     <chr> "No", "Yes", "No", "No", "Yes", "No", "No", "No", "N~
## $ ApplicantIncome   <int> 4583, 3000, 2583, 6000, 5417, 2333, 3036, 4006, 1284~
## $ CoapplicantIncome <dbl> 1508, 0, 2358, 0, 4196, 1516, 2504, 1526, 10968, 700~
## $ LoanAmount        <int> 128, 66, 120, 141, 267, 95, 158, 168, 349, 70, 109, ~
## $ Loan_Amount_Term  <int> 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 36~
## $ Credit_History    <int> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0~
## $ Property_Area     <chr> "Rural", "Urban", "Urban", "Urban", "Urban", "Urban"~
## $ Loan_Status       <chr> "N", "Y", "Y", "Y", "Y", "Y", "N", "Y", "N", "Y", "Y~
##             variable q_zeros p_zeros q_na p_na q_inf p_inf      type unique
## 1            Loan_ID       0    0.00    0    0     0     0 character    529
## 2             Gender       0    0.00    0    0     0     0 character      3
## 3            Married       0    0.00    0    0     0     0 character      3
## 4         Dependents     295   55.77    0    0     0     0 character      5
## 5          Education       0    0.00    0    0     0     0 character      2
## 6      Self_Employed       0    0.00    0    0     0     0 character      3
## 7    ApplicantIncome       0    0.00    0    0     0     0   integer    442
## 8  CoapplicantIncome     238   44.99    0    0     0     0   numeric    245
## 9         LoanAmount       0    0.00    0    0     0     0   integer    194
## 10  Loan_Amount_Term       0    0.00    0    0     0     0   integer      9
## 11    Credit_History      79   14.93    0    0     0     0   integer      2
## 12     Property_Area       0    0.00    0    0     0     0 character      3
## 13       Loan_Status       0    0.00    0    0     0     0 character      2
##      Loan_ID frequency percentage cumulative_perc
## 1   LP001003         1       0.19            0.19
## 2   LP001005         1       0.19            0.38
## 3   LP001006         1       0.19            0.57
## 4   LP001008         1       0.19            0.76
## 5   LP001011         1       0.19            0.95
## 6   LP001013         1       0.19            1.14
## 7   LP001014         1       0.19            1.33
## 8   LP001018         1       0.19            1.52
## 9   LP001020         1       0.19            1.71
## 10  LP001024         1       0.19            1.90
## 11  LP001027         1       0.19            2.09
## 12  LP001028         1       0.19            2.28
## 13  LP001029         1       0.19            2.47
## 14  LP001030         1       0.19            2.66
## 15  LP001032         1       0.19            2.85
## 16  LP001036         1       0.19            3.04
## 17  LP001038         1       0.19            3.23
## 18  LP001043         1       0.19            3.42
## 19  LP001046         1       0.19            3.61
## 20  LP001047         1       0.19            3.80
## 21  LP001050         1       0.19            3.99
## 22  LP001066         1       0.19            4.18
## 23  LP001068         1       0.19            4.37
## 24  LP001073         1       0.19            4.56
## 25  LP001086         1       0.19            4.75
## 26  LP001087         1       0.19            4.94
## 27  LP001095         1       0.19            5.13
## 28  LP001097         1       0.19            5.32
## 29  LP001098         1       0.19            5.51
## 30  LP001100         1       0.19            5.70
## 31  LP001112         1       0.19            5.89
## 32  LP001114         1       0.19            6.08
## 33  LP001116         1       0.19            6.27
## 34  LP001119         1       0.19            6.46
## 35  LP001120         1       0.19            6.65
## 36  LP001131         1       0.19            6.84
## 37  LP001138         1       0.19            7.03
## 38  LP001144         1       0.19            7.22
## 39  LP001146         1       0.19            7.41
## 40  LP001151         1       0.19            7.60
## 41  LP001155         1       0.19            7.79
## 42  LP001157         1       0.19            7.98
## 43  LP001164         1       0.19            8.17
## 44  LP001179         1       0.19            8.36
## 45  LP001186         1       0.19            8.55
## 46  LP001194         1       0.19            8.74
## 47  LP001195         1       0.19            8.93
## 48  LP001197         1       0.19            9.12
## 49  LP001198         1       0.19            9.31
## 50  LP001199         1       0.19            9.50
## 51  LP001205         1       0.19            9.69
## 52  LP001206         1       0.19            9.88
## 53  LP001207         1       0.19           10.07
## 54  LP001222         1       0.19           10.26
## 55  LP001225         1       0.19           10.45
## 56  LP001228         1       0.19           10.64
## 57  LP001233         1       0.19           10.83
## 58  LP001238         1       0.19           11.02
## 59  LP001241         1       0.19           11.21
## 60  LP001243         1       0.19           11.40
## 61  LP001245         1       0.19           11.59
## 62  LP001248         1       0.19           11.78
## 63  LP001253         1       0.19           11.97
## 64  LP001255         1       0.19           12.16
## 65  LP001256         1       0.19           12.35
## 66  LP001259         1       0.19           12.54
## 67  LP001263         1       0.19           12.73
## 68  LP001265         1       0.19           12.92
## 69  LP001267         1       0.19           13.11
## 70  LP001275         1       0.19           13.30
## 71  LP001279         1       0.19           13.49
## 72  LP001282         1       0.19           13.68
## 73  LP001289         1       0.19           13.87
## 74  LP001310         1       0.19           14.06
## 75  LP001316         1       0.19           14.25
## 76  LP001318         1       0.19           14.44
## 77  LP001319         1       0.19           14.63
## 78  LP001322         1       0.19           14.82
## 79  LP001325         1       0.19           15.01
## 80  LP001327         1       0.19           15.20
## 81  LP001333         1       0.19           15.39
## 82  LP001334         1       0.19           15.58
## 83  LP001343         1       0.19           15.77
## 84  LP001345         1       0.19           15.96
## 85  LP001349         1       0.19           16.15
## 86  LP001357         1       0.19           16.34
## 87  LP001367         1       0.19           16.53
## 88  LP001369         1       0.19           16.72
## 89  LP001370         1       0.19           16.91
## 90  LP001379         1       0.19           17.10
## 91  LP001384         1       0.19           17.29
## 92  LP001385         1       0.19           17.48
## 93  LP001387         1       0.19           17.67
## 94  LP001398         1       0.19           17.86
## 95  LP001401         1       0.19           18.05
## 96  LP001404         1       0.19           18.24
## 97  LP001421         1       0.19           18.43
## 98  LP001422         1       0.19           18.62
## 99  LP001426         1       0.19           18.81
## 100 LP001430         1       0.19           19.00
## 101 LP001431         1       0.19           19.19
## 102 LP001432         1       0.19           19.38
## 103 LP001439         1       0.19           19.57
## 104 LP001448         1       0.19           19.76
## 105 LP001451         1       0.19           19.95
## 106 LP001473         1       0.19           20.14
## 107 LP001478         1       0.19           20.33
## 108 LP001482         1       0.19           20.52
## 109 LP001487         1       0.19           20.71
## 110 LP001488         1       0.19           20.90
## 111 LP001489         1       0.19           21.09
## 112 LP001491         1       0.19           21.28
## 113 LP001492         1       0.19           21.47
## 114 LP001493         1       0.19           21.66
## 115 LP001497         1       0.19           21.85
## 116 LP001498         1       0.19           22.04
## 117 LP001504         1       0.19           22.23
## 118 LP001507         1       0.19           22.42
## 119 LP001508         1       0.19           22.61
## 120 LP001514         1       0.19           22.80
## 121 LP001516         1       0.19           22.99
## 122 LP001518         1       0.19           23.18
## 123 LP001519         1       0.19           23.37
## 124 LP001520         1       0.19           23.56
## 125 LP001528         1       0.19           23.75
## 126 LP001529         1       0.19           23.94
## 127 LP001531         1       0.19           24.13
## 128 LP001532         1       0.19           24.32
## 129 LP001535         1       0.19           24.51
## 130 LP001536         1       0.19           24.70
## 131 LP001543         1       0.19           24.89
## 132 LP001546         1       0.19           25.08
## 133 LP001552         1       0.19           25.27
## 134 LP001560         1       0.19           25.46
## 135 LP001562         1       0.19           25.65
## 136 LP001565         1       0.19           25.84
## 137 LP001570         1       0.19           26.03
## 138 LP001572         1       0.19           26.22
## 139 LP001577         1       0.19           26.41
## 140 LP001578         1       0.19           26.60
## 141 LP001579         1       0.19           26.79
## 142 LP001580         1       0.19           26.98
## 143 LP001581         1       0.19           27.17
## 144 LP001585         1       0.19           27.36
## 145 LP001586         1       0.19           27.55
## 146 LP001594         1       0.19           27.74
## 147 LP001603         1       0.19           27.93
## 148 LP001606         1       0.19           28.12
## 149 LP001608         1       0.19           28.31
## 150 LP001610         1       0.19           28.50
## 151 LP001616         1       0.19           28.69
## 152 LP001630         1       0.19           28.88
## 153 LP001633         1       0.19           29.07
## 154 LP001636         1       0.19           29.26
## 155 LP001637         1       0.19           29.45
## 156 LP001639         1       0.19           29.64
## 157 LP001640         1       0.19           29.83
## 158 LP001641         1       0.19           30.02
## 159 LP001644         1       0.19           30.21
## 160 LP001647         1       0.19           30.40
## 161 LP001653         1       0.19           30.59
## 162 LP001656         1       0.19           30.78
## 163 LP001657         1       0.19           30.97
## 164 LP001658         1       0.19           31.16
## 165 LP001664         1       0.19           31.35
## 166 LP001665         1       0.19           31.54
## 167 LP001666         1       0.19           31.73
## 168 LP001673         1       0.19           31.92
## 169 LP001674         1       0.19           32.11
## 170 LP001677         1       0.19           32.30
## 171 LP001688         1       0.19           32.49
## 172 LP001691         1       0.19           32.68
## 173 LP001692         1       0.19           32.87
## 174 LP001693         1       0.19           33.06
## 175 LP001698         1       0.19           33.25
## 176 LP001699         1       0.19           33.44
## 177 LP001702         1       0.19           33.63
## 178 LP001708         1       0.19           33.82
## 179 LP001711         1       0.19           34.01
## 180 LP001713         1       0.19           34.20
## 181 LP001715         1       0.19           34.39
## 182 LP001716         1       0.19           34.58
## 183 LP001720         1       0.19           34.77
## 184 LP001722         1       0.19           34.96
## 185 LP001726         1       0.19           35.15
## 186 LP001732         1       0.19           35.34
## 187 LP001736         1       0.19           35.53
## 188 LP001743         1       0.19           35.72
## 189 LP001744         1       0.19           35.91
## 190 LP001750         1       0.19           36.10
## 191 LP001751         1       0.19           36.29
## 192 LP001754         1       0.19           36.48
## 193 LP001758         1       0.19           36.67
## 194 LP001760         1       0.19           36.86
## 195 LP001761         1       0.19           37.05
## 196 LP001765         1       0.19           37.24
## 197 LP001768         1       0.19           37.43
## 198 LP001776         1       0.19           37.62
## 199 LP001778         1       0.19           37.81
## 200 LP001784         1       0.19           38.00
## 201 LP001790         1       0.19           38.19
## 202 LP001792         1       0.19           38.38
## 203 LP001798         1       0.19           38.57
## 204 LP001800         1       0.19           38.76
## 205 LP001806         1       0.19           38.95
## 206 LP001807         1       0.19           39.14
## 207 LP001811         1       0.19           39.33
## 208 LP001813         1       0.19           39.52
## 209 LP001814         1       0.19           39.71
## 210 LP001819         1       0.19           39.90
## 211 LP001824         1       0.19           40.09
## 212 LP001825         1       0.19           40.28
## 213 LP001835         1       0.19           40.47
## 214 LP001836         1       0.19           40.66
## 215 LP001841         1       0.19           40.85
## 216 LP001843         1       0.19           41.04
## 217 LP001844         1       0.19           41.23
## 218 LP001846         1       0.19           41.42
## 219 LP001849         1       0.19           41.61
## 220 LP001854         1       0.19           41.80
## 221 LP001859         1       0.19           41.99
## 222 LP001868         1       0.19           42.18
## 223 LP001870         1       0.19           42.37
## 224 LP001871         1       0.19           42.56
## 225 LP001872         1       0.19           42.75
## 226 LP001875         1       0.19           42.94
## 227 LP001877         1       0.19           43.13
## 228 LP001882         1       0.19           43.32
## 229 LP001883         1       0.19           43.51
## 230 LP001884         1       0.19           43.70
## 231 LP001888         1       0.19           43.89
## 232 LP001891         1       0.19           44.08
## 233 LP001892         1       0.19           44.27
## 234 LP001894         1       0.19           44.46
## 235 LP001896         1       0.19           44.65
## 236 LP001900         1       0.19           44.84
## 237 LP001903         1       0.19           45.03
## 238 LP001904         1       0.19           45.22
## 239 LP001907         1       0.19           45.41
## 240 LP001910         1       0.19           45.60
## 241 LP001914         1       0.19           45.79
## 242 LP001915         1       0.19           45.98
## 243 LP001917         1       0.19           46.17
## 244 LP001924         1       0.19           46.36
## 245 LP001925         1       0.19           46.55
## 246 LP001926         1       0.19           46.74
## 247 LP001931         1       0.19           46.93
## 248 LP001935         1       0.19           47.12
## 249 LP001936         1       0.19           47.31
## 250 LP001938         1       0.19           47.50
## 251 LP001940         1       0.19           47.69
## 252 LP001945         1       0.19           47.88
## 253 LP001947         1       0.19           48.07
## 254 LP001949         1       0.19           48.26
## 255 LP001953         1       0.19           48.45
## 256 LP001954         1       0.19           48.64
## 257 LP001955         1       0.19           48.83
## 258 LP001963         1       0.19           49.02
## 259 LP001964         1       0.19           49.21
## 260 LP001972         1       0.19           49.40
## 261 LP001974         1       0.19           49.59
## 262 LP001977         1       0.19           49.78
## 263 LP001978         1       0.19           49.97
## 264 LP001993         1       0.19           50.16
## 265 LP001994         1       0.19           50.35
## 266 LP001996         1       0.19           50.54
## 267 LP002002         1       0.19           50.73
## 268 LP002004         1       0.19           50.92
## 269 LP002006         1       0.19           51.11
## 270 LP002024         1       0.19           51.30
## 271 LP002031         1       0.19           51.49
## 272 LP002035         1       0.19           51.68
## 273 LP002050         1       0.19           51.87
## 274 LP002051         1       0.19           52.06
## 275 LP002053         1       0.19           52.25
## 276 LP002065         1       0.19           52.44
## 277 LP002067         1       0.19           52.63
## 278 LP002068         1       0.19           52.82
## 279 LP002082         1       0.19           53.01
## 280 LP002086         1       0.19           53.20
## 281 LP002087         1       0.19           53.39
## 282 LP002097         1       0.19           53.58
## 283 LP002098         1       0.19           53.77
## 284 LP002100         1       0.19           53.96
## 285 LP002101         1       0.19           54.15
## 286 LP002103         1       0.19           54.34
## 287 LP002110         1       0.19           54.53
## 288 LP002112         1       0.19           54.72
## 289 LP002114         1       0.19           54.91
## 290 LP002115         1       0.19           55.10
## 291 LP002116         1       0.19           55.29
## 292 LP002119         1       0.19           55.48
## 293 LP002126         1       0.19           55.67
## 294 LP002128         1       0.19           55.86
## 295 LP002129         1       0.19           56.05
## 296 LP002130         1       0.19           56.24
## 297 LP002131         1       0.19           56.43
## 298 LP002138         1       0.19           56.62
## 299 LP002139         1       0.19           56.81
## 300 LP002140         1       0.19           57.00
## 301 LP002141         1       0.19           57.19
## 302 LP002142         1       0.19           57.38
## 303 LP002143         1       0.19           57.57
## 304 LP002144         1       0.19           57.76
## 305 LP002149         1       0.19           57.95
## 306 LP002151         1       0.19           58.14
## 307 LP002158         1       0.19           58.33
## 308 LP002160         1       0.19           58.52
## 309 LP002161         1       0.19           58.71
## 310 LP002170         1       0.19           58.90
## 311 LP002175         1       0.19           59.09
## 312 LP002180         1       0.19           59.28
## 313 LP002181         1       0.19           59.47
## 314 LP002187         1       0.19           59.66
## 315 LP002190         1       0.19           59.85
## 316 LP002191         1       0.19           60.04
## 317 LP002194         1       0.19           60.23
## 318 LP002197         1       0.19           60.42
## 319 LP002201         1       0.19           60.61
## 320 LP002205         1       0.19           60.80
## 321 LP002209         1       0.19           60.99
## 322 LP002211         1       0.19           61.18
## 323 LP002219         1       0.19           61.37
## 324 LP002224         1       0.19           61.56
## 325 LP002225         1       0.19           61.75
## 326 LP002226         1       0.19           61.94
## 327 LP002229         1       0.19           62.13
## 328 LP002231         1       0.19           62.32
## 329 LP002234         1       0.19           62.51
## 330 LP002236         1       0.19           62.70
## 331 LP002237         1       0.19           62.89
## 332 LP002239         1       0.19           63.08
## 333 LP002244         1       0.19           63.27
## 334 LP002250         1       0.19           63.46
## 335 LP002255         1       0.19           63.65
## 336 LP002262         1       0.19           63.84
## 337 LP002265         1       0.19           64.03
## 338 LP002266         1       0.19           64.22
## 339 LP002277         1       0.19           64.41
## 340 LP002281         1       0.19           64.60
## 341 LP002284         1       0.19           64.79
## 342 LP002287         1       0.19           64.98
## 343 LP002288         1       0.19           65.17
## 344 LP002296         1       0.19           65.36
## 345 LP002297         1       0.19           65.55
## 346 LP002300         1       0.19           65.74
## 347 LP002301         1       0.19           65.93
## 348 LP002305         1       0.19           66.12
## 349 LP002308         1       0.19           66.31
## 350 LP002314         1       0.19           66.50
## 351 LP002315         1       0.19           66.69
## 352 LP002317         1       0.19           66.88
## 353 LP002318         1       0.19           67.07
## 354 LP002328         1       0.19           67.26
## 355 LP002332         1       0.19           67.45
## 356 LP002335         1       0.19           67.64
## 357 LP002337         1       0.19           67.83
## 358 LP002341         1       0.19           68.02
## 359 LP002342         1       0.19           68.21
## 360 LP002345         1       0.19           68.40
## 361 LP002347         1       0.19           68.59
## 362 LP002348         1       0.19           68.78
## 363 LP002361         1       0.19           68.97
## 364 LP002364         1       0.19           69.16
## 365 LP002366         1       0.19           69.35
## 366 LP002367         1       0.19           69.54
## 367 LP002368         1       0.19           69.73
## 368 LP002369         1       0.19           69.92
## 369 LP002370         1       0.19           70.11
## 370 LP002377         1       0.19           70.30
## 371 LP002379         1       0.19           70.49
## 372 LP002386         1       0.19           70.68
## 373 LP002387         1       0.19           70.87
## 374 LP002390         1       0.19           71.06
## 375 LP002398         1       0.19           71.25
## 376 LP002403         1       0.19           71.44
## 377 LP002407         1       0.19           71.63
## 378 LP002408         1       0.19           71.82
## 379 LP002409         1       0.19           72.01
## 380 LP002418         1       0.19           72.20
## 381 LP002422         1       0.19           72.39
## 382 LP002429         1       0.19           72.58
## 383 LP002434         1       0.19           72.77
## 384 LP002435         1       0.19           72.96
## 385 LP002443         1       0.19           73.15
## 386 LP002446         1       0.19           73.34
## 387 LP002448         1       0.19           73.53
## 388 LP002449         1       0.19           73.72
## 389 LP002453         1       0.19           73.91
## 390 LP002455         1       0.19           74.10
## 391 LP002459         1       0.19           74.29
## 392 LP002467         1       0.19           74.48
## 393 LP002472         1       0.19           74.67
## 394 LP002473         1       0.19           74.86
## 395 LP002484         1       0.19           75.05
## 396 LP002487         1       0.19           75.24
## 397 LP002489         1       0.19           75.43
## 398 LP002493         1       0.19           75.62
## 399 LP002494         1       0.19           75.81
## 400 LP002500         1       0.19           76.00
## 401 LP002501         1       0.19           76.19
## 402 LP002502         1       0.19           76.38
## 403 LP002505         1       0.19           76.57
## 404 LP002515         1       0.19           76.76
## 405 LP002517         1       0.19           76.95
## 406 LP002519         1       0.19           77.14
## 407 LP002524         1       0.19           77.33
## 408 LP002527         1       0.19           77.52
## 409 LP002529         1       0.19           77.71
## 410 LP002530         1       0.19           77.90
## 411 LP002531         1       0.19           78.09
## 412 LP002534         1       0.19           78.28
## 413 LP002536         1       0.19           78.47
## 414 LP002537         1       0.19           78.66
## 415 LP002541         1       0.19           78.85
## 416 LP002543         1       0.19           79.04
## 417 LP002544         1       0.19           79.23
## 418 LP002545         1       0.19           79.42
## 419 LP002547         1       0.19           79.61
## 420 LP002555         1       0.19           79.80
## 421 LP002556         1       0.19           79.99
## 422 LP002571         1       0.19           80.18
## 423 LP002582         1       0.19           80.37
## 424 LP002585         1       0.19           80.56
## 425 LP002586         1       0.19           80.75
## 426 LP002587         1       0.19           80.94
## 427 LP002600         1       0.19           81.13
## 428 LP002602         1       0.19           81.32
## 429 LP002603         1       0.19           81.51
## 430 LP002606         1       0.19           81.70
## 431 LP002615         1       0.19           81.89
## 432 LP002619         1       0.19           82.08
## 433 LP002622         1       0.19           82.27
## 434 LP002625         1       0.19           82.46
## 435 LP002626         1       0.19           82.65
## 436 LP002634         1       0.19           82.84
## 437 LP002637         1       0.19           83.03
## 438 LP002640         1       0.19           83.22
## 439 LP002643         1       0.19           83.41
## 440 LP002648         1       0.19           83.60
## 441 LP002652         1       0.19           83.79
## 442 LP002659         1       0.19           83.98
## 443 LP002670         1       0.19           84.17
## 444 LP002682         1       0.19           84.36
## 445 LP002683         1       0.19           84.55
## 446 LP002684         1       0.19           84.74
## 447 LP002689         1       0.19           84.93
## 448 LP002690         1       0.19           85.12
## 449 LP002692         1       0.19           85.31
## 450 LP002693         1       0.19           85.50
## 451 LP002699         1       0.19           85.69
## 452 LP002705         1       0.19           85.88
## 453 LP002706         1       0.19           86.07
## 454 LP002714         1       0.19           86.26
## 455 LP002716         1       0.19           86.45
## 456 LP002720         1       0.19           86.64
## 457 LP002723         1       0.19           86.83
## 458 LP002731         1       0.19           87.02
## 459 LP002732         1       0.19           87.21
## 460 LP002734         1       0.19           87.40
## 461 LP002738         1       0.19           87.59
## 462 LP002739         1       0.19           87.78
## 463 LP002740         1       0.19           87.97
## 464 LP002741         1       0.19           88.16
## 465 LP002743         1       0.19           88.35
## 466 LP002753         1       0.19           88.54
## 467 LP002755         1       0.19           88.73
## 468 LP002767         1       0.19           88.92
## 469 LP002768         1       0.19           89.11
## 470 LP002772         1       0.19           89.30
## 471 LP002776         1       0.19           89.49
## 472 LP002777         1       0.19           89.68
## 473 LP002785         1       0.19           89.87
## 474 LP002788         1       0.19           90.06
## 475 LP002789         1       0.19           90.25
## 476 LP002792         1       0.19           90.44
## 477 LP002795         1       0.19           90.63
## 478 LP002798         1       0.19           90.82
## 479 LP002804         1       0.19           91.01
## 480 LP002807         1       0.19           91.20
## 481 LP002813         1       0.19           91.39
## 482 LP002820         1       0.19           91.58
## 483 LP002821         1       0.19           91.77
## 484 LP002832         1       0.19           91.96
## 485 LP002836         1       0.19           92.15
## 486 LP002837         1       0.19           92.34
## 487 LP002840         1       0.19           92.53
## 488 LP002841         1       0.19           92.72
## 489 LP002842         1       0.19           92.91
## 490 LP002847         1       0.19           93.10
## 491 LP002855         1       0.19           93.29
## 492 LP002862         1       0.19           93.48
## 493 LP002863         1       0.19           93.67
## 494 LP002868         1       0.19           93.86
## 495 LP002872         1       0.19           94.05
## 496 LP002874         1       0.19           94.24
## 497 LP002877         1       0.19           94.43
## 498 LP002888         1       0.19           94.62
## 499 LP002892         1       0.19           94.81
## 500 LP002893         1       0.19           95.00
## 501 LP002894         1       0.19           95.19
## 502 LP002911         1       0.19           95.38
## 503 LP002912         1       0.19           95.57
## 504 LP002916         1       0.19           95.76
## 505 LP002917         1       0.19           95.95
## 506 LP002925         1       0.19           96.14
## 507 LP002926         1       0.19           96.33
## 508 LP002928         1       0.19           96.52
## 509 LP002931         1       0.19           96.71
## 510 LP002933         1       0.19           96.90
## 511 LP002936         1       0.19           97.09
## 512 LP002938         1       0.19           97.28
## 513 LP002940         1       0.19           97.47
## 514 LP002941         1       0.19           97.66
## 515 LP002943         1       0.19           97.85
## 516 LP002945         1       0.19           98.04
## 517 LP002948         1       0.19           98.23
## 518 LP002950         1       0.19           98.42
## 519 LP002953         1       0.19           98.61
## 520 LP002958         1       0.19           98.80
## 521 LP002959         1       0.19           98.99
## 522 LP002961         1       0.19           99.18
## 523 LP002964         1       0.19           99.37
## 524 LP002974         1       0.19           99.56
## 525 LP002978         1       0.19           99.75
## 526 LP002979         1       0.19           99.94
## 527 LP002983         1       0.19          100.13
## 528 LP002984         1       0.19          100.32
## 529 LP002990         1       0.19          100.00

##   Gender frequency percentage cumulative_perc
## 1   Male       422      79.77           79.77
## 2 Female        95      17.96           97.73
## 3               12       2.27          100.00

##   Married frequency percentage cumulative_perc
## 1     Yes       339      64.08           64.08
## 2      No       188      35.54           99.62
## 3                 2       0.38          100.00

##   Dependents frequency percentage cumulative_perc
## 1          0       295      55.77           55.77
## 2          2        92      17.39           73.16
## 3          1        85      16.07           89.23
## 4         3+        45       8.51           97.74
## 5                   12       2.27          100.00

##      Education frequency percentage cumulative_perc
## 1     Graduate       421      79.58           79.58
## 2 Not Graduate       108      20.42          100.00

##   Self_Employed frequency percentage cumulative_perc
## 1            No       434      82.04           82.04
## 2           Yes        70      13.23           95.27
## 3                      25       4.73          100.00

##   Property_Area frequency percentage cumulative_perc
## 1     Semiurban       209      39.51           39.51
## 2         Urban       165      31.19           70.70
## 3         Rural       155      29.30          100.00

##   Loan_Status frequency percentage cumulative_perc
## 1           Y       366      69.19           69.19
## 2           N       163      30.81          100.00

## data 
## 
##  13  Variables      529  Observations
## --------------------------------------------------------------------------------
## Loan_ID 
##        n  missing distinct 
##      529        0      529 
## 
## lowest : LP001003 LP001005 LP001006 LP001008 LP001011
## highest: LP002978 LP002979 LP002983 LP002984 LP002990
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      517       12        2 
##                         
## Value      Female   Male
## Frequency      95    422
## Proportion  0.184  0.816
## --------------------------------------------------------------------------------
## Married 
##        n  missing distinct 
##      527        2        2 
##                       
## Value         No   Yes
## Frequency    188   339
## Proportion 0.357 0.643
## --------------------------------------------------------------------------------
## Dependents 
##        n  missing distinct 
##      517       12        4 
##                                   
## Value          0     1     2    3+
## Frequency    295    85    92    45
## Proportion 0.571 0.164 0.178 0.087
## --------------------------------------------------------------------------------
## Education 
##        n  missing distinct 
##      529        0        2 
##                                     
## Value          Graduate Not Graduate
## Frequency           421          108
## Proportion        0.796        0.204
## --------------------------------------------------------------------------------
## Self_Employed 
##        n  missing distinct 
##      504       25        2 
##                       
## Value         No   Yes
## Frequency    434    70
## Proportion 0.861 0.139
## --------------------------------------------------------------------------------
## ApplicantIncome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      529        0      442        1     5508     4325     1927     2275 
##      .25      .50      .75      .90      .95 
##     2900     3816     5815     9542    14643 
## 
## lowest :   150   210   645   674  1000, highest: 39147 39999 51763 63337 81000
## --------------------------------------------------------------------------------
## CoapplicantIncome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      529        0      245    0.909     1542     2001        0        0 
##      .25      .50      .75      .90      .95 
##        0     1086     2232     3670     4890 
## 
## lowest :     0.00    16.12   189.00   240.00   242.00
## highest:  8980.00 10968.00 11300.00 20000.00 33837.00
## --------------------------------------------------------------------------------
## LoanAmount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      529        0      194        1    145.9    78.82     55.0     71.0 
##      .25      .50      .75      .90      .95 
##    100.0    128.0    167.0    234.4    294.4 
## 
## lowest :   9  17  25  26  30, highest: 496 500 570 600 700
## --------------------------------------------------------------------------------
## Loan_Amount_Term 
##        n  missing distinct     Info     Mean      Gmd 
##      529        0        9    0.376    342.4    43.83 
## 
## lowest :  36  60  84 120 180, highest: 180 240 300 360 480
##                                                                 
## Value         36    60    84   120   180   240   300   360   480
## Frequency      2     2     3     3    41     2    10   452    14
## Proportion 0.004 0.004 0.006 0.006 0.078 0.004 0.019 0.854 0.026
## --------------------------------------------------------------------------------
## Credit_History 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      529        0        2    0.381      450   0.8507   0.2546 
## 
## --------------------------------------------------------------------------------
## Property_Area 
##        n  missing distinct 
##      529        0        3 
##                                         
## Value          Rural Semiurban     Urban
## Frequency        155       209       165
## Proportion     0.293     0.395     0.312
## --------------------------------------------------------------------------------
## Loan_Status 
##        n  missing distinct 
##      529        0        2 
##                       
## Value          N     Y
## Frequency    163   366
## Proportion 0.308 0.692
## --------------------------------------------------------------------------------

Tugas 4

Lakukan pemeriksaan distribusi densitas pada setiap variabel kuantitatif menggunakan R dan Python dengan beberapa bagian sebagai berikut:

4.1 Univariat numerik

4.1.1 ApplicantIncome

# Density check for ApplicantIncome: a density-scaled histogram, an overlaid
# normal curve, and three shaded normal-probability panels.
# NOTE(review): the histogram is drawn from the freshly re-read `df`, while the
# normal parameters come from `Quan`, which is created elsewhere in this
# document -- confirm both describe the same rows.
df <- read.csv("loan-train.csv")

hist(
  df$ApplicantIncome,
  freq = FALSE,  # density scale, so the normal curve is comparable
  col = "blue",
  main = "",
  xlab = ""
)

# Normal parameters estimated once from Quan and reused below.
applicant_income_mean <- mean(Quan$ApplicantIncome)
applicant_income_sd <- sd(Quan$ApplicantIncome)

curve(
  dnorm(x, mean = applicant_income_mean, sd = applicant_income_sd),
  add = TRUE,
  col = "black",
  lwd = 3
)

library(visualize)
par(mfrow = c(2, 2))  # 2 x 2 panel grid; three panels are drawn
visualize.norm(
  stat = 1,
  mu = applicant_income_mean,
  sd = applicant_income_sd,
  section = "lower"
)
visualize.norm(
  stat = c(3, 3000),
  mu = applicant_income_mean,
  sd = applicant_income_sd,
  section = "bounded"
)
visualize.norm(
  stat = 1,
  mu = applicant_income_mean,
  sd = applicant_income_sd,
  section = "upper"
)

4.1.2 LoanAmount

# Density check for LoanAmount: a density-scaled histogram, an overlaid normal
# curve, and three shaded normal-probability panels.
# NOTE(review): the histogram is drawn from the freshly re-read `df`, while the
# normal parameters come from `Quan`, which is created elsewhere in this
# document -- confirm both describe the same rows.
df <- read.csv("loan-train.csv")

hist(
  df$LoanAmount,
  freq = FALSE,  # density scale, so the normal curve is comparable
  col = "blue",
  main = "",
  xlab = ""
)

# Normal parameters estimated once from Quan and reused below.
loan_amount_mean <- mean(Quan$LoanAmount)
loan_amount_sd <- sd(Quan$LoanAmount)

curve(
  dnorm(x, mean = loan_amount_mean, sd = loan_amount_sd),
  add = TRUE,
  col = "black",
  lwd = 3
)

library(visualize)
par(mfrow = c(2, 2))  # 2 x 2 panel grid; three panels are drawn
visualize.norm(
  stat = 1,
  mu = loan_amount_mean,
  sd = loan_amount_sd,
  section = "lower"
)
visualize.norm(
  stat = c(3, 50),
  mu = loan_amount_mean,
  sd = loan_amount_sd,
  section = "bounded"
)
visualize.norm(
  stat = 1,
  mu = loan_amount_mean,
  sd = loan_amount_sd,
  section = "upper"
)

4.1.3 Credit_History

# Density check for Credit_History: a density-scaled histogram, an overlaid
# normal curve, and three shaded normal-probability panels.
# NOTE(review): the histogram is drawn from the freshly re-read `df`, while the
# normal parameters come from `Quan`, which is created elsewhere in this
# document -- confirm both describe the same rows.
df <- read.csv("loan-train.csv")

hist(
  df$Credit_History,
  freq = FALSE,  # density scale, so the normal curve is comparable
  col = "blue",
  main = "",
  xlab = ""
)

# Normal parameters estimated once from Quan and reused below.
credit_history_mean <- mean(Quan$Credit_History)
credit_history_sd <- sd(Quan$Credit_History)

curve(
  dnorm(x, mean = credit_history_mean, sd = credit_history_sd),
  add = TRUE,
  col = "black",
  lwd = 3
)

library(visualize)
par(mfrow = c(2, 2))  # 2 x 2 panel grid; three panels are drawn
visualize.norm(
  stat = 1,
  mu = credit_history_mean,
  sd = credit_history_sd,
  section = "lower"
)
visualize.norm(
  stat = c(0.3, 0.6),
  mu = credit_history_mean,
  sd = credit_history_sd,
  section = "bounded"
)
visualize.norm(
  stat = 1,
  mu = credit_history_mean,
  sd = credit_history_sd,
  section = "upper"
)

4.1.4 CoapplicantIncome

# Density check for CoapplicantIncome: a density-scaled histogram, an overlaid
# normal curve, and three shaded normal-probability panels.
# NOTE(review): the histogram is drawn from the freshly re-read `df`, while the
# normal parameters come from `Quan`, which is created elsewhere in this
# document -- confirm both describe the same rows.
df <- read.csv("loan-train.csv")

hist(
  df$CoapplicantIncome,
  freq = FALSE,  # density scale, so the normal curve is comparable
  col = "blue",
  main = "",
  xlab = ""
)

# Normal parameters estimated once from Quan and reused below.
coapplicant_income_mean <- mean(Quan$CoapplicantIncome)
coapplicant_income_sd <- sd(Quan$CoapplicantIncome)

curve(
  dnorm(x, mean = coapplicant_income_mean, sd = coapplicant_income_sd),
  add = TRUE,
  col = "black",
  lwd = 3
)

library(visualize)
par(mfrow = c(2, 2))  # 2 x 2 panel grid; three panels are drawn
visualize.norm(
  stat = 1,
  mu = coapplicant_income_mean,
  sd = coapplicant_income_sd,
  section = "lower"
)
visualize.norm(
  stat = c(3, 2000),
  mu = coapplicant_income_mean,
  sd = coapplicant_income_sd,
  section = "bounded"
)
visualize.norm(
  stat = 1,
  mu = coapplicant_income_mean,
  sd = coapplicant_income_sd,
  section = "upper"
)

4.1.5 Loan_Amount_Term

# Density check for Loan_Amount_Term: a density-scaled histogram, an overlaid
# normal curve, and three shaded normal-probability panels.
# NOTE(review): the histogram is drawn from the freshly re-read `df`, while the
# normal parameters come from `Quan`, which is created elsewhere in this
# document -- confirm both describe the same rows.
df <- read.csv("loan-train.csv")

hist(
  df$Loan_Amount_Term,
  freq = FALSE,  # density scale, so the normal curve is comparable
  col = "blue",
  main = "",
  xlab = ""
)

# Normal parameters estimated once from Quan and reused below.
loan_term_mean <- mean(Quan$Loan_Amount_Term)
loan_term_sd <- sd(Quan$Loan_Amount_Term)

curve(
  dnorm(x, mean = loan_term_mean, sd = loan_term_sd),
  add = TRUE,
  col = "black",
  lwd = 3
)

library(visualize)
par(mfrow = c(2, 2))  # 2 x 2 panel grid; three panels are drawn
visualize.norm(
  stat = 250,
  mu = loan_term_mean,
  sd = loan_term_sd,
  section = "lower"
)
visualize.norm(
  stat = c(300, 400),
  mu = loan_term_mean,
  sd = loan_term_sd,
  section = "bounded"
)
visualize.norm(
  stat = 250,
  mu = loan_term_mean,
  sd = loan_term_sd,
  section = "upper"
)

4.2 Bivariat numerik

4.2.1 ApplicantIncome dan CoapplicantIncome

# Scatter plot of CoapplicantIncome against ApplicantIncome with 2-D density
# contours layered on top of the points.
# NOTE(review): relies on `df` from the preceding chunk and on ggplot2 having
# been attached earlier in the document -- confirm.
df1 <- ggplot(
  data = df,
  mapping = aes(x = ApplicantIncome, y = CoapplicantIncome)
) +
  geom_point(alpha = 0.5) +  # semi-transparent points
  geom_density_2d()

# Display the stored plot object.
df1

4.2.2 ApplicantIncome dan LoanAmount

# Scatter plot of LoanAmount against ApplicantIncome with 2-D density contours
# layered on top of the points.
# NOTE(review): relies on `df` from an earlier chunk and on ggplot2 having been
# attached earlier in the document -- confirm. If `df` is the raw CSV, rows
# with a missing LoanAmount cannot be plotted.
df1 <- ggplot(
  data = df,
  mapping = aes(x = ApplicantIncome, y = LoanAmount)
) +
  geom_point(alpha = 0.5) +  # semi-transparent points
  geom_density_2d()

# Display the stored plot object.
df1

4.2.3 CoapplicantIncome dan LoanAmount

# Scatter plot of LoanAmount against CoapplicantIncome with 2-D density
# contours layered on top of the points.
# NOTE(review): relies on `df` from an earlier chunk and on ggplot2 having been
# attached earlier in the document -- confirm. If `df` is the raw CSV, rows
# with a missing LoanAmount cannot be plotted.
df1 <- ggplot(
  data = df,
  mapping = aes(x = CoapplicantIncome, y = LoanAmount)
) +
  geom_point(alpha = 0.5) +  # semi-transparent points
  geom_density_2d()

# Display the stored plot object.
df1

4.3 Multivariat numerik

# Multivariate view: LoanAmount against ApplicantIncome, coloured by
# Credit_History. The loan data already held in `df` is plotted directly; no
# packaged dataset is loaded.
library(carData)  # nothing in this chunk uses it; kept in case later chunks do
library(ggplot2)  # for visualization
ggplot(
  df,
  aes(
    x = ApplicantIncome,
    y = LoanAmount,
    # Credit_History shows two distinct values in the summary earlier in this
    # document; as a factor it gets a discrete legend instead of a gradient.
    color = factor(Credit_History)
  )
) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Loan Amount by Applicant Income and Credit History",
    color = "Credit_History"
  )

Tugas 5

Lakukan proses pengujian Hipotesis menggunakan R dan Python pada setiap variabel kuantitatif dengan beberapa bagian sebagai berikut:

5.1 Margin of Error dan Estimasi Interval

  • Hitunglah margin of error dan estimasi interval untuk proporsi peminjam berjenis kelamin perempuan pada tingkat kepercayaan 95%.
# Sample proportion of borrowers recorded as "Female" and its standard error.
# NOTE(review): n counts every row of `df`; the Gender frequency table earlier
# in this document shows rows with a blank Gender, which would be counted in n
# but never in k -- confirm that this is intended.
library(MASS)
k <- sum(df$Gender == "Female")     # rows whose Gender is "Female"
n <- length(df$Gender)              # total number of rows
pbar <- k / n                       # sample proportion
SE <- sqrt(pbar * (1 - pbar) / n)   # standard error of the proportion
SE
## [1] 0.01558505

5.1.1 Margin of error

# Margin of error: the 97.5th percentile of the standard normal times the
# standard error SE computed in the preceding chunk.
E <- qnorm(0.975) * SE
E
## [1] 0.03054614

5.1.2 Estimasi Interval

# Interval estimate for the proportion: prop.test() with its default settings,
# using k successes out of n trials from the earlier chunk. The printed result
# below reports the confidence interval.
library(stats)
prop.test(x = k, n = n)
## 
##  1-sample proportions test with continuity correction
## 
## data:  k out of n, null probability 0.5
## X-squared = 246.45, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1531133 0.2157616
## sample estimates:
##         p 
## 0.1824104

5.2 Ukuran Sampel

  • Jika Anda berencana menggunakan perkiraan proporsi 50% data konsumen berjenis kelamin perempuan, temukan ukuran sampel yang diperlukan untuk mencapai margin kesalahan 5% untuk data observasi pada tingkat kepercayaan 95%.
# Sample size needed for a given margin of error on a proportion:
#   n = zstar^2 * p * (1 - p) / E^2
zstar <- qnorm(0.975)  # standard-normal quantile for 95% confidence
p <- 0.5               # planned proportion estimate
E <- 0.05              # target margin of error
n_required <- zstar^2 * p * (1 - p) / E^2
n_required
## [1] 384.1459
# A sample size is a whole number of observations, so round up.
ceiling(n_required)
## [1] 385

5.3 Pembuktian Kebenaran

  • Lakukan pembuktian kebenaran asumsi dengan tingkat signifikansi 0.05, jika Bank mengklaim bahwa pinjaman rata-rata konsumen adalah:

5.3.1 Lebih besar $ 150

# z statistic for the mean loan amount against the hypothesized value 150,
# followed by the right-tail critical value at the 0.05 significance level.
# NOTE(review): n is taken from the NA-free `df`, while the mean and standard
# deviation come from `Quan` (created elsewhere in this document) -- confirm
# both hold the same rows.
df <- read.csv("loan-train.csv")
df <- na.omit(df)                       # keep complete rows only
mu0 <- 150                              # hypothesized value
xbar <- mean(Quan$LoanAmount)           # sample mean
sigma <- sd(Quan$LoanAmount)            # sample standard deviation
n <- nrow(df)                           # sample size
z <- (xbar - mu0) / (sigma / sqrt(n))   # test statistic
z
## [1] -1.134147
alpha <- 0.05                           # significance level
z.alpha <- qnorm(1 - alpha)             # right-tail critical value
z.alpha
## [1] 1.644854

5.3.2 Lebih kecil $ 150

# Critical value for the left-tailed version of the test: the negation of
# z.alpha, which is assigned in the preceding chunk.
-z.alpha                                               # left tail critical value 
## [1] -1.644854

5.3.3 Sama dengan $ 150

# Critical values for the two-tailed test: alpha is split evenly between the
# two tails, giving a symmetric pair around zero.
alpha <- 0.05                          # overall significance level
z.half.alpha <- qnorm(1 - alpha / 2)   # quantile leaving alpha/2 in each tail
c(-1, 1) * z.half.alpha                # lower and upper critical values
## [1] -1.959964  1.959964

5.4 Pembuktian Kebenaran Jika std 85

  • Lakukan pembuktian kebenaran asumsi dengan tingkat signifikansi 0.05, seperti di atas, jika diketahui simpangan baku pinjaman adalah $ 85.

5.4.1 Lebih besar $ 150

# z statistic for the mean loan amount against the hypothesized value 150 when
# the standard deviation is given as 85, followed by the right-tail critical
# value at the 0.05 significance level.
# NOTE(review): n is taken from the NA-free `df`, while the mean comes from
# `Quan` (created elsewhere in this document) -- confirm both hold the same
# rows.
df <- read.csv("loan-train.csv")
df <- na.omit(df)                       # keep complete rows only
mu0 <- 150                              # hypothesized value
xbar <- mean(Quan$LoanAmount)           # sample mean
sigma <- 85                             # standard deviation given in the task
n <- nrow(df)                           # sample size
z <- (xbar - mu0) / (sigma / sqrt(n))   # test statistic
z
## [1] -1.122251
alpha <- 0.05                           # significance level
z.alpha <- qnorm(1 - alpha)             # right-tail critical value
z.alpha
## [1] 1.644854

5.4.2 Lebih kecil $ 150

# Critical value for the left-tailed version of the test: the negation of
# z.alpha, which is assigned in the preceding chunk.
-z.alpha                                               # left tail critical value 
## [1] -1.644854

5.4.3 Sama dengan $ 150

# Critical values for the two-tailed test: alpha is split evenly between the
# two tails, giving a symmetric pair around zero.
alpha <- 0.05                          # overall significance level
z.half.alpha <- qnorm(1 - alpha / 2)   # quantile leaving alpha/2 in each tail
c(-1, 1) * z.half.alpha                # lower and upper critical values
## [1] -1.959964  1.959964