Komputasi Statistika

Ujian Tengah Semester

Kontak	: $\downarrow$
Email	putri.angelina@matanauniversity.ac.id
GitHub	https://github.com/putriangelinaw/
RPubs	https://rpubs.com/putriangelinaw/

Data Set

Kumpulan data yang digunakan adalah data konsumen yang melakukan pinjaman di suatu Bank. Dataset ini memiliki 614 observasi, 13 atribut sebagai berikut:

Tugas 1

Lakukan proses persiapan data dengan R dan Python, dengan beberapa langkah berikut:

Import Data

# Load the raw loan records from the CSV file in the working directory.
loan_data <- read.csv(file = "loan_data.csv")
# Display the imported table.
loan_data

Penanganan Data Hilang

library(dplyr)

# Store the credit-history flag and the loan term as text; both columns are
# handled as categorical variables in the steps that follow.
loan_data <- mutate(
  loan_data,
  Credit_History   = as.character(Credit_History),
  Loan_Amount_Term = as.character(Loan_Amount_Term)
)

## Find the columns that contain empty strings (missing categorical values)
list_nan <- names(loan_data)[
  vapply(loan_data, function(column) any(column == "", na.rm = TRUE), logical(1))
]
list_nan

## [1] "Gender"        "Married"       "Dependents"    "Self_Employed"

# Turn the empty strings in the four affected columns into explicit NA values.
loan_dataNA <- mutate(
  loan_data,
  Gender        = replace(Gender, Gender %in% "", NA),
  Married       = replace(Married, Married %in% "", NA),
  Dependents    = replace(Dependents, Dependents %in% "", NA),
  Self_Employed = replace(Self_Employed, Self_Employed %in% "", NA)
)


# Names of the columns of the raw data that already hold NA values.
list_na <- names(loan_data)[vapply(loan_data, anyNA, logical(1))]
list_na

## [1] "LoanAmount"       "Loan_Amount_Term" "Credit_History"

# Return the most frequent value(s) of x. Every value tied for the highest
# count is returned, in order of first appearance.
modes <- function(x) {
  distinct_values <- unique(x)
  counts <- tabulate(match(x, distinct_values))
  distinct_values[which(counts == max(counts))]
}

# Impute missing values: the numeric column gets its mean, the categorical
# columns get their most frequent observed value.
#
# modes() counts NA like any other value and returns every tied value, so the
# helper below removes NA before calling it and keeps only the first mode.
# This guarantees one non-missing replacement value per column.
fill_with_mode <- function(x) {
  observed <- x[!is.na(x)]
  replace(x, is.na(x), modes(observed)[1])
}

clean.loan <- loan_dataNA %>%
  mutate(
    LoanAmount       = replace(LoanAmount, is.na(LoanAmount),
                               mean(LoanAmount, na.rm = TRUE)),
    Loan_Amount_Term = as.character(fill_with_mode(Loan_Amount_Term)),
    Credit_History   = as.character(fill_with_mode(Credit_History)),
    Gender           = fill_with_mode(Gender),
    Married          = fill_with_mode(Married),
    Dependents       = fill_with_mode(Dependents),
    Self_Employed    = fill_with_mode(Self_Employed)
  )
# Check again for missing values (0 means every gap was filled)
sum(is.na(clean.loan))

## [1] 0

clean.loan

Periksa Data Duplikat

library(dplyr)
# Compare the total row count with the number of distinct rows; equal values
# mean that no record is duplicated.
check.duplicate <- data.frame(
  row_of_data        = nrow(clean.loan),
  row_of_unique.data = nrow(distinct(clean.loan))
)
check.duplicate

Pemisahan Data Kategori dan Numerik

Data Kategori

# Keep only the text (categorical) columns.
category.loan <- Filter(is.character, clean.loan)

category.loan

Data Numerik

# Keep only the numeric columns.
numeric.loan <- Filter(is.numeric, clean.loan)

numeric.loan

Penanganan Data Numerik

Standarisasi

# Standardisation: apply scale() with its default settings to every numeric
# column and collect the results in a data frame.

standardized <- numeric.loan %>%
  lapply(scale) %>%
  as.data.frame()
standardized

Normalisasi

# Normalisation (min-max scaling)

# Rescale x linearly so that its minimum maps to 0 and its maximum to 1.
normalize <- function(x) {
  bounds <- range(x)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}
normalized <- numeric.loan %>%
  lapply(normalize) %>%
  as.data.frame()
normalized

Robust Scaler

# Robust scaler

# Centre x on its median and divide by its interquartile range
# (75th percentile minus 25th percentile).
robust <- function(x) {
  quartiles <- quantile(x, probs = c(.25, .75))
  (x - median(x)) / (quartiles[2] - quartiles[1])
}

robust_data <- numeric.loan %>%
  lapply(robust) %>%
  as.data.frame()
robust_data

Penanganan Data Pencilan

# Flag outliers with the 1.5 * IQR rule.
# Returns a logical vector: TRUE where x lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR.
outliers <- function(x) {
  quartiles <- quantile(x, probs = c(.25, .75))
  fence <- 1.5 * IQR(x)

  too_low  <- x < quartiles[1] - fence
  too_high <- x > quartiles[2] + fence

  too_low | too_high
}

# Rows flagged as outliers in each numeric column
dfap <- subset(numeric.loan, outliers(numeric.loan$ApplicantIncome))
dfco <- subset(numeric.loan, outliers(numeric.loan$CoapplicantIncome))
dfla <- subset(numeric.loan, outliers(numeric.loan$LoanAmount))

# Union of the flagged rows, without repeats
outlier.loan <- distinct(rbind(dfap, dfco, dfla))
nrow(outlier.loan)

## [1] 79

# Remove outlier
# The join keys are given explicitly (the numeric columns of outlier.loan), so
# the result does not depend on dplyr guessing the common columns and no
# "Joining, by = ..." message is printed.
no_outlier_loan <- anti_join(clean.loan, outlier.loan, by = names(outlier.loan))
no_outlier_loan

Penanganan Data Kategorikal

# Number of distinct values in every column
no_outlier_loan %>% summarise_all(n_distinct)

# Encode a categorical vector as a factor labelled 0, 1, ..., k - 1.
# The number of labels is taken from the data, so the encoding does not fail
# when a column has more or fewer categories than a hard-coded range expects.
# Codes follow the sorted level order produced by factor(); Loan_Amount_Term
# is stored as text, so its terms are ordered as strings, not as numbers.
encode_labels <- function(x) {
  encoded <- factor(x)
  levels(encoded) <- seq_along(levels(encoded)) - 1L
  encoded
}

GenderLabel           <- encode_labels(no_outlier_loan$Gender)
MarriedLabel          <- encode_labels(no_outlier_loan$Married)
DependentsLabel       <- encode_labels(no_outlier_loan$Dependents)
EducationLabel        <- encode_labels(no_outlier_loan$Education)
Self_EmployedLabel    <- encode_labels(no_outlier_loan$Self_Employed)
Loan_Amount_TermLabel <- encode_labels(no_outlier_loan$Loan_Amount_Term)
Credit_HistoryLabel   <- encode_labels(no_outlier_loan$Credit_History)
Property_AreaLabel    <- encode_labels(no_outlier_loan$Property_Area)
Loan_StatusLabel      <- encode_labels(no_outlier_loan$Loan_Status)

# Loan identifier followed by the encoded categorical columns
category.labeled   <- data.frame(no_outlier_loan$Loan_ID,
                                 GenderLabel,
                                 MarriedLabel,
                                 DependentsLabel,
                                 EducationLabel,
                                 Self_EmployedLabel,
                                 Loan_Amount_TermLabel,
                                 Credit_HistoryLabel,
                                 Property_AreaLabel,
                                 Loan_StatusLabel)

category.labeled

Tugas 2

Lakukan Proses Visualisasi Data dengan menggunakan R dan Python dengan beberapa langkah berikut:

Visualisasi Univariabel

Data Kategorikal

library(ggplot2)
library(patchwork)

# One bar chart per categorical column, all built on the same data set.
cat_base <- ggplot(no_outlier_loan)

plotG   <- cat_base + aes(x = Gender)           + geom_bar()
plotM   <- cat_base + aes(x = Married)          + geom_bar()
plotD   <- cat_base + aes(x = Dependents)       + geom_bar()
plotE   <- cat_base + aes(x = Education)        + geom_bar()
plotSE  <- cat_base + aes(x = Self_Employed)    + geom_bar()
# The loan-term axis labels are rotated by 45 degrees.
plotLAT <- cat_base + aes(x = Loan_Amount_Term) + geom_bar() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))
plotCH  <- cat_base + aes(x = Credit_History)   + geom_bar()
plotPA  <- cat_base + aes(x = Property_Area)    + geom_bar()
plotLS  <- cat_base + aes(x = Loan_Status)      + geom_bar()

# Arrange the nine charts in three columns.
plotG + plotM + plotD +
  plotE + plotSE + plotLAT +
  plotCH + plotPA + plotLS +
  plot_layout(ncol = 3)

Data Numerikal

# Histograms (12 bins each) of the three numeric columns.
plotAI <- ggplot(data = no_outlier_loan, mapping = aes(x = ApplicantIncome)) +
  geom_histogram(bins = 12, fill = "red", colour = "white")
plotCI <- ggplot(data = no_outlier_loan, mapping = aes(x = CoapplicantIncome)) +
  geom_histogram(bins = 12, fill = "blue", colour = "white")
plotLA <- ggplot(data = no_outlier_loan, mapping = aes(x = LoanAmount)) +
  geom_histogram(bins = 12, fill = "violet", colour = "white")

# Stack the histograms in three rows.
plotAI + plotCI + plotLA +
  plot_layout(nrow = 3)

Visualisasi Bivariabel

Data Kategorikal vs Kategorikal

# Combined income of applicant and co-applicant
no_outlier_loan$TotalIncome <- with(no_outlier_loan, ApplicantIncome + CoapplicantIncome)

# Side-by-side bars; preserve = "single" keeps every bar the same width.
dodge_single <- position_dodge(preserve = "single")

plotG_M  <- ggplot(no_outlier_loan, aes(x = Gender, fill = Married)) +
  theme_get() +
  geom_bar(position = dodge_single)
plotM_E  <- ggplot(no_outlier_loan, aes(x = Married, fill = Education)) +
  theme_get() +
  geom_bar(position = dodge_single)
PlotE_PA <- ggplot(no_outlier_loan, aes(x = Education, fill = Property_Area)) +
  theme_get() +
  geom_bar(position = dodge_single)

plotG_M + plotM_E + PlotE_PA +
  plot_layout(ncol = 2)

Data Numerikal vs Numerikal

# Scatter plots for each pair of numeric variables.
# Points are used instead of lines: every row is a separate loan record, so
# joining the observations in x order would draw connections with no meaning.
# This also matches the multivariate plots, which use geom_point().
plotLA_CI <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = CoapplicantIncome)) +
  geom_point()
plotLA_AI <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = ApplicantIncome)) +
  geom_point()
plotAI_CI <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = CoapplicantIncome)) +
  geom_point()

plotLA_CI +
  plotLA_AI +
  plotAI_CI +
  plot_layout(ncol = 2)

Data Kategorikal vs Numerikal

# Density of a numeric variable, split by a categorical variable (fill colour).

# The fill variable is Loan_Status, so the title names Loan Status
# (it previously said "Loan Amount  Term").
plotLA_LS <- ggplot(no_outlier_loan,
                    aes(x = LoanAmount, fill = Loan_Status)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Loan Amount distribution by Loan Status")

plotCI_M <- ggplot(no_outlier_loan,
                   aes(x = CoapplicantIncome, fill = Married)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Coapplicant Income distribution by Married")

plotAI_E <- ggplot(no_outlier_loan,
                   aes(x = ApplicantIncome, fill = Education)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Applicant Income distribution by Education")

# NOTE(review): the object name suggests LoanAmount ("LA"), but both the
# mapping and the title use ApplicantIncome; confirm which one is intended.
plotLA_PA <- ggplot(no_outlier_loan,
                    aes(x = ApplicantIncome, fill = Property_Area)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Applicant Income distribution by Property Area")

plotLA_LS +
  plotCI_M +
  plotAI_E +
  plotLA_PA +
  plot_layout(ncol = 2)

Visualisasi Multivariabel

# Scatter plots with two extra variables encoded as point shape and colour.

# Loan_Amount_Term has more categories than ggplot2's default shape scale
# provides (it handles at most 6), which would drop points with a warning.
# One shape code per distinct term (0, 1, 2, ...) is supplied manually.
term_shapes <- seq_along(unique(no_outlier_loan$Loan_Amount_Term)) - 1L

plotAI_LA_LAT <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = LoanAmount,
                                             shape = Loan_Amount_Term, colour = Property_Area)) +
  geom_point() +
  scale_shape_manual(values = term_shapes)
plotLA_CI_LAT <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = CoapplicantIncome,
                                             shape = Loan_Amount_Term, colour = Property_Area)) +
  geom_point() +
  scale_shape_manual(values = term_shapes)
plotAI_CI_E   <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = CoapplicantIncome,
                                             shape = Education, colour = Property_Area)) +
  geom_point()
plotAI_LA_E   <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = LoanAmount,
                                             shape = Education, colour = Property_Area)) +
  geom_point()
plotLA_CI_E   <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = CoapplicantIncome,
                                             shape = Education, colour = Property_Area)) +
  geom_point()

plotAI_LA_LAT

plotLA_CI_LAT

plotAI_CI_E

plotAI_LA_E

plotLA_CI_E

Tugas 3

Lakukan proses analisis data secara deskriptif menggunakan R dan Python dengan beberapa langkah berikut:

Kualitatif

Kategori Univariat

# Relative frequency of every category, one categorical column at a time
prop.table(table(no_outlier_loan[["Gender"]]))

## 
##    Female      Male 
## 0.1869159 0.8130841

prop.table(table(no_outlier_loan[["Married"]]))

## 
##        No       Yes 
## 0.3495327 0.6504673

prop.table(table(no_outlier_loan[["Dependents"]]))

## 
##          0          1          2         3+ 
## 0.60000000 0.15700935 0.16822430 0.07476636

prop.table(table(no_outlier_loan[["Education"]]))

## 
##     Graduate Not Graduate 
##    0.7551402    0.2448598

prop.table(table(no_outlier_loan[["Self_Employed"]]))

## 
##        No       Yes 
## 0.8859813 0.1140187

prop.table(table(no_outlier_loan[["Loan_Amount_Term"]]))

## 
##          12         120         180         240         300          36 
## 0.001869159 0.005607477 0.067289720 0.007476636 0.018691589 0.003738318 
##         360         480          60          84 
## 0.857943925 0.026168224 0.003738318 0.007476636

prop.table(table(no_outlier_loan[["Credit_History"]]))

## 
##         0         1 
## 0.1457944 0.8542056

prop.table(table(no_outlier_loan[["Property_Area"]]))

## 
##     Rural Semiurban     Urban 
## 0.2990654 0.3831776 0.3177570

prop.table(table(no_outlier_loan[["Loan_Status"]]))

## 
##         N         Y 
## 0.3046729 0.6953271

Kategori Bivariat

# Two-way contingency tables (counts) for pairs of categorical columns
table(select(no_outlier_loan, Gender, Married))

##         Married
## Gender    No Yes
##   Female  72  28
##   Male   115 320

table(select(no_outlier_loan, Gender, Education))

##         Education
## Gender   Graduate Not Graduate
##   Female       82           18
##   Male        322          113

table(select(no_outlier_loan, Gender, Property_Area))

##         Property_Area
## Gender   Rural Semiurban Urban
##   Female    24        49    27
##   Male     136       156   143

table(select(no_outlier_loan, Education, Self_Employed))

##               Self_Employed
## Education       No Yes
##   Graduate     358  46
##   Not Graduate 116  15

table(select(no_outlier_loan, Gender, Loan_Amount_Term))

##         Loan_Amount_Term
## Gender    12 120 180 240 300  36 360 480  60  84
##   Female   0   0   2   1   1   1  90   4   0   1
##   Male     1   3  34   3   9   1 369  10   2   3

table(select(no_outlier_loan, Married, Loan_Amount_Term))

##        Loan_Amount_Term
## Married  12 120 180 240 300  36 360 480  60  84
##     No    0   1   7   1   3   2 164   8   1   0
##     Yes   1   2  29   3   7   0 295   6   1   4

Kategori Multivariat

# Flat contingency table of gender, marital status and education
ftable(select(no_outlier_loan, Gender, Married, Education))

##                Education Graduate Not Graduate
## Gender Married                                
## Female No                      60           12
##        Yes                     22            6
## Male   No                      84           31
##        Yes                    238           82

# Flat contingency table across five categorical columns
ftable(select(no_outlier_loan, Gender, Married, Education, Property_Area, Loan_Amount_Term))

##                                           Loan_Amount_Term 12 120 180 240 300 36 360 480 60 84
## Gender Married Education    Property_Area                                                     
## Female No      Graduate     Rural                           0   0   0   0   0  0  13   2  0  0
##                             Semiurban                       0   0   0   0   1  1  23   0  0  0
##                             Urban                           0   0   1   0   0  0  18   1  0  0
##                Not Graduate Rural                           0   0   0   0   0  0   4   0  0  0
##                             Semiurban                       0   0   0   0   0  0   5   0  0  0
##                             Urban                           0   0   0   0   0  0   3   0  0  0
##        Yes     Graduate     Rural                           0   0   0   0   0  0   3   0  0  0
##                             Semiurban                       0   0   1   1   0  0  11   1  0  1
##                             Urban                           0   0   0   0   0  0   4   0  0  0
##                Not Graduate Rural                           0   0   0   0   0  0   2   0  0  0
##                             Semiurban                       0   0   0   0   0  0   4   0  0  0
##                             Urban                           0   0   0   0   0  0   0   0  0  0
## Male   No      Graduate     Rural                           0   0   0   0   0  0  27   0  0  0
##                             Semiurban                       0   0   1   0   1  0  23   2  0  0
##                             Urban                           0   0   3   0   0  0  24   2  1  0
##                Not Graduate Rural                           0   0   0   0   1  0  12   0  0  0
##                             Semiurban                       0   1   0   0   0  1   9   0  0  0
##                             Urban                           0   0   2   1   0  0   3   1  0  0
##        Yes     Graduate     Rural                           0   0   7   0   1  0  57   0  0  2
##                             Semiurban                       0   1   3   1   3  0  84   2  0  0
##                             Urban                           1   1   5   1   1  0  67   0  0  1
##                Not Graduate Rural                           0   0   3   0   0  0  26   0  0  0
##                             Semiurban                       0   0   2   0   1  0  19   2  0  0
##                             Urban                           0   0   8   0   1  0  18   1  1  0

Kuantitatif

Univariat numerik

library(e1071)
# Summary statistics of the numeric columns
num_new <- Filter(is.numeric, no_outlier_loan)
summary(num_new)

##  ApplicantIncome CoapplicantIncome   LoanAmount     TotalIncome   
##  Min.   :  150   Min.   :   0      Min.   :  9.0   Min.   : 1442  
##  1st Qu.: 2752   1st Qu.:   0      1st Qu.:100.0   1st Qu.: 3900  
##  Median : 3598   Median :1260      Median :124.0   Median : 5000  
##  Mean   : 4054   Mean   :1323      Mean   :127.0   Mean   : 5377  
##  3rd Qu.: 4891   3rd Qu.:2194      3rd Qu.:151.5   3rd Qu.: 6411  
##  Max.   :10139   Max.   :5701      Max.   :260.0   Max.   :13746

# Variance
vapply(num_new, var, numeric(1))

##   ApplicantIncome CoapplicantIncome        LoanAmount       TotalIncome 
##       3435005.100       2019826.684          1990.793       4049506.682

# Standard Deviation
vapply(num_new, sd, numeric(1))

##   ApplicantIncome CoapplicantIncome        LoanAmount       TotalIncome 
##        1853.37668        1421.20607          44.61831        2012.33861

# Median Absolute Deviation
vapply(num_new, mad, numeric(1))

##   ApplicantIncome CoapplicantIncome        LoanAmount       TotalIncome 
##         1504.8390         1868.0760           38.5476         1813.2198

# Inter Quantile Range
vapply(num_new, IQR, numeric(1))

##   ApplicantIncome CoapplicantIncome        LoanAmount       TotalIncome 
##            2138.5            2194.0              51.5            2511.0

# Skewness
vapply(num_new, skewness, numeric(1))

##   ApplicantIncome CoapplicantIncome        LoanAmount       TotalIncome 
##         1.1358372         0.8411675         0.3989309         0.9268307

# Kurtosis
vapply(num_new, kurtosis, numeric(1))

##   ApplicantIncome CoapplicantIncome        LoanAmount       TotalIncome 
##        1.19274525        0.01943741        0.47240548        0.94869136

Bivariat numerik

# Covariance of each pair of numeric columns
with(no_outlier_loan, cov(ApplicantIncome, CoapplicantIncome))

## [1] -702662.6

with(no_outlier_loan, cov(CoapplicantIncome, LoanAmount))

## [1] 18225.52

with(no_outlier_loan, cov(LoanAmount, ApplicantIncome))

## [1] 39323.8

# Correlation of the same pairs
with(no_outlier_loan, cor(ApplicantIncome, CoapplicantIncome))

## [1] -0.2667633

with(no_outlier_loan, cor(CoapplicantIncome, LoanAmount))

## [1] 0.2874152

with(no_outlier_loan, cor(LoanAmount, ApplicantIncome))

## [1] 0.4755308

Multivariat numerik

# Covariance matrix of all numeric columns (Pearson, the default method)
cov(num_new, method = "pearson")

##                   ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## ApplicantIncome         3435005.1        -702662.55  39323.796  2732342.55
## CoapplicantIncome       -702662.6        2019826.68  18225.520  1317164.13
## LoanAmount                39323.8          18225.52   1990.793    57549.32
## TotalIncome             2732342.5        1317164.13  57549.316  4049506.68

# Correlation matrix of all numeric columns (Pearson, the default method)
cor(num_new, method = "pearson")

##                   ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## ApplicantIncome         1.0000000        -0.2667633  0.4755308   0.7326059
## CoapplicantIncome      -0.2667633         1.0000000  0.2874152   0.4605553
## LoanAmount              0.4755308         0.2874152  1.0000000   0.6409528
## TotalIncome             0.7326059         0.4605553  0.6409528   1.0000000

EDA dengan cara Malas

library(funModeling) 
library(tidyverse) 
library(Hmisc)
library(skimr)

# Run a quick exploratory pass over a data frame.
#
# data: the data frame to explore.
# Returns the value of describe(data), which is the last expression.
#
# skim() and profiling_num() are wrapped in print(): a value computed in the
# middle of a function body is not auto-printed, so without print() their
# tables would be computed and then silently thrown away.
basic_eda <- function(data) {
  glimpse(data)
  print(skim(data))
  df_status(data)
  freq(data)
  print(profiling_num(data))
  plot_num(data)
  describe(data)
}
# The loan identifier is dropped before the exploration.
eda <- no_outlier_loan %>% select(-Loan_ID)
basic_eda(eda)

## Rows: 535
## Columns: 13
## $ Gender            <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Mal~
## $ Married           <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes"~
## $ Dependents        <chr> "0", "1", "0", "0", "0", "0", "3+", "2", "2", "2", "~
## $ Education         <chr> "Graduate", "Graduate", "Graduate", "Not Graduate", ~
## $ Self_Employed     <chr> "No", "No", "Yes", "No", "No", "No", "No", "No", "No~
## $ ApplicantIncome   <int> 5849, 4583, 3000, 2583, 6000, 2333, 3036, 4006, 3200~
## $ CoapplicantIncome <dbl> 0, 1508, 0, 2358, 0, 1516, 2504, 1526, 700, 1840, 28~
## $ LoanAmount        <dbl> 146.4122, 128.0000, 66.0000, 120.0000, 141.0000, 95.~
## $ Loan_Amount_Term  <chr> "360", "360", "360", "360", "360", "360", "360", "36~
## $ Credit_History    <chr> "1", "1", "1", "1", "1", "1", "0", "1", "1", "1", "1~
## $ Property_Area     <chr> "Urban", "Rural", "Urban", "Urban", "Urban", "Urban"~
## $ Loan_Status       <chr> "Y", "N", "Y", "Y", "Y", "Y", "N", "Y", "Y", "Y", "N~
## $ TotalIncome       <dbl> 5849, 6091, 3000, 4941, 6000, 3849, 5540, 5532, 3900~
##             variable q_zeros p_zeros q_na p_na q_inf p_inf      type unique
## 1             Gender       0    0.00    0    0     0     0 character      2
## 2            Married       0    0.00    0    0     0     0 character      2
## 3         Dependents     321   60.00    0    0     0     0 character      4
## 4          Education       0    0.00    0    0     0     0 character      2
## 5      Self_Employed       0    0.00    0    0     0     0 character      2
## 6    ApplicantIncome       0    0.00    0    0     0     0   integer    438
## 7  CoapplicantIncome     229   42.80    0    0     0     0   numeric    259
## 8         LoanAmount       0    0.00    0    0     0     0   numeric    161
## 9   Loan_Amount_Term       0    0.00    0    0     0     0 character     10
## 10    Credit_History      78   14.58    0    0     0     0 character      2
## 11     Property_Area       0    0.00    0    0     0     0 character      3
## 12       Loan_Status       0    0.00    0    0     0     0 character      2
## 13       TotalIncome       0    0.00    0    0     0     0   numeric    480

##   Gender frequency percentage cumulative_perc
## 1   Male       435      81.31           81.31
## 2 Female       100      18.69          100.00

##   Married frequency percentage cumulative_perc
## 1     Yes       348      65.05           65.05
## 2      No       187      34.95          100.00

##   Dependents frequency percentage cumulative_perc
## 1          0       321      60.00           60.00
## 2          2        90      16.82           76.82
## 3          1        84      15.70           92.52
## 4         3+        40       7.48          100.00

##      Education frequency percentage cumulative_perc
## 1     Graduate       404      75.51           75.51
## 2 Not Graduate       131      24.49          100.00

##   Self_Employed frequency percentage cumulative_perc
## 1            No       474       88.6            88.6
## 2           Yes        61       11.4           100.0

##    Loan_Amount_Term frequency percentage cumulative_perc
## 1               360       459      85.79           85.79
## 2               180        36       6.73           92.52
## 3               480        14       2.62           95.14
## 4               300        10       1.87           97.01
## 5               240         4       0.75           97.76
## 6                84         4       0.75           98.51
## 7               120         3       0.56           99.07
## 8                36         2       0.37           99.44
## 9                60         2       0.37           99.81
## 10               12         1       0.19          100.00

##   Credit_History frequency percentage cumulative_perc
## 1              1       457      85.42           85.42
## 2              0        78      14.58          100.00

##   Property_Area frequency percentage cumulative_perc
## 1     Semiurban       205      38.32           38.32
## 2         Urban       170      31.78           70.10
## 3         Rural       160      29.91          100.00

##   Loan_Status frequency percentage cumulative_perc
## 1           Y       372      69.53           69.53
## 2           N       163      30.47          100.00

## data 
## 
##  13  Variables      535  Observations
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      535        0        2 
##                         
## Value      Female   Male
## Frequency     100    435
## Proportion  0.187  0.813
## --------------------------------------------------------------------------------
## Married 
##        n  missing distinct 
##      535        0        2 
##                     
## Value        No  Yes
## Frequency   187  348
## Proportion 0.35 0.65
## --------------------------------------------------------------------------------
## Dependents 
##        n  missing distinct 
##      535        0        4 
##                                   
## Value          0     1     2    3+
## Frequency    321    84    90    40
## Proportion 0.600 0.157 0.168 0.075
## --------------------------------------------------------------------------------
## Education 
##        n  missing distinct 
##      535        0        2 
##                                     
## Value          Graduate Not Graduate
## Frequency           404          131
## Proportion        0.755        0.245
## --------------------------------------------------------------------------------
## Self_Employed 
##        n  missing distinct 
##      535        0        2 
##                       
## Value         No   Yes
## Frequency    474    61
## Proportion 0.886 0.114
## --------------------------------------------------------------------------------
## ApplicantIncome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      535        0      438        1     4054     1986     1878     2200 
##      .25      .50      .75      .90      .95 
##     2752     3598     4891     6467     8022 
## 
## lowest :   150   210   645   674  1000, highest:  9833  9963 10000 10047 10139
## --------------------------------------------------------------------------------
## CoapplicantIncome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      535        0      259    0.922     1323     1528        0        0 
##      .25      .50      .75      .90      .95 
##        0     1260     2194     3258     4130 
## 
## lowest :    0.00   16.12  189.00  240.00  242.00
## highest: 5500.00 5624.00 5625.00 5654.00 5701.00
## --------------------------------------------------------------------------------
## LoanAmount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      535        0      161        1      127    49.51     55.7     71.0 
##      .25      .50      .75      .90      .95 
##    100.0    124.0    151.5    185.6    205.6 
## 
## lowest :   9  17  25  26  30, highest: 253 255 258 259 260
## --------------------------------------------------------------------------------
## Loan_Amount_Term 
##        n  missing distinct 
##      535        0       10 
## 
## lowest : 12  120 180 240 300, highest: 36  360 480 60  84 
##                                                                       
## Value         12   120   180   240   300    36   360   480    60    84
## Frequency      1     3    36     4    10     2   459    14     2     4
## Proportion 0.002 0.006 0.067 0.007 0.019 0.004 0.858 0.026 0.004 0.007
## --------------------------------------------------------------------------------
## Credit_History 
##        n  missing distinct 
##      535        0        2 
##                       
## Value          0     1
## Frequency     78   457
## Proportion 0.146 0.854
## --------------------------------------------------------------------------------
## Property_Area 
##        n  missing distinct 
##      535        0        3 
##                                         
## Value          Rural Semiurban     Urban
## Frequency        160       205       170
## Proportion     0.299     0.383     0.318
## --------------------------------------------------------------------------------
## Loan_Status 
##        n  missing distinct 
##      535        0        2 
##                       
## Value          N     Y
## Frequency    163   372
## Proportion 0.305 0.695
## --------------------------------------------------------------------------------
## TotalIncome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      535        0      480        1     5377     2203     2718     3139 
##      .25      .50      .75      .90      .95 
##     3900     5000     6411     8041     9517 
## 
## lowest :  1442  1830  1880  1950  1963, highest: 11500 11666 11904 12083 13746
## --------------------------------------------------------------------------------

Tugas 4

Lakukan pemeriksaan distribusi densitas menggunakan R dan Python pada setiap variabel kuantitatif dengan beberapa bagian sebagai berikut:

Univariat numerik

library(ks)
library(MASS)

# Kernel density estimate of every numeric column.
# Columns are selected by name rather than by position, so each estimate is
# tied to the variable named on its axis even if the column order changes.
fhat <- kde(x = num_new$LoanAmount)
plot_1 <- plot(fhat, cont = 50, col.cont = 4, cont.lwd = 2, xlab = "Loan Amount")

fhat1 <- kde(x = num_new$CoapplicantIncome)
plot_2 <- plot(fhat1, cont = 50, col.cont = 4, cont.lwd = 2, xlab = "Coapplicant Income")

# The fourth numeric column is TotalIncome (applicant + co-applicant income);
# the axis label previously read "AppCoapplicant Income".
fhat2 <- kde(x = num_new$TotalIncome)
plot_3 <- plot(fhat2, cont = 50, col.cont = 4, cont.lwd = 2, xlab = "Total Income")

fhat3 <- kde(x = num_new$ApplicantIncome)
plot_4 <- plot(fhat3, cont = 50, col.cont = 4, cont.lwd = 2, xlab = "Applicant Income")

Bivariat numerik

# Joint density of loan amount and total income (columns 3 and 4 of num_new)
fhat4 <- kde(x = num_new[, c("LoanAmount", "TotalIncome")])
plot(fhat4, display = "filled.contour", cont = seq(10, 90, by = 10), lwd = 1)

plot(fhat4, display = "persp", border = 1)

Multivariat numerik

# Joint density of the first three numeric columns of num_new
fhat5 <- kde(x = num_new[, c("ApplicantIncome", "CoapplicantIncome", "LoanAmount")])
plot(fhat5)

Tugas 5

Lakukan proses pengujian Hipotesis menggunakan R dan Python pada setiap variabel kuantitatif dengan beberapa bagian sebagai berikut:

Hitunglah margin of error dan estimasi interval untuk proporsi peminjam berjenis kelamin perempuan pada tingkat kepercayaan 95%.

k    <- sum(no_outlier_loan$Gender == "Female")        # number of female borrowers
n    <- nrow(no_outlier_loan)                          # total number of borrowers
pbar <- k / n                                          # sample proportion of female borrowers
SE   <- sqrt(pbar * (1 - pbar) / n); SE                # standard error

## [1] 0.01685443

E <- qnorm(.975) * SE; E                               # margin of error

## [1] 0.03303407

pbar + c(-E, E)                                        # the CI of sample proportion

## [1] 0.1538818 0.2199500

Jika Anda berencana menggunakan perkiraan proporsi 50% data konsumen berjenis kelamin perempuan, temukan ukuran sampel yang diperlukan untuk mencapai margin kesalahan 5% untuk data observasi pada tingkat kepercayaan 95%.

zstar <- qnorm(.975)           # normal quantile for a 95% confidence level
p <- 0.5                       # planned proportion estimate (50%)
E <- 0.05                      # target margin of error
zstar^2 * p * (1 - p) / E^2    # required sample size

## [1] 384.1459

Lakukan pembuktian kebenaran asumsi dengan tingkat signifikansi 0.05, jika Bank mengklaim bahwa pinjaman rata-rata konsumen adalah:

# Fix the random seed so the same 30 rows are drawn on every run.
set.seed(100)
sampel <- no_outlier_loan %>% sample_n(30)
sampel

Lebih besar $ 150.

\[H_0: \mu = \$150\] \[H_1: \mu > \$150\]

mu0  <- 150                                            # hypothesized value
xbar <- mean(sampel$LoanAmount)                        # sample mean
s    <- sd(sampel$LoanAmount)                          # sample standard deviation
n    <- nrow(sampel)                                   # sample size
t    <- (xbar - mu0) / (s / sqrt(n)); t                # test statistic

## [1] -2.520249

alpha   <- .05                                         # significance level of the right-tailed test
t.alpha <- qt(1 - alpha, df = n - 1)                   # right tail critical value
t.alpha

## [1] 1.699127

Karena t hitung kurang dari t tabel, maka terima $H_0$. Artinya klaim pinjaman rata-rata Bank tidak lebih besar dari $\$ 150$.

Lebih kecil $150

\[H_0: \mu = \$150\] \[H_1: \mu < \$150\]

mu0  <- 150                                          # hypothesized value
xbar <- mean(sampel$LoanAmount)                      # sample mean
s    <- sd(sampel$LoanAmount)                        # sample standard deviation
n    <- nrow(sampel)                                 # sample size
t    <- (xbar - mu0) / (s / sqrt(n)); t              # test statistic

## [1] -2.520249

alpha   <- .05                                       # significance level of the left-tailed test
t.alpha <- qt(1 - alpha, df = n - 1)                 # right tail critical value
-t.alpha                                             # left tail critical value (by symmetry)

## [1] -1.699127

Karena t hitung kurang dari t tabel, maka tolak $H_0$. Artinya klaim pinjaman rata-rata Bank kurang dari $\$ 150$.

Sama dengan $ 150.

\[H_0: \mu = \$150\] \[H_1: \mu \neq \$150\]

mu0  <- 150                                          # hypothesized value
xbar <- mean(sampel$LoanAmount)                      # sample mean
s    <- sd(sampel$LoanAmount)                        # sample standard deviation
n    <- nrow(sampel)                                 # sample size
t    <- (xbar - mu0) / (s / sqrt(n)); t              # test statistic

## [1] -2.520249

alpha   <- .05                                       # significance level of the two-tailed test
# A two-tailed test splits alpha between both tails, so the critical value is
# the 1 - alpha/2 quantile (not 1 - alpha, which is the one-tailed value).
t.alpha <- qt(1 - alpha / 2, df = n - 1)             # upper critical value
t.alpha

## [1] 2.04523

-t.alpha

## [1] -2.04523

Karena t hitung tidak berada di antara t tabel, maka tolak $H_0$. Artinya klaim pinjaman rata-rata Bank tidak sama dengan $\$ 150$.

Lakukan pembuktian kebenaran asumsi dengan tingkat signifikansi 0.05, seperti di atas, jika diketahui simpangan baku pinjaman adalah $ 85.