Komputasi Statistika
Ujian Tengah Semester
| Kontak | : \(\downarrow\) |
| putri.angelina@matanauniversity.ac.id | |
| GitHub | https://github.com/putriangelinaw/ |
| RPubs | https://rpubs.com/putriangelinaw/ |
Data Set
Kumpulan data yang digunakan adalah data konsumen yang melakukan pinjaman di suatu Bank. Dataset ini memiliki 614 observasi, 13 atribut sebagai berikut:
Tugas 1
Lakukan proses persiapan data dengan R dan Python, dengan beberapa langkah berikut:
Import Data
loan_data <- read.csv("loan_data.csv")
loan_dataPenanganan Data Hilang
library(dplyr)
loan_data <- loan_data %>%
mutate(Credit_History = as.character(Credit_History)) %>%
mutate(Loan_Amount_Term = as.character(Loan_Amount_Term))
## Periksa berapa banyak data yang hilang
list_nan <- unique(names(loan_data)[col(loan_data)[which(loan_data == "")]])
list_nan## [1] "Gender" "Married" "Dependents" "Self_Employed"
loan_dataNA <- loan_data %>%
mutate(Gender = replace(Gender, Gender == "", NA)) %>%
mutate(Married = replace(Married, Married == "", NA)) %>%
mutate(Dependents = replace(Dependents, Dependents == "", NA)) %>%
mutate(Self_Employed = replace(Self_Employed, Self_Employed == "", NA))
list_na <- colnames(loan_data)[ apply(loan_data, 2, anyNA) ]
list_na## [1] "LoanAmount" "Loan_Amount_Term" "Credit_History"
# Return the most frequent value(s) of `x`.
#
# x: an atomic vector (any type that unique()/match() can handle).
# Returns every value whose frequency equals the maximum frequency, in order
# of first appearance, so a tie yields more than one value.
modes <- function(x) {
  distinct_vals <- unique(x)
  # match() maps each element to its slot in `distinct_vals`;
  # tabulate() then counts how many elements landed in each slot.
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which(counts == max(counts))]
}
# replace data numerik dengan rata-ratanya dan data kategorikal dengan modusnya
# Impute missing values: the numeric column gets its mean, categorical columns
# get their mode. NA is removed before every mode computation so NA can never
# be picked as "the mode", and `[1]` keeps a single fill value when several
# values tie for most frequent (a length > 1 fill would be recycled
# position-wise by ifelse()/replace()).
clean.loan <- loan_dataNA %>%
  mutate(
    LoanAmount = ifelse(is.na(LoanAmount),
                        mean(LoanAmount, na.rm = TRUE),
                        LoanAmount),
    Loan_Amount_Term = as.character(ifelse(is.na(Loan_Amount_Term),
                                           modes(na.omit(Loan_Amount_Term))[1],
                                           Loan_Amount_Term)),
    Credit_History = as.character(ifelse(is.na(Credit_History),
                                         modes(na.omit(Credit_History))[1],
                                         Credit_History)),
    Gender = replace(Gender, is.na(Gender), modes(na.omit(Gender))[1]),
    Married = replace(Married, is.na(Married), modes(na.omit(Married))[1]),
    Dependents = replace(Dependents, is.na(Dependents),
                         modes(na.omit(Dependents))[1]),
    Self_Employed = replace(Self_Employed, is.na(Self_Employed),
                            modes(na.omit(Self_Employed))[1])
  )
# Periksa kembali data yang hilang
sum(is.na(clean.loan))## [1] 0
clean.loanPeriksa Data Duplikat
library(dplyr)
check.duplicate <- data.frame(
row_of_data = clean.loan %>% nrow (),
row_of_unique.data = clean.loan %>% distinct() %>% nrow())
check.duplicatePemisahan Data Kategori dan Numerik
Data Kategori
category.loan <- select_if(clean.loan, is.character)
category.loanData Numerik
numeric.loan <- select_if(clean.loan, is.numeric)
numeric.loanPenanganan Data Numerik
Standarisasi
# Standarisasi
standardized <- as.data.frame(lapply(numeric.loan,scale))
standardizedNormalisasi
# Normalisasi
# Min-max scale a numeric vector onto [0, 1].
#
# x: numeric vector.
# Returns a numeric vector the same length as `x`: (x - min) / (max - min).
# A constant vector (max == min) is mapped to all zeros instead of the NaN
# that 0 / 0 would produce. NA handling is unchanged: min()/max() are called
# without na.rm, so any NA in `x` still makes the whole result NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # Zero range: every value equals the minimum, so the scaled value is 0.
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}
normalized <- as.data.frame(lapply(numeric.loan,normalize))
normalizedRobust Scalar
# Robust Scalar
# Robust scaling: centre on the median and divide by the interquartile range
# (75th percentile minus 25th percentile).
#
# x: numeric vector without NA (quantile() rejects NA by default).
# Returns a numeric vector the same length as `x`.
robust <- function(x){
  quartiles <- quantile(x, probs = c(.25, .75))
  centred <- x - median(x)
  centred / (quartiles[2] - quartiles[1])
}
robust_data <- as.data.frame(lapply(numeric.loan,robust))
robust_dataPenanganan Data Pencilan
# Periksa berapa banyak outlier
# Flag values lying more than 1.5 * IQR below the first quartile or above the
# third quartile.
#
# x: numeric vector without NA (quantile() rejects NA by default).
# Returns a logical vector the same length as `x`; TRUE marks an outlier.
outliers <- function(x) {
  quartiles <- quantile(x, probs = c(.25, .75))
  fence_width <- 1.5 * IQR(x)
  too_low <- x < quartiles[1] - fence_width
  too_high <- x > quartiles[2] + fence_width
  too_low | too_high
}
dfap <- subset(numeric.loan, outliers(numeric.loan$ApplicantIncome))
dfco <- subset(numeric.loan, outliers(numeric.loan$CoapplicantIncome))
dfla <- subset(numeric.loan, outliers(numeric.loan$LoanAmount))
outlier.loan <- rbind(dfap,dfco,dfla)%>% distinct()
nrow(outlier.loan)## [1] 79
# Remove outlier
no_outlier_loan <- clean.loan %>% anti_join(outlier.loan)
no_outlier_loanPenanganan Data Kategorikal
# Number of distinct values in every column.
no_outlier_loan %>% summarise_all(n_distinct)

# Label-encode a categorical vector as a factor with labels 0, 1, ..., k - 1.
# Labels follow sorted level order (the same order factor() uses by default)
# and k is derived from the data, so a column with more or fewer categories
# than expected no longer raises an "invalid 'labels'" error.
encode_labels <- function(x) {
  lv <- sort(unique(x))
  factor(x, levels = lv, labels = seq_along(lv) - 1L)
}

GenderLabel <- encode_labels(no_outlier_loan$Gender)
MarriedLabel <- encode_labels(no_outlier_loan$Married)
DependentsLabel <- encode_labels(no_outlier_loan$Dependents)
EducationLabel <- encode_labels(no_outlier_loan$Education)
Self_EmployedLabel <- encode_labels(no_outlier_loan$Self_Employed)
# Loan_Amount_Term is stored as character, so its levels sort lexically
# ("12", "120", "180", ...), not numerically.
Loan_Amount_TermLabel <- encode_labels(no_outlier_loan$Loan_Amount_Term)
Credit_HistoryLabel <- encode_labels(no_outlier_loan$Credit_History)
Property_AreaLabel <- encode_labels(no_outlier_loan$Property_Area)
Loan_StatusLabel <- encode_labels(no_outlier_loan$Loan_Status)

# Assemble the encoded columns next to the loan identifier.
category.labeled <- data.frame(no_outlier_loan$Loan_ID,
                               GenderLabel,
                               MarriedLabel,
                               DependentsLabel,
                               EducationLabel,
                               Self_EmployedLabel,
                               Loan_Amount_TermLabel,
                               Credit_HistoryLabel,
                               Property_AreaLabel,
                               Loan_StatusLabel)
category.labeledTugas 2
Lakukan Proses Visualisasi Data dengan menggunakan R dan Python dengan beberapa langkah berikut:
Visualisasi Univariabel
Data Kategorikal
library(ggplot2)
library(patchwork)
plotG <- ggplot(no_outlier_loan, aes(x = Gender)) + geom_bar()
plotM <- ggplot(no_outlier_loan, aes(x = Married)) + geom_bar()
plotD <- ggplot(no_outlier_loan, aes(x = Dependents)) + geom_bar()
plotE <- ggplot(no_outlier_loan, aes(x = Education)) + geom_bar()
plotSE <- ggplot(no_outlier_loan, aes(x = Self_Employed)) + geom_bar()
plotLAT <- ggplot(no_outlier_loan, aes(x = Loan_Amount_Term)) + geom_bar()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5))
plotCH <- ggplot(no_outlier_loan, aes(x = Credit_History)) + geom_bar()
plotPA <- ggplot(no_outlier_loan, aes(x = Property_Area)) + geom_bar()
plotLS <- ggplot(no_outlier_loan, aes(x = Loan_Status)) + geom_bar()
plotG +
plotM +
plotD +
plotE +
plotSE +
plotLAT +
plotCH +
plotPA +
plotLS +
plot_layout(ncol = 3)Data Numerikal
plotAI <- ggplot(no_outlier_loan, aes(x = ApplicantIncome))+
geom_histogram(bins = 12, fill = "red", colour = "white")
plotCI <- ggplot(no_outlier_loan, aes(x = CoapplicantIncome))+
geom_histogram(bins = 12, fill = "blue", colour = "white")
plotLA <- ggplot(no_outlier_loan, aes(x = LoanAmount))+
geom_histogram(bins = 12, fill = "violet", colour = "white")
plotAI +
plotCI +
plotLA +
plot_layout(nrow = 3)Visualisasi Bivariabel
Data Kategorikal vs Kategorikal
no_outlier_loan$TotalIncome <- no_outlier_loan$ApplicantIncome + no_outlier_loan$CoapplicantIncome
plotG_M <- ggplot(no_outlier_loan, aes(x = Gender, fill = Married)) +
theme_get() +
geom_bar(position = position_dodge(preserve = "single"))
plotM_E <- ggplot(no_outlier_loan, aes(x = Married, fill = Education)) +
theme_get() +
geom_bar(position = position_dodge(preserve = "single"))
PlotE_PA <- ggplot(no_outlier_loan, aes(x = Education, fill = Property_Area)) +
theme_get() +
geom_bar(position = position_dodge(preserve = "single"))
plotG_M +
plotM_E +
PlotE_PA +
plot_layout(ncol = 2)Data Numerikal vs Numerikal
# Numeric vs numeric: each row is an independent borrower, so the pairs are
# drawn as points. A line geometry would join observations in x order and
# suggest a sequence that does not exist in the data.
plotLA_CI <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = CoapplicantIncome)) +
  geom_point()
plotLA_AI <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = ApplicantIncome)) +
  geom_point()
plotAI_CI <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = CoapplicantIncome)) +
  geom_point()
plotLA_CI +
plotLA_AI +
plotAI_CI +
plot_layout(ncol = 2)Data Kategorikal vs Numerikal
# Density of LoanAmount, one curve per Loan_Status value.
# The title now names the variable actually mapped to `fill`.
plotLA_LS <- ggplot(no_outlier_loan,
                    aes(x = LoanAmount,
                        fill = Loan_Status)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Loan Amount distribution by Loan Status")

# Density of CoapplicantIncome, one curve per Married value.
plotCI_M <- ggplot(no_outlier_loan,
                   aes(x = CoapplicantIncome,
                       fill = Married)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Coapplicant Income distribution by Married")

# Density of ApplicantIncome, one curve per Education value.
plotAI_E <- ggplot(no_outlier_loan,
                   aes(x = ApplicantIncome,
                       fill = Education)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Applicant Income distribution by Education")

# Density of ApplicantIncome, one curve per Property_Area value.
# NOTE(review): the object name (LA_PA) suggests LoanAmount was intended, but
# both the aesthetic and the title use ApplicantIncome -- confirm which is meant.
plotLA_PA <- ggplot(no_outlier_loan,
                    aes(x = ApplicantIncome,
                        fill = Property_Area)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Applicant Income distribution by Property Area")
plotLA_LS +
plotCI_M +
plotAI_E +
plotLA_PA +
plot_layout(ncol = 2)Visualisasi Multivariabel
# Loan_Amount_Term has more categories than ggplot2's default shape palette
# supports (6); points in the surplus categories would be dropped with a
# warning. Supply one explicit shape code (0, 1, 2, ...) per category instead.
lat_shapes <- seq_along(unique(no_outlier_loan$Loan_Amount_Term)) - 1L

# Two numeric variables on the axes, with two categorical variables encoded
# as point shape and colour.
plotAI_LA_LAT <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = LoanAmount,
                                             shape = Loan_Amount_Term, colour = Property_Area)) +
  geom_point() +
  scale_shape_manual(values = lat_shapes)
plotLA_CI_LAT <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = CoapplicantIncome,
                                             shape = Loan_Amount_Term, colour = Property_Area)) +
  geom_point() +
  scale_shape_manual(values = lat_shapes)
plotAI_CI_E <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = CoapplicantIncome,
                                           shape = Education, colour = Property_Area)) +
  geom_point()
plotAI_LA_E <- ggplot(no_outlier_loan, aes(x = ApplicantIncome, y = LoanAmount,
                                           shape = Education, colour = Property_Area)) +
  geom_point()
plotLA_CI_E <- ggplot(no_outlier_loan, aes(x = LoanAmount, y = CoapplicantIncome,
                                           shape = Education, colour = Property_Area)) +
  geom_point()
plotAI_LA_LATplotLA_CI_LATplotAI_CI_EplotAI_LA_EplotLA_CI_ETugas 3
Lakukan proses analisis data secara deskriptif menggunakan R dan Python dengan beberapa langkah berikut:
Kualitatif
Kategori Univariat
# Proportion
prop.table(table(no_outlier_loan$Gender))##
## Female Male
## 0.1869159 0.8130841
prop.table(table(no_outlier_loan$Married))##
## No Yes
## 0.3495327 0.6504673
prop.table(table(no_outlier_loan$Dependents))##
## 0 1 2 3+
## 0.60000000 0.15700935 0.16822430 0.07476636
prop.table(table(no_outlier_loan$Education))##
## Graduate Not Graduate
## 0.7551402 0.2448598
prop.table(table(no_outlier_loan$Self_Employed))##
## No Yes
## 0.8859813 0.1140187
prop.table(table(no_outlier_loan$Loan_Amount_Term))##
## 12 120 180 240 300 36
## 0.001869159 0.005607477 0.067289720 0.007476636 0.018691589 0.003738318
## 360 480 60 84
## 0.857943925 0.026168224 0.003738318 0.007476636
prop.table(table(no_outlier_loan$Credit_History))##
## 0 1
## 0.1457944 0.8542056
prop.table(table(no_outlier_loan$Property_Area))##
## Rural Semiurban Urban
## 0.2990654 0.3831776 0.3177570
prop.table(table(no_outlier_loan$Loan_Status))##
## N Y
## 0.3046729 0.6953271
Kategori Bivariat
no_outlier_loan %>% select(Gender, Married) %>% table() ## Married
## Gender No Yes
## Female 72 28
## Male 115 320
no_outlier_loan %>% select(Gender, Education) %>% table() ## Education
## Gender Graduate Not Graduate
## Female 82 18
## Male 322 113
no_outlier_loan %>% select(Gender, Property_Area) %>% table() ## Property_Area
## Gender Rural Semiurban Urban
## Female 24 49 27
## Male 136 156 143
no_outlier_loan %>% select(Education, Self_Employed) %>% table()## Self_Employed
## Education No Yes
## Graduate 358 46
## Not Graduate 116 15
no_outlier_loan %>% select(Gender, Loan_Amount_Term) %>% table() ## Loan_Amount_Term
## Gender 12 120 180 240 300 36 360 480 60 84
## Female 0 0 2 1 1 1 90 4 0 1
## Male 1 3 34 3 9 1 369 10 2 3
no_outlier_loan %>% select(Married, Loan_Amount_Term) %>% table() ## Loan_Amount_Term
## Married 12 120 180 240 300 36 360 480 60 84
## No 0 1 7 1 3 2 164 8 1 0
## Yes 1 2 29 3 7 0 295 6 1 4
Kategori Multivariat
no_outlier_loan %>% select(Gender, Married, Education) %>% ftable()## Education Graduate Not Graduate
## Gender Married
## Female No 60 12
## Yes 22 6
## Male No 84 31
## Yes 238 82
no_outlier_loan %>% select(Gender, Married, Education,Property_Area,Loan_Amount_Term) %>% ftable()## Loan_Amount_Term 12 120 180 240 300 36 360 480 60 84
## Gender Married Education Property_Area
## Female No Graduate Rural 0 0 0 0 0 0 13 2 0 0
## Semiurban 0 0 0 0 1 1 23 0 0 0
## Urban 0 0 1 0 0 0 18 1 0 0
## Not Graduate Rural 0 0 0 0 0 0 4 0 0 0
## Semiurban 0 0 0 0 0 0 5 0 0 0
## Urban 0 0 0 0 0 0 3 0 0 0
## Yes Graduate Rural 0 0 0 0 0 0 3 0 0 0
## Semiurban 0 0 1 1 0 0 11 1 0 1
## Urban 0 0 0 0 0 0 4 0 0 0
## Not Graduate Rural 0 0 0 0 0 0 2 0 0 0
## Semiurban 0 0 0 0 0 0 4 0 0 0
## Urban 0 0 0 0 0 0 0 0 0 0
## Male No Graduate Rural 0 0 0 0 0 0 27 0 0 0
## Semiurban 0 0 1 0 1 0 23 2 0 0
## Urban 0 0 3 0 0 0 24 2 1 0
## Not Graduate Rural 0 0 0 0 1 0 12 0 0 0
## Semiurban 0 1 0 0 0 1 9 0 0 0
## Urban 0 0 2 1 0 0 3 1 0 0
## Yes Graduate Rural 0 0 7 0 1 0 57 0 0 2
## Semiurban 0 1 3 1 3 0 84 2 0 0
## Urban 1 1 5 1 1 0 67 0 0 1
## Not Graduate Rural 0 0 3 0 0 0 26 0 0 0
## Semiurban 0 0 2 0 1 0 19 2 0 0
## Urban 0 0 8 0 1 0 18 1 1 0
Kuantitatif
Univariat numerik
library(e1071)
# Summary statistics
num_new <- select_if(no_outlier_loan, is.numeric)
summary(num_new)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## Min. : 150 Min. : 0 Min. : 9.0 Min. : 1442
## 1st Qu.: 2752 1st Qu.: 0 1st Qu.:100.0 1st Qu.: 3900
## Median : 3598 Median :1260 Median :124.0 Median : 5000
## Mean : 4054 Mean :1323 Mean :127.0 Mean : 5377
## 3rd Qu.: 4891 3rd Qu.:2194 3rd Qu.:151.5 3rd Qu.: 6411
## Max. :10139 Max. :5701 Max. :260.0 Max. :13746
# Variance
sapply(num_new, var)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## 3435005.100 2019826.684 1990.793 4049506.682
# Standard Deviation
sapply(num_new, sd)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## 1853.37668 1421.20607 44.61831 2012.33861
# Median Absolute Deviation
sapply(num_new, mad)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## 1504.8390 1868.0760 38.5476 1813.2198
# Inter Quantile Range
sapply(num_new, IQR)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## 2138.5 2194.0 51.5 2511.0
# Skewness
sapply(num_new, skewness)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## 1.1358372 0.8411675 0.3989309 0.9268307
# Kurtosis
sapply(num_new, kurtosis)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## 1.19274525 0.01943741 0.47240548 0.94869136
Bivariat numerik
# Covariance
cov(no_outlier_loan$ApplicantIncome,no_outlier_loan$CoapplicantIncome)## [1] -702662.6
cov(no_outlier_loan$CoapplicantIncome,no_outlier_loan$LoanAmount)## [1] 18225.52
cov(no_outlier_loan$LoanAmount,no_outlier_loan$ApplicantIncome)## [1] 39323.8
# Correlation
cor(no_outlier_loan$ApplicantIncome,no_outlier_loan$CoapplicantIncome)## [1] -0.2667633
cor(no_outlier_loan$CoapplicantIncome,no_outlier_loan$LoanAmount)## [1] 0.2874152
cor(no_outlier_loan$LoanAmount,no_outlier_loan$ApplicantIncome)## [1] 0.4755308
Multivariat numerik
# Covariance
cov(num_new)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## ApplicantIncome 3435005.1 -702662.55 39323.796 2732342.55
## CoapplicantIncome -702662.6 2019826.68 18225.520 1317164.13
## LoanAmount 39323.8 18225.52 1990.793 57549.32
## TotalIncome 2732342.5 1317164.13 57549.316 4049506.68
# Correlation
cor(num_new)## ApplicantIncome CoapplicantIncome LoanAmount TotalIncome
## ApplicantIncome 1.0000000 -0.2667633 0.4755308 0.7326059
## CoapplicantIncome -0.2667633 1.0000000 0.2874152 0.4605553
## LoanAmount 0.4755308 0.2874152 1.0000000 0.6409528
## TotalIncome 0.7326059 0.4605553 0.6409528 1.0000000
EDA dengan cara Malas
library(funModeling)
library(tidyverse)
library(Hmisc)
library(skimr)
# Run a quick battery of exploratory summaries on a data frame.
#
# data: a data frame.
# Calls, in order: glimpse(), skim(), df_status(), freq(), profiling_num(),
# plot_num() and describe(). The value of describe(data) is returned (and is
# therefore auto-printed when the function is called at top level).
basic_eda <- function(data) {
  glimpse(data)
  # Values are not auto-printed inside a function body, so the skim() and
  # profiling_num() results must be printed explicitly or they are discarded.
  print(skim(data))
  df_status(data)
  freq(data)
  print(profiling_num(data))
  plot_num(data)
  describe(data)
}
eda <- no_outlier_loan %>% select(-Loan_ID)
basic_eda(eda)## Rows: 535
## Columns: 13
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Mal~
## $ Married <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes"~
## $ Dependents <chr> "0", "1", "0", "0", "0", "0", "3+", "2", "2", "2", "~
## $ Education <chr> "Graduate", "Graduate", "Graduate", "Not Graduate", ~
## $ Self_Employed <chr> "No", "No", "Yes", "No", "No", "No", "No", "No", "No~
## $ ApplicantIncome <int> 5849, 4583, 3000, 2583, 6000, 2333, 3036, 4006, 3200~
## $ CoapplicantIncome <dbl> 0, 1508, 0, 2358, 0, 1516, 2504, 1526, 700, 1840, 28~
## $ LoanAmount <dbl> 146.4122, 128.0000, 66.0000, 120.0000, 141.0000, 95.~
## $ Loan_Amount_Term <chr> "360", "360", "360", "360", "360", "360", "360", "36~
## $ Credit_History <chr> "1", "1", "1", "1", "1", "1", "0", "1", "1", "1", "1~
## $ Property_Area <chr> "Urban", "Rural", "Urban", "Urban", "Urban", "Urban"~
## $ Loan_Status <chr> "Y", "N", "Y", "Y", "Y", "Y", "N", "Y", "Y", "Y", "N~
## $ TotalIncome <dbl> 5849, 6091, 3000, 4941, 6000, 3849, 5540, 5532, 3900~
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 Gender 0 0.00 0 0 0 0 character 2
## 2 Married 0 0.00 0 0 0 0 character 2
## 3 Dependents 321 60.00 0 0 0 0 character 4
## 4 Education 0 0.00 0 0 0 0 character 2
## 5 Self_Employed 0 0.00 0 0 0 0 character 2
## 6 ApplicantIncome 0 0.00 0 0 0 0 integer 438
## 7 CoapplicantIncome 229 42.80 0 0 0 0 numeric 259
## 8 LoanAmount 0 0.00 0 0 0 0 numeric 161
## 9 Loan_Amount_Term 0 0.00 0 0 0 0 character 10
## 10 Credit_History 78 14.58 0 0 0 0 character 2
## 11 Property_Area 0 0.00 0 0 0 0 character 3
## 12 Loan_Status 0 0.00 0 0 0 0 character 2
## 13 TotalIncome 0 0.00 0 0 0 0 numeric 480
## Gender frequency percentage cumulative_perc
## 1 Male 435 81.31 81.31
## 2 Female 100 18.69 100.00
## Married frequency percentage cumulative_perc
## 1 Yes 348 65.05 65.05
## 2 No 187 34.95 100.00
## Dependents frequency percentage cumulative_perc
## 1 0 321 60.00 60.00
## 2 2 90 16.82 76.82
## 3 1 84 15.70 92.52
## 4 3+ 40 7.48 100.00
## Education frequency percentage cumulative_perc
## 1 Graduate 404 75.51 75.51
## 2 Not Graduate 131 24.49 100.00
## Self_Employed frequency percentage cumulative_perc
## 1 No 474 88.6 88.6
## 2 Yes 61 11.4 100.0
## Loan_Amount_Term frequency percentage cumulative_perc
## 1 360 459 85.79 85.79
## 2 180 36 6.73 92.52
## 3 480 14 2.62 95.14
## 4 300 10 1.87 97.01
## 5 240 4 0.75 97.76
## 6 84 4 0.75 98.51
## 7 120 3 0.56 99.07
## 8 36 2 0.37 99.44
## 9 60 2 0.37 99.81
## 10 12 1 0.19 100.00
## Credit_History frequency percentage cumulative_perc
## 1 1 457 85.42 85.42
## 2 0 78 14.58 100.00
## Property_Area frequency percentage cumulative_perc
## 1 Semiurban 205 38.32 38.32
## 2 Urban 170 31.78 70.10
## 3 Rural 160 29.91 100.00
## Loan_Status frequency percentage cumulative_perc
## 1 Y 372 69.53 69.53
## 2 N 163 30.47 100.00
## data
##
## 13 Variables 535 Observations
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 535 0 2
##
## Value Female Male
## Frequency 100 435
## Proportion 0.187 0.813
## --------------------------------------------------------------------------------
## Married
## n missing distinct
## 535 0 2
##
## Value No Yes
## Frequency 187 348
## Proportion 0.35 0.65
## --------------------------------------------------------------------------------
## Dependents
## n missing distinct
## 535 0 4
##
## Value 0 1 2 3+
## Frequency 321 84 90 40
## Proportion 0.600 0.157 0.168 0.075
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 535 0 2
##
## Value Graduate Not Graduate
## Frequency 404 131
## Proportion 0.755 0.245
## --------------------------------------------------------------------------------
## Self_Employed
## n missing distinct
## 535 0 2
##
## Value No Yes
## Frequency 474 61
## Proportion 0.886 0.114
## --------------------------------------------------------------------------------
## ApplicantIncome
## n missing distinct Info Mean Gmd .05 .10
## 535 0 438 1 4054 1986 1878 2200
## .25 .50 .75 .90 .95
## 2752 3598 4891 6467 8022
##
## lowest : 150 210 645 674 1000, highest: 9833 9963 10000 10047 10139
## --------------------------------------------------------------------------------
## CoapplicantIncome
## n missing distinct Info Mean Gmd .05 .10
## 535 0 259 0.922 1323 1528 0 0
## .25 .50 .75 .90 .95
## 0 1260 2194 3258 4130
##
## lowest : 0.00 16.12 189.00 240.00 242.00
## highest: 5500.00 5624.00 5625.00 5654.00 5701.00
## --------------------------------------------------------------------------------
## LoanAmount
## n missing distinct Info Mean Gmd .05 .10
## 535 0 161 1 127 49.51 55.7 71.0
## .25 .50 .75 .90 .95
## 100.0 124.0 151.5 185.6 205.6
##
## lowest : 9 17 25 26 30, highest: 253 255 258 259 260
## --------------------------------------------------------------------------------
## Loan_Amount_Term
## n missing distinct
## 535 0 10
##
## lowest : 12 120 180 240 300, highest: 36 360 480 60 84
##
## Value 12 120 180 240 300 36 360 480 60 84
## Frequency 1 3 36 4 10 2 459 14 2 4
## Proportion 0.002 0.006 0.067 0.007 0.019 0.004 0.858 0.026 0.004 0.007
## --------------------------------------------------------------------------------
## Credit_History
## n missing distinct
## 535 0 2
##
## Value 0 1
## Frequency 78 457
## Proportion 0.146 0.854
## --------------------------------------------------------------------------------
## Property_Area
## n missing distinct
## 535 0 3
##
## Value Rural Semiurban Urban
## Frequency 160 205 170
## Proportion 0.299 0.383 0.318
## --------------------------------------------------------------------------------
## Loan_Status
## n missing distinct
## 535 0 2
##
## Value N Y
## Frequency 163 372
## Proportion 0.305 0.695
## --------------------------------------------------------------------------------
## TotalIncome
## n missing distinct Info Mean Gmd .05 .10
## 535 0 480 1 5377 2203 2718 3139
## .25 .50 .75 .90 .95
## 3900 5000 6411 8041 9517
##
## lowest : 1442 1830 1880 1950 1963, highest: 11500 11666 11904 12083 13746
## --------------------------------------------------------------------------------
Tugas 4
Lakukan pemeriksaan distribusi densitas menggunakan R dan Python pada setiap variabel kuantitatif dengan beberapa bagian sebagai berikut:
Univariat numerik
library(ks)
library(MASS)
# Univariate kernel density estimates, one per numeric variable.
# Columns are selected by name rather than position so the estimates do not
# silently change if the column order of num_new changes.
fhat <- kde(x = num_new[, "LoanAmount"])
plot_1 <- plot(fhat, cont=50, col.cont=4, cont.lwd=2, xlab="Loan Amount")

fhat1 <- kde(x = num_new[, "CoapplicantIncome"])
plot_2 <- plot(fhat1, cont=50, col.cont=4, cont.lwd=2, xlab="Coapplicant Income")

# TotalIncome = ApplicantIncome + CoapplicantIncome.
fhat2 <- kde(x = num_new[, "TotalIncome"])
plot_3 <- plot(fhat2, cont=50, col.cont=4, cont.lwd=2, xlab="AppCoapplicant Income")

fhat3 <- kde(x = num_new[, "ApplicantIncome"])
plot_4 <- plot(fhat3, cont=50, col.cont=4, cont.lwd=2, xlab="Applicant Income")Bivariat numerik
fhat4 <- kde(x=num_new[,3:4])
plot(fhat4, display="filled.contour", cont=seq(10,90,by=10), lwd=1)plot(fhat4, display="persp", border=1)Multivariat numerik
fhat5 <- kde(x=num_new[,1:3])
plot(fhat5)Tugas 5
Lakukan proses pengujian Hipotesis menggunakan R dan Python pada setiap variabel kuantitatif dengan beberapa bagian sebagai berikut:
Hitunglah margin of error dan estimasi interval untuk proporsi peminjam berjenis kelamin perempuan pada tingkat kepercayaan 95%.
# 95% confidence interval for the proportion of female borrowers.
k <- sum(no_outlier_loan$Gender == "Female")   # number of female borrowers
n <- nrow(no_outlier_loan)                     # total number of borrowers
pbar <- k / n                                  # sample proportion of female borrowers
SE <- sqrt(pbar * (1 - pbar) / n); SE          # standard error
## [1] 0.01685443
E <- qnorm(.975) * SE; E                       # margin of error
## [1] 0.03303407
pbar + c(-E, E)                                # confidence interval of the proportion
## [1] 0.1538818 0.2199500
Jika Anda berencana menggunakan perkiraan proporsi 50% data konsumen berjenis kelamin perempuan, temukan ukuran sampel yang diperlukan untuk mencapai margin kesalahan 5% untuk data observasi pada tingkat kepercayaan 95%.
# Sample size needed to estimate a proportion with a given margin of error.
zstar <- qnorm(.975)                       # normal quantile for 95% confidence
p <- 0.5                                   # planned proportion estimate (50%)
E <- 0.05                                  # target margin of error
n_raw <- zstar^2 * p * (1 - p) / E^2; n_raw  # unrounded sample size
## [1] 384.1459
# A sample size must be a whole number of observations; round up so the
# margin of error is not exceeded.
ceiling(n_raw)
## [1] 385
Lakukan pembuktian kebenaran asumsi dengan tingkat signifikansi 0.05, jika Bank mengklaim bahwa pinjaman rata-rata konsumen adalah:
set.seed(100)
sampel <- sample_n(no_outlier_loan, 30)
sampelLebih besar $ 150.
\[H_0 : \mu = \$150\] \[H_1 : \mu > \$150\]
# Right-tailed one-sample t test (population sd unknown).
mu0 <- 150                                  # hypothesized mean
xbar <- mean(sampel$LoanAmount)             # sample mean
s <- sd(sampel$LoanAmount)                  # sample standard deviation
n <- nrow(sampel)                           # sample size
t <- (xbar - mu0) / (s / sqrt(n)); t        # test statistic
## [1] -2.520249
alpha <- .05                                # significance level
t.alpha <- qt(1 - alpha, df = n - 1)        # right-tail critical value
t.alpha
## [1] 1.699127
Karena t hitung kurang dari t tabel, maka terima \(H_0\). Artinya klaim pinjaman rata-rata Bank tidak lebih besar dari \(\$ 150\).
Lebih kecil $150
\[H_0 : \mu = \$150\] \[H_1 : \mu < \$150\]
# Left-tailed one-sample t test (population sd unknown).
mu0 <- 150                                  # hypothesized mean
xbar <- mean(sampel$LoanAmount)             # sample mean
s <- sd(sampel$LoanAmount)                  # sample standard deviation
n <- nrow(sampel)                           # sample size
t <- (xbar - mu0) / (s / sqrt(n)); t        # test statistic
## [1] -2.520249
alpha <- .05                                # significance level
t.alpha <- qt(1 - alpha, df = n - 1)        # upper-tail quantile; negated below
-t.alpha                                    # left-tail critical value
## [1] -1.699127
Karena t hitung kurang dari t tabel, maka tolak \(H_0\). Artinya klaim pinjaman rata-rata Bank kurang dari \(\$ 150\).
Sama dengan $ 150.
\[H_0 : \mu = \$150\] \[H_1 : \mu \neq \$150\]
# Two-tailed one-sample t test (population sd unknown).
mu0 <- 150                                  # hypothesized mean
xbar <- mean(sampel$LoanAmount)             # sample mean
s <- sd(sampel$LoanAmount)                  # sample standard deviation
n <- nrow(sampel)                           # sample size
t <- (xbar - mu0) / (s / sqrt(n)); t        # test statistic
## [1] -2.520249
alpha <- .05                                # significance level
# Two-tailed test: alpha is split across both tails, so the critical value is
# the 1 - alpha/2 quantile (not 1 - alpha, which is the one-tailed cutoff).
t.alpha <- qt(1 - alpha / 2, df = n - 1)
t.alpha                                     # upper critical value
## [1] 2.04523
-t.alpha                                    # lower critical value
## [1] -2.04523
Karena t hitung tidak berada di antara t tabel, maka tolak \(H_0\). Artinya klaim pinjaman rata-rata Bank tidak sama dengan \(\$ 150\).
Lakukan pembuktian kebenaran asumsi dengan tingkat signifikansi 0.05, seperti di atas, jika diketahui simpangan baku pinjaman adalah $ 85.
Lebih besar $ 150.
\[H_0 : \mu = \$150\] \[H_1 : \mu > \$150\]
# Right-tailed one-sample test with KNOWN population standard deviation.
mu0 <- 150                                  # hypothesized mean
xbar <- mean(sampel$LoanAmount)             # sample mean
s <- 85                                     # known population standard deviation
n <- nrow(sampel)                           # sample size
t <- (xbar - mu0) / (s / sqrt(n)); t        # test statistic
## [1] -1.546511
alpha <- .05                                # significance level
# With a known population sd the statistic follows the standard normal
# distribution, so the critical value comes from qnorm() rather than qt().
# The variable names t / t.alpha are kept to match the surrounding text.
t.alpha <- qnorm(1 - alpha)                 # right-tail critical value
t.alpha
## [1] 1.644854
Karena t hitung kurang dari t tabel, maka terima \(H_0\). Artinya klaim pinjaman rata-rata Bank tidak lebih besar dari \(\$ 150\).
Lebih kecil $150
\[H_0 : \mu = \$150\] \[H_1 : \mu < \$150\]
# Left-tailed one-sample test with KNOWN population standard deviation.
mu0 <- 150                                  # hypothesized mean
xbar <- mean(sampel$LoanAmount)             # sample mean
s <- 85                                     # known population standard deviation
n <- nrow(sampel)                           # sample size
t <- (xbar - mu0) / (s / sqrt(n)); t        # test statistic
## [1] -1.546511
alpha <- .05                                # significance level
# With a known population sd the statistic follows the standard normal
# distribution, so the critical value comes from qnorm() rather than qt().
# The variable names t / t.alpha are kept to match the surrounding text.
t.alpha <- qnorm(1 - alpha)                 # upper-tail quantile; negated below
-t.alpha                                    # left-tail critical value
## [1] -1.644854
Karena t hitung lebih dari t tabel, maka terima \(H_0\). Artinya klaim pinjaman rata-rata Bank tidak kurang dari \(\$ 150\).
Sama dengan $ 150.
\[H_0 : \mu = \$150\] \[H_1 : \mu \neq \$150\]
# Two-tailed one-sample test with KNOWN population standard deviation.
mu0 <- 150                                  # hypothesized mean
xbar <- mean(sampel$LoanAmount)             # sample mean
s <- 85                                     # known population standard deviation
n <- nrow(sampel)                           # sample size
t <- (xbar - mu0) / (s / sqrt(n)); t        # test statistic
## [1] -1.546511
alpha <- .05                                # significance level
# Known population sd -> standard normal reference distribution (qnorm), and
# a two-tailed test splits alpha across both tails (1 - alpha/2 quantile).
# The variable names t / t.alpha are kept to match the surrounding text.
t.alpha <- qnorm(1 - alpha / 2)
t.alpha                                     # upper critical value
## [1] 1.959964
-t.alpha                                    # lower critical value
## [1] -1.959964
Karena t hitung berada di antara t tabel, maka terima \(H_0\). Artinya klaim pinjaman rata-rata Bank sama dengan \(\$ 150\).
Referensi
- https://bookdown.org/BaktiSiregar/data-science-for-beginners-part-2
- https://rpubs.com/dsciencelabs/ks3
- https://rpubs.com/dsciencelabs/ks4
- https://bookdown.org/BaktiSiregar/data-science-for-beginners
- https://www.analyticsvidhya.com/blog/2020/07/univariate-analysis-visualization-with-illustrations-in-python/
- https://seaborn.pydata.org/tutorial/distributions.html
- https://www.geeksforgeeks.org/multiple-density-plots-with-pandas-in-python/
- https://stackoverflow.com
- https://www.python-graph-gallery.com/
- https://seaborn.pydata.org/tutorial/categorical.html