library(lmtest)
library(nnet)
library(car)
library(readr)
library(caret)
library(biotools)
library(MVN)
library(MASS)
dataset <- read_csv("C:/Users/Lenovo/Downloads/train (1).csv")
head(dataset)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
boxplot(dataset$Fare,
main = "Boxplot Fare",
ylab = "Fare")
barplot(table(dataset$Sex),
main = "Distribusi jenis kelamin")
summary (dataset)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
str(dataset)
## spc_tbl_ [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:891] "male" "female" "female" "female" ...
## $ Age : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:891] NA "C85" NA "C123" ...
## $ Embarked : chr [1:891] "S" "C" "S" "S" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_double(),
## .. Survived = col_double(),
## .. Pclass = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Age = col_double(),
## .. SibSp = col_double(),
## .. Parch = col_double(),
## .. Ticket = col_character(),
## .. Fare = col_double(),
## .. Cabin = col_character(),
## .. Embarked = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
data <- dataset[, c(
"Survived",
"Pclass",
"Sex",
"Age",
"SibSp",
"Parch",
"Fare",
"Embarked"
)]
head(data)
## # A tibble: 6 × 8
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0 3 male 22 1 0 7.25 S
## 2 1 1 female 38 1 0 71.3 C
## 3 1 3 female 26 0 0 7.92 S
## 4 1 1 female 35 1 0 53.1 S
## 5 0 3 male 35 0 0 8.05 S
## 6 0 3 male NA 0 0 8.46 Q
colSums(is.na(data))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 177 0 0 0 2
df <- na.omit(data)
colSums(is.na(df))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
num_cols <- c("Age", "SibSp", "Parch", "Fare")
df_no_outlier <- df
for (col in num_cols) {
Q1 <- quantile(df_no_outlier[[col]], 0.25)
Q3 <- quantile(df_no_outlier[[col]], 0.75)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
df_no_outlier <- df_no_outlier[
df_no_outlier[[col]] >= lower &
df_no_outlier[[col]] <= upper,
]
}
dim(df_no_outlier)
## [1] 438 8
model_lm <- lm(
as.numeric(Pclass) ~ Sex + Age + SibSp + Fare + Embarked,
data = df_no_outlier
)
unique(df_no_outlier$Pclass)
## [1] 3 1 2
####Uji Multikolinearitas
vif(model_lm)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.062838 1 1.030940
## Age 1.181174 1 1.086818
## SibSp 1.188701 1 1.090276
## Fare 1.336931 1 1.156257
## Embarked 1.070436 2 1.017162
VIF < 5 → aman VIF > 10 → bermasalah
res_multi <- residuals(model_lm)
plot(res_multi,
main = "Residual Plot Multinomial",
ylab = "Residual")
unique(df_no_outlier$Pclass)
## [1] 3 1 2
vif(model_lm)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.062838 1 1.030940
## Age 1.181174 1 1.086818
## SibSp 1.188701 1 1.090276
## Fare 1.336931 1 1.156257
## Embarked 1.070436 2 1.017162
VIF < 5 → tidak terjadi multikolinearitas VIF > 10 → terjadi multikolinearitas
num_data <- df_no_outlier[, c("Age", "SibSp", "Fare")]
mvn(data = num_data, mvn_test = "hz")
## $multivariate_normality
## Test Statistic p.value Method MVN
## 1 Henze-Zirkler 46.529 <0.001 asymptotic ✗ Not normal
##
## $univariate_normality
## Test Variable Statistic p.value Normality
## 1 Anderson-Darling Age 8.065 <0.001 ✗ Not normal
## 2 Anderson-Darling SibSp 113.939 <0.001 ✗ Not normal
## 3 Anderson-Darling Fare 41.359 <0.001 ✗ Not normal
##
## $descriptives
## Variable n Mean Std.Dev Median Min Max 25th 75th Skew Kurtosis
## 1 Age 438 31.152 11.670 29.0 5 65.0 22.000 37.000 0.821 3.106
## 2 SibSp 438 0.192 0.438 0.0 0 2.0 0.000 0.000 2.197 7.159
## 3 Fare 438 14.401 10.619 9.5 0 53.1 7.854 16.075 1.903 6.560
##
## $data
## # A tibble: 438 × 3
## Age SibSp Fare
## <dbl> <dbl> <dbl>
## 1 22 1 7.25
## 2 26 0 7.92
## 3 35 1 53.1
## 4 35 0 8.05
## 5 54 0 51.9
## 6 14 1 30.1
## 7 58 0 26.6
## 8 20 0 8.05
## 9 14 0 7.85
## 10 55 0 16
## # ℹ 428 more rows
##
## $subset
## NULL
##
## $outlierMethod
## [1] "none"
##
## attr(,"class")
## [1] "mvn"
Berdasarkan uji normalitas multivariat menggunakan metode Henze-Zirkler diperoleh p-value < 0,001, sehingga data tidak memenuhi asumsi normalitas multivariat. Selain itu, hasil uji normalitas univariat menunjukkan bahwa variabel Age, SibSp, dan Fare juga tidak berdistribusi normal. Namun, karena jumlah sampel yang digunakan cukup besar (n = 438), analisis Linear Discriminant Analysis (LDA) tetap dapat dilakukan
boxM(
df_no_outlier[, c("Age", "SibSp", "Fare")],
df_no_outlier$Pclass
)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: df_no_outlier[, c("Age", "SibSp", "Fare")]
## Chi-Sq (approx.) = 332.15, df = 12, p-value < 2.2e-16
p-value > 0.05 → matriks kovarians homogen p-value < 0.05 → matriks kovarians tidak homogen
boxplot(df_no_outlier$Fare,
main = "Boxplot Fare",
ylab = "Fare")
df_no_outlier$Pclass <- as.factor(df_no_outlier$Pclass)
model_multi <- multinom(
Pclass ~ Sex + Age + SibSp + Fare + Embarked,
data = df_no_outlier
)
## # weights: 24 (14 variable)
## initial value 481.192182
## iter 10 value 231.368972
## iter 20 value 177.079898
## iter 30 value 176.936955
## iter 40 value 176.932657
## iter 50 value 176.910348
## final value 176.909622
## converged
summary(model_multi)
## Call:
## multinom(formula = Pclass ~ Sex + Age + SibSp + Fare + Embarked,
## data = df_no_outlier)
##
## Coefficients:
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 7.679567 -0.45498864 -0.06753613 3.521992 -0.2457288 12.22268 0.9836065
## 3 15.860827 0.08793624 -0.11187104 7.603431 -0.7910136 12.12441 -0.0502402
##
## Std. Errors:
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 1.451050 0.6451927 0.0235075 0.8398208 0.03584790 0.5702750 0.8029937
## 3 1.826344 0.7226692 0.0271683 1.1182719 0.07428998 0.5702731 1.0300711
##
## Residual Deviance: 353.8192
## AIC: 381.8192
# Model null (tanpa variabel independen)
model_null <- multinom(Pclass ~ 1, data = df_no_outlier)
## # weights: 6 (2 variable)
## initial value 481.192182
## final value 415.714921
## converged
# Model penuh
model_full <- model_multi
# Hitung Likelihood Ratio Test
LR_stat <- 2 * (logLik(model_full) - logLik(model_null))
# Derajat bebas
df <- attr(logLik(model_full), "df") - attr(logLik(model_null), "df")
# p-value
p_value_lr <- pchisq(LR_stat, df, lower.tail = FALSE)
LR_stat
## 'log Lik.' 477.6106 (df=14)
df
## [1] 12
p_value_lr
## 'log Lik.' 1.283409e-94 (df=14)
odds_ratio <- exp(coef(model_multi))
round(odds_ratio, 3)
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 2163.682 0.634 0.935 33.852 0.782 203350.2 2.674
## 3 7731601.489 1.092 0.894 2005.063 0.453 184315.8 0.951
z <- summary(model_multi)$coefficients /
summary(model_multi)$standard.errors
p_value <- (1 - pnorm(abs(z))) * 2
p_value
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ
## 2 1.207091e-07 0.4806870 4.066450e-03 2.743894e-05 7.143175e-12 0
## 3 0.000000e+00 0.9031504 3.826639e-05 1.051514e-11 0.000000e+00 0
## EmbarkedS
## 2 0.2206038
## 3 0.9610998
exp(coef(model_multi))
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 2163.682 0.6344552 0.9346940 33.8518 0.7821343 203350.2 2.674083
## 3 7731601.489 1.0919185 0.8941596 2005.0629 0.4533850 184315.8 0.951001
pred_multi <- predict(model_multi)
head(pred_multi)
## [1] 3 3 1 3 1 2
## Levels: 1 2 3
conf_matrix <- table(
Actual = df_no_outlier$Pclass,
Predicted = pred_multi
)
conf_matrix
## Predicted
## Actual 1 2 3
## 1 61 1 4
## 2 5 86 24
## 3 1 9 247
accuracy <- sum(diag(conf_matrix)) /
sum(conf_matrix)
accuracy
## [1] 0.8995434
precision <- diag(conf_matrix) /
colSums(conf_matrix)
recall <- diag(conf_matrix) /
rowSums(conf_matrix)
f1_score <- 2 * (precision * recall) /
(precision + recall)
precision
## 1 2 3
## 0.9104478 0.8958333 0.8981818
recall
## 1 2 3
## 0.9242424 0.7478261 0.9610895
f1_score
## 1 2 3
## 0.9172932 0.8151659 0.9285714
Sebelum membentuk model, variabel kategorik diubah menjadi factor.
df_no_outlier$Pclass <- as.factor(df_no_outlier$Pclass)
df_no_outlier$Sex <- as.factor(df_no_outlier$Sex)
df_no_outlier$Embarked <- as.factor(df_no_outlier$Embarked)
model_lda <- MASS::lda(
Pclass ~ Sex + Age + SibSp + Fare + Embarked,
data = df_no_outlier
)
model_lda
## Call:
## lda(Pclass ~ Sex + Age + SibSp + Fare + Embarked, data = df_no_outlier)
##
## Prior probabilities of groups:
## 1 2 3
## 0.1506849 0.2625571 0.5867580
##
## Group means:
## Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 1 0.8030303 42.49242 0.1969697 33.131944 0.00000000 0.7727273
## 2 0.6347826 32.93043 0.2260870 16.033332 0.01739130 0.8956522
## 3 0.7937743 27.44358 0.1750973 8.860456 0.06225681 0.8326848
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sexmale 0.02834371 2.05391805
## Age -0.03509041 -0.02708070
## SibSp 1.23469434 -0.31403669
## Fare -0.17189268 0.01842656
## EmbarkedQ 0.54694306 0.88596915
## EmbarkedS 0.23642672 -1.20366962
##
## Proportion of trace:
## LD1 LD2
## 0.983 0.017
pred_lda <- predict(model_lda)
head(pred_lda$class)
## [1] 3 3 1 3 1 2
## Levels: 1 2 3
conf_matrix_lda <- table(
Actual = df_no_outlier$Pclass,
Predicted = pred_lda$class
)
conf_matrix_lda
## Predicted
## Actual 1 2 3
## 1 62 0 4
## 2 6 55 54
## 3 1 10 246
accuracy_lda <- sum(diag(conf_matrix_lda)) /
sum(conf_matrix_lda)
accuracy_lda
## [1] 0.8287671
precision_lda <- diag(conf_matrix_lda) /
colSums(conf_matrix_lda)
recall_lda <- diag(conf_matrix_lda) /
rowSums(conf_matrix_lda)
f1_lda <- 2 * (precision_lda * recall_lda) /
(precision_lda + recall_lda)
precision_lda
## 1 2 3
## 0.8985507 0.8461538 0.8092105
recall_lda
## 1 2 3
## 0.9393939 0.4782609 0.9571984
f1_lda
## 1 2 3
## 0.9185185 0.6111111 0.8770053