library(lmtest)
library(nnet)
library(car)
library(readr)
dataset <- read_csv("C:/Users/Lenovo/Downloads/train (1).csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(dataset)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
str(dataset)
## spc_tbl_ [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:891] "male" "female" "female" "female" ...
## $ Age : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:891] NA "C85" NA "C123" ...
## $ Embarked : chr [1:891] "S" "C" "S" "S" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_double(),
## .. Survived = col_double(),
## .. Pclass = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Age = col_double(),
## .. SibSp = col_double(),
## .. Parch = col_double(),
## .. Ticket = col_character(),
## .. Fare = col_double(),
## .. Cabin = col_character(),
## .. Embarked = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
data <- dataset[, c(
"Survived",
"Pclass",
"Sex",
"Age",
"SibSp",
"Parch",
"Fare",
"Embarked"
)]
head(data)
## # A tibble: 6 × 8
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0 3 male 22 1 0 7.25 S
## 2 1 1 female 38 1 0 71.3 C
## 3 1 3 female 26 0 0 7.92 S
## 4 1 1 female 35 1 0 53.1 S
## 5 0 3 male 35 0 0 8.05 S
## 6 0 3 male NA 0 0 8.46 Q
colSums(is.na(data))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 177 0 0 0 2
df <- na.omit(data)
colSums(is.na(df))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
num_cols <- c("Age", "SibSp", "Parch", "Fare")
df_no_outlier <- df
for (col in num_cols) {
Q1 <- quantile(df_no_outlier[[col]], 0.25)
Q3 <- quantile(df_no_outlier[[col]], 0.75)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
df_no_outlier <- df_no_outlier[
df_no_outlier[[col]] >= lower &
df_no_outlier[[col]] <= upper,
]
}
dim(df_no_outlier)
## [1] 438 8
model <- glm(
Survived ~ Pclass + Sex + Age +
SibSp + Fare + Embarked,
data = df_no_outlier,
family = binomial
)
summary(model)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare +
## Embarked, family = binomial, data = df_no_outlier)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.281112 1.151853 3.717 0.000202 ***
## Pclass -0.906143 0.294531 -3.077 0.002094 **
## Sexmale -2.665099 0.290720 -9.167 < 2e-16 ***
## Age -0.023974 0.012835 -1.868 0.061788 .
## SibSp -0.505224 0.370747 -1.363 0.172971
## Fare 0.009696 0.019896 0.487 0.626003
## EmbarkedQ -0.480804 0.725488 -0.663 0.507503
## EmbarkedS -0.615154 0.381581 -1.612 0.106936
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 527.45 on 437 degrees of freedom
## Residual deviance: 385.18 on 430 degrees of freedom
## AIC: 401.18
##
## Number of Fisher Scoring iterations: 5
####Uji Multikolinearitas
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Pclass 3.307435 1 1.818636
## Sex 1.164798 1 1.079258
## Age 1.430442 1 1.196011
## SibSp 1.486278 1 1.219130
## Fare 3.263727 1 1.806579
## Embarked 1.097102 2 1.023439
VIF < 5 → aman VIF > 10 → bermasalah
# Tambahkan +1 agar tidak ada nilai 0
df_no_outlier$Age_bt <- df_no_outlier$Age + 1
df_no_outlier$SibSp_bt <- df_no_outlier$SibSp + 1
df_no_outlier$Fare_bt <- df_no_outlier$Fare + 1
# Jalankan Box-Tidwell
boxTidwell(
Survived ~ Age_bt + SibSp_bt + Fare_bt,
data = df_no_outlier
)
## MLE of lambda Score Statistic (t) Pr(>|t|)
## Age_bt -1.49300 0.1643 0.8696
## SibSp_bt 7.51766 -0.9264 0.3548
## Fare_bt 0.45363 -1.8036 0.0720 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## iterations = 12
##
## Score test for null hypothesis that all lambdas = 1:
## F = 1.4321, df = 3 and 431, Pr(>F) = 0.2328
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 1.849, p-value = 0.05618
## alternative hypothesis: true autocorrelation is greater than 0
Hal ini menunjukkan bahwa tidak terdapat autokorelasi pada model, sehingga asumsi independensi terpenuhi.
cooksd <- cooks.distance(model)
plot(cooksd,
main="Cook's Distance",
ylab="Cook's Distance")
which(cooksd > 4/length(cooksd))
## 19 53 56 61 88 103 135 164 203 220 236 244 247 260 264 274 298 310 313 318
## 19 53 56 61 88 103 135 164 203 220 236 244 247 260 264 274 298 310 313 318
## 341 350 369 371 375
## 341 350 369 371 375
threshold <- 4 / length(cooksd)
threshold
## [1] 0.00913242
cooksd[cooksd > threshold]
## 19 53 56 61 88 103
## 0.009510140 0.014751990 0.018522194 0.024586992 0.028463633 0.018624642
## 135 164 203 220 236 244
## 0.017892931 0.012048581 0.011334057 0.015096981 0.016601652 0.024142677
## 247 260 264 274 298 310
## 0.045309598 0.010271252 0.013956914 0.018413661 0.015446884 0.010059385
## 313 318 341 350 369 371
## 0.025356915 0.017902553 0.016838468 0.014656352 0.013890319 0.021108981
## 375
## 0.009724721
Berdasarkan hasil analisis Cook’s Distance diperoleh beberapa observasi yang memiliki nilai lebih besar dari batas 4/n (0,0091). Namun nilai Cook’s Distance tertinggi hanya sebesar 0,045 (< 1), sehingga tidak terdapat outlier yang berpengaruh secara ekstrem terhadap model. Oleh karena itu, seluruh data tetap digunakan dalam analisis regresi logistik.
df_no_outlier$Pclass <- as.factor(df_no_outlier$Pclass)
model_multi <- multinom(
Pclass ~ Sex + Age + SibSp + Fare + Embarked,
data = df_no_outlier
)
## # weights: 24 (14 variable)
## initial value 481.192182
## iter 10 value 231.368972
## iter 20 value 177.079898
## iter 30 value 176.936955
## iter 40 value 176.932657
## iter 50 value 176.910348
## final value 176.909622
## converged
summary(model_multi)
## Call:
## multinom(formula = Pclass ~ Sex + Age + SibSp + Fare + Embarked,
## data = df_no_outlier)
##
## Coefficients:
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 7.679567 -0.45498864 -0.06753613 3.521992 -0.2457288 12.22268 0.9836065
## 3 15.860827 0.08793624 -0.11187104 7.603431 -0.7910136 12.12441 -0.0502402
##
## Std. Errors:
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 1.451050 0.6451927 0.0235075 0.8398208 0.03584790 0.5702750 0.8029937
## 3 1.826344 0.7226692 0.0271683 1.1182719 0.07428998 0.5702731 1.0300711
##
## Residual Deviance: 353.8192
## AIC: 381.8192
z <- summary(model_multi)$coefficients /
summary(model_multi)$standard.errors
p_value <- (1 - pnorm(abs(z))) * 2
p_value
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ
## 2 1.207091e-07 0.4806870 4.066450e-03 2.743894e-05 7.143175e-12 0
## 3 0.000000e+00 0.9031504 3.826639e-05 1.051514e-11 0.000000e+00 0
## EmbarkedS
## 2 0.2206038
## 3 0.9610998
exp(coef(model_multi))
## (Intercept) Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 2 2163.682 0.6344552 0.9346940 33.8518 0.7821343 203350.2 2.674083
## 3 7731601.489 1.0919185 0.8941596 2005.0629 0.4533850 184315.8 0.951001
pred_multi <- predict(model_multi)
head(pred_multi)
## [1] 3 3 1 3 1 2
## Levels: 1 2 3
confusion matrix
conf_matrix <- table(
Actual = df_no_outlier$Pclass,
Predicted = pred_multi
)
conf_matrix
## Predicted
## Actual 1 2 3
## 1 61 1 4
## 2 5 86 24
## 3 1 9 247
accuracy <- sum(diag(conf_matrix)) /
sum(conf_matrix)
accuracy
## [1] 0.8995434
precision <- diag(conf_matrix) /
colSums(conf_matrix)
recall <- diag(conf_matrix) /
rowSums(conf_matrix)
f1_score <- 2 * (precision * recall) /
(precision + recall)
precision
## 1 2 3
## 0.9104478 0.8958333 0.8981818
recall
## 1 2 3
## 0.9242424 0.7478261 0.9610895
f1_score
## 1 2 3
## 0.9172932 0.8151659 0.9285714