library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(heplots)
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
data <- read.csv("C:/Users/lenovo/OneDrive/Documents/SEMESTER 4/Analisis Multivariat/Projek Anmul/adult.csv", stringsAsFactors = FALSE)
head(data)
## age workclass fnlwgt education education.num marital.status
## 1 90 ? 77053 HS-grad 9 Widowed
## 2 82 Private 132870 HS-grad 9 Widowed
## 3 66 ? 186061 Some-college 10 Widowed
## 4 54 Private 140359 7th-8th 4 Divorced
## 5 41 Private 264663 Some-college 10 Separated
## 6 34 Private 216864 HS-grad 9 Divorced
## occupation relationship race sex capital.gain capital.loss
## 1 ? Not-in-family White Female 0 4356
## 2 Exec-managerial Not-in-family White Female 0 4356
## 3 ? Unmarried Black Female 0 4356
## 4 Machine-op-inspct Unmarried White Female 0 3900
## 5 Prof-specialty Own-child White Female 0 3900
## 6 Other-service Unmarried White Female 0 3770
## hours.per.week native.country income
## 1 40 United-States <=50K
## 2 18 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 United-States <=50K
## 6 45 United-States <=50K
str(data)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 90 82 66 54 41 34 38 74 68 41 ...
## $ workclass : chr "?" "Private" "?" "Private" ...
## $ fnlwgt : int 77053 132870 186061 140359 264663 216864 150601 88638 422013 70037 ...
## $ education : chr "HS-grad" "HS-grad" "Some-college" "7th-8th" ...
## $ education.num : int 9 9 10 4 10 9 6 16 9 10 ...
## $ marital.status: chr "Widowed" "Widowed" "Widowed" "Divorced" ...
## $ occupation : chr "?" "Exec-managerial" "?" "Machine-op-inspct" ...
## $ relationship : chr "Not-in-family" "Not-in-family" "Unmarried" "Unmarried" ...
## $ race : chr "White" "White" "Black" "White" ...
## $ sex : chr "Female" "Female" "Female" "Female" ...
## $ capital.gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital.loss : int 4356 4356 4356 3900 3900 3770 3770 3683 3683 3004 ...
## $ hours.per.week: int 40 18 40 40 40 45 40 20 40 60 ...
## $ native.country: chr "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
summary(data)
## age workclass fnlwgt education
## Min. :17.00 Length:32561 Min. : 12285 Length:32561
## 1st Qu.:28.00 Class :character 1st Qu.: 117827 Class :character
## Median :37.00 Mode :character Median : 178356 Mode :character
## Mean :38.58 Mean : 189778
## 3rd Qu.:48.00 3rd Qu.: 237051
## Max. :90.00 Max. :1484705
## education.num marital.status occupation relationship
## Min. : 1.00 Length:32561 Length:32561 Length:32561
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.08
## 3rd Qu.:12.00
## Max. :16.00
## race sex capital.gain capital.loss
## Length:32561 Length:32561 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1078 Mean : 87.3
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :4356.0
## hours.per.week native.country income
## Min. : 1.00 Length:32561 Length:32561
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.44
## 3rd Qu.:45.00
## Max. :99.00
sapply(data, function(x) sum(x == "?"))
## age workclass fnlwgt education education.num
## 0 1836 0 0 0
## marital.status occupation relationship race sex
## 0 1843 0 0 0
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 583 0
data[data == "?"] <- NA
summary(data)
## age workclass fnlwgt education
## Min. :17.00 Length:32561 Min. : 12285 Length:32561
## 1st Qu.:28.00 Class :character 1st Qu.: 117827 Class :character
## Median :37.00 Mode :character Median : 178356 Mode :character
## Mean :38.58 Mean : 189778
## 3rd Qu.:48.00 3rd Qu.: 237051
## Max. :90.00 Max. :1484705
## education.num marital.status occupation relationship
## Min. : 1.00 Length:32561 Length:32561 Length:32561
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.08
## 3rd Qu.:12.00
## Max. :16.00
## race sex capital.gain capital.loss
## Length:32561 Length:32561 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1078 Mean : 87.3
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :4356.0
## hours.per.week native.country income
## Min. : 1.00 Length:32561 Length:32561
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.44
## 3rd Qu.:45.00
## Max. :99.00
data$workclass <- as.factor(data$workclass)
data$education <- as.factor(data$education)
data$marital.status <- as.factor(data$marital.status)
data$occupation <- as.factor(data$occupation)
data$relationship <- as.factor(data$relationship)
data$race <- as.factor(data$race)
data$sex <- as.factor(data$sex)
data$native.country <- as.factor(data$native.country)
data$income <- as.factor(data$income)
data$age <- as.numeric(data$age)
data$fnlwgt <- as.numeric(data$fnlwgt)
data$education.num <- as.numeric(data$education.num)
data$capital.gain <- as.numeric(data$capital.gain)
data$capital.loss <- as.numeric(data$capital.loss)
data$hours.per.week <- as.numeric(data$hours.per.week)
str(data)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : num 90 82 66 54 41 34 38 74 68 41 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: NA 4 NA 4 4 4 4 7 1 4 ...
## $ fnlwgt : num 77053 132870 186061 140359 264663 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 12 12 16 6 16 12 1 11 12 16 ...
## $ education.num : num 9 9 10 4 10 9 6 16 9 10 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 7 7 7 1 6 1 6 5 1 5 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: NA 4 NA 7 10 8 1 10 10 3 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 2 5 5 4 5 5 3 2 5 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 3 5 5 5 5 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 2 1 1 2 ...
## $ capital.gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ capital.loss : num 4356 4356 4356 3900 3900 ...
## $ hours.per.week: num 40 18 40 40 40 45 40 20 40 60 ...
## $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 39 39 39 39 39 NA ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 1 2 ...
get_mode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
for(col in c("workclass", "occupation", "native.country")) {
if(sum(is.na(data[[col]])) > 0) {
mode_val <- get_mode(data[[col]][!is.na(data[[col]])])
data[[col]][is.na(data[[col]])] <- mode_val
}
}
for(col in c("age", "fnlwgt", "education.num", "capital.gain", "capital.loss", "hours.per.week")) {
if(sum(is.na(data[[col]])) > 0) {
median_val <- median(data[[col]], na.rm = TRUE)
data[[col]][is.na(data[[col]])] <- median_val
}
}
numeric_vars <- data[, c("age", "fnlwgt", "education.num", "capital.gain", "capital.loss", "hours.per.week")]
describeBy(numeric_vars, group = data$income)
##
## Descriptive statistics by group
## group: <=50K
## vars n mean sd median trimmed mad min
## age 1 24720 36.78 14.02 34 35.49 14.83 17
## fnlwgt 2 24720 190340.87 106482.27 179465 181308.88 90642.46 12285
## education.num 3 24720 9.60 2.44 9 9.73 1.48 1
## capital.gain 4 24720 148.75 963.14 0 0.00 0.00 0
## capital.loss 5 24720 53.14 310.76 0 0.00 0.00 0
## hours.per.week 6 24720 38.84 12.32 40 39.02 2.97 1
## max range skew kurtosis se
## age 90 73 0.76 0.03 0.09
## fnlwgt 1484705 1472420 1.46 6.49 677.26
## education.num 16 15 -0.43 0.99 0.02
## capital.gain 41310 41310 18.34 605.14 6.13
## capital.loss 4356 4356 6.06 38.21 1.98
## hours.per.week 99 98 0.21 2.97 0.08
## ------------------------------------------------------------
## group: >50K
## vars n mean sd median trimmed mad min
## age 1 7841 44.25 10.52 44 43.77 10.38 19
## fnlwgt 2 7841 188005.00 102541.78 176101 179150.09 83744.66 14878
## education.num 3 7841 11.61 2.39 12 11.60 2.97 2
## capital.gain 4 7841 4006.14 14570.38 0 931.71 0.00 0
## capital.loss 5 7841 195.00 595.49 0 0.00 0.00 0
## hours.per.week 6 7841 45.47 11.01 40 44.84 7.41 1
## max range skew kurtosis se
## age 90 71 0.48 0.15 0.12
## fnlwgt 1226583 1211705 1.39 5.14 1158.02
## education.num 16 14 -0.32 -0.18 0.03
## capital.gain 99999 99999 5.84 35.27 164.55
## capital.loss 3683 3683 2.80 6.10 6.72
## hours.per.week 99 98 0.65 3.91 0.12
write.csv(data, "C:/Users/lenovo/OneDrive/Documents/SEMESTER 4/Analisis Multivariat/Projek Anmul/adult after preprocessing.csv", row.names = FALSE)
data2 <- read.csv("C:/Users/lenovo/OneDrive/Documents/SEMESTER 4/Analisis Multivariat/Projek Anmul/adult after preprocessing.csv")
filtered <- data2[1:4000, ]
numeric_vars <- filtered[, c("age", "fnlwgt", "education.num", "capital.gain", "capital.loss", "hours.per.week")]
cor_matrix <- cor(numeric_vars, use = "complete.obs")
print(cor_matrix)
## age fnlwgt education.num capital.gain capital.loss
## age 1.00000000 -0.06060087 0.05544708 0.09084746 -0.06506821
## fnlwgt -0.06060087 1.00000000 -0.02027398 0.01211428 -0.01974455
## education.num 0.05544708 -0.02027398 1.00000000 0.19270049 -0.02813855
## capital.gain 0.09084746 0.01211428 0.19270049 1.00000000 -0.33885134
## capital.loss -0.06506821 -0.01974455 -0.02813855 -0.33885134 1.00000000
## hours.per.week -0.03546348 0.01210848 0.15580521 0.12225821 -0.02362749
## hours.per.week
## age -0.03546348
## fnlwgt 0.01210848
## education.num 0.15580521
## capital.gain 0.12225821
## capital.loss -0.02362749
## hours.per.week 1.00000000
corrplot(cor_matrix, method = "circle", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black")
filtered <- filtered %>%
mutate(
log_capital_gain = log(capital.gain + 1),
log_capital_loss = log(capital.loss + 1)
)
filtered$education <- as.factor(filtered$education)
filtered$occupation <- as.factor(filtered$occupation)
filtered$sex <- as.factor(filtered$sex)
filtered$income <- as.factor(filtered$income)
analysis_data <- filtered
mvn_result <- mvn(
data = analysis_data[, c("log_capital_gain", "log_capital_loss")],
multivariatePlot = "qq"
)
print(mvn_result$multivariateNormality)
## Test HZ p value MVN
## 1 Henze-Zirkler 562.4332 0 NO
mardia_result <- mvn(
data = filtered[, c("log_capital_gain", "log_capital_loss")],
mvnTest = "mardia"
)
print(mardia_result$multivariateNormality)
## Test Statistic p value Result
## 1 Mardia Skewness 2236.49493443131 0 NO
## 2 Mardia Kurtosis 10.9106108551151 0 NO
## 3 MVN <NA> <NA> NO
shapiro.test(filtered$log_capital_gain)
##
## Shapiro-Wilk normality test
##
## data: filtered$log_capital_gain
## W = 0.7311, p-value < 2.2e-16
shapiro.test(filtered$log_capital_loss)
##
## Shapiro-Wilk normality test
##
## data: filtered$log_capital_loss
## W = 0.63036, p-value < 2.2e-16
boxm_result <- boxM(
cbind(log_capital_gain, log_capital_loss) ~ sex,
data = filtered
)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: Y
## Chi-Sq (approx.) = 1.5295, df = 3, p-value = 0.6755
correlation <- cor(filtered[, c("log_capital_gain", "log_capital_loss")])
print(correlation)
## log_capital_gain log_capital_loss
## log_capital_gain 1.0000000 -0.9860883
## log_capital_loss -0.9860883 1.0000000
z_scores <- scale(analysis_data[, c("log_capital_gain", "log_capital_loss")])
outliers_uni <- which(apply(abs(z_scores) > 3, 1, any))
cat("Jumlah Outlier Univariat:", length(outliers_uni), "\n")
## Jumlah Outlier Univariat: 0
boxplot(analysis_data$log_capital_gain, main = "Boxplot log_capital_gain")
points(outliers_uni, analysis_data$log_capital_gain[outliers_uni], col = "red", pch = 19)
boxplot(analysis_data$log_capital_loss, main = "Boxplot log_capital_loss")
points(outliers_uni, analysis_data$log_capital_loss[outliers_uni], col = "red", pch = 19)
md <- mahalanobis(analysis_data[, c("log_capital_gain", "log_capital_loss")],
colMeans(analysis_data[, c("log_capital_gain", "log_capital_loss")]),
cov(analysis_data[, c("log_capital_gain", "log_capital_loss")]))
cutoff <- qchisq(0.975, df = 2)
outliers_multi <- which(md > cutoff)
cat("Jumlah Outlier Multivariat:", length(outliers_multi), "\n")
## Jumlah Outlier Multivariat: 167
plot(
analysis_data$log_capital_gain,
analysis_data$log_capital_loss,
main = "Outlier Multivariat (Mahalanobis Distance)",
xlab = "log_capital_gain",
ylab = "log_capital_loss",
col = ifelse(1:nrow(analysis_data) %in% outliers_multi, "red", "black"),
pch = ifelse(1:nrow(analysis_data) %in% outliers_multi, 19, 1)
)
legend("topright", legend = c("Outlier", "Normal"), col = c("red", "black"), pch = c(19, 1))
##### Visualisasi scatterplot Mahalanobis Distance menunjukkan beberapa
outlier multivariat, ditandai dengan titik merah, yang memiliki
kombinasi nilai log_capital_gain dan log_capital_loss yang tidak lazim.
Sebagian besar data terkonsentrasi pada log_capital_gain tinggi dan
log_capital_loss rendah, namun outlier terlihat pada kasus dengan
capital_gain sangat tinggi atau capital_loss sangat tinggi. Hal ini
menunjukkan bahwa meskipun nilai masing-masing variabel mungkin tampak
normal secara univariat, kombinasi keduanya dapat menghasilkan outlier
secara multivariat. Visualisasi ini menegaskan pentingnya analisis
multivariat dalam mendeteksi pola tidak biasa dalam data.
boxplot(analysis_data[, c("log_capital_gain", "log_capital_loss")],
main = "Boxplot Log Capital Gain dan Loss")
ggplot(analysis_data, aes(x = log_capital_gain, y = log_capital_loss, color = income)) +
geom_point(alpha = 0.6) +
facet_wrap(~income) +
labs(title = "Scatterplot Log Capital Gain vs Loss by Income")
##### Scatterplot menunjukkan bahwa tidak ada hubungan linear yang jelas
antara log_capital_gain dan log_capital_loss, baik pada kelompok
pendapatan <=50K maupun >50K. Titik-titik tersebar tanpa pola yang
konsisten, sehingga asumsi linearitas tidak terpenuhi secara visual.
fit_linear <- lm(log_capital_loss ~ log_capital_gain + income, data = analysis_data)
fit_dev <- lm(log_capital_loss ~ poly(log_capital_gain, 2) + income, data = analysis_data)
anova(fit_linear, fit_dev)
## Analysis of Variance Table
##
## Model 1: log_capital_loss ~ log_capital_gain + income
## Model 2: log_capital_loss ~ poly(log_capital_gain, 2) + income
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 3997 1174.25
## 2 3996 106.87 1 1067.4 39911 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
res_model <- lm(cbind(log_capital_loss, log_capital_gain) ~ occupation + income + age, data = analysis_data)
residuals <- residuals(res_model)
plot(residuals[,1], type = "l", main = "Residual Plot Log Capital Loss")
plot(residuals[,2], type = "l", main = "Residual Plot Log Capital Gain")
durbinWatsonTest(lm(log_capital_loss ~ occupation + income + age, data = analysis_data))
## lag Autocorrelation D-W Statistic p-value
## 1 0.9916496 0.01584166 0
## Alternative hypothesis: rho != 0
durbinWatsonTest(lm(log_capital_gain ~ occupation + income + age, data = analysis_data))
## lag Autocorrelation D-W Statistic p-value
## 1 0.9884228 0.02257686 0
## Alternative hypothesis: rho != 0
analysis_data$occupation <- as.factor(analysis_data$occupation)
analysis_data$income <- as.factor(analysis_data$income)
manova_model <- manova(cbind(log_capital_gain, log_capital_loss) ~ occupation + income, data = analysis_data)
# Hasil uji MANOVA dengan berbagai statistik
summary(manova_model, test = "Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## occupation 13 0.093629 15.06 26 7970 < 2.2e-16 ***
## income 1 0.204138 510.95 2 3984 < 2.2e-16 ***
## Residuals 3985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova_model, test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## occupation 13 0.90656 15.41 26 7968 < 2.2e-16 ***
## income 1 0.79586 510.95 2 3984 < 2.2e-16 ***
## Residuals 3985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova_model, test = "Hotelling-Lawley")
## Df Hotelling-Lawley approx F num Df den Df Pr(>F)
## occupation 13 0.10286 15.76 26 7966 < 2.2e-16 ***
## income 1 0.25650 510.95 2 3984 < 2.2e-16 ***
## Residuals 3985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova_model, test = "Roy")
## Df Roy approx F num Df den Df Pr(>F)
## occupation 13 0.10076 30.89 13 3985 < 2.2e-16 ***
## income 1 0.25650 510.95 2 3984 < 2.2e-16 ***
## Residuals 3985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(manova_model)
## Response log_capital_gain :
## Df Sum Sq Mean Sq F value Pr(>F)
## occupation 13 659 50.7 2.7492 0.0006772 ***
## income 1 3806 3806.1 206.5338 < 2.2e-16 ***
## Residuals 3985 73438 18.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response log_capital_loss :
## Df Sum Sq Mean Sq F value Pr(>F)
## occupation 13 179 13.77 1.06 0.3898
## income 1 1289 1288.79 99.23 <2e-16 ***
## Residuals 3985 51757 12.99
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
boxplot(log_capital_gain ~ occupation, data = analysis_data, main = "Log Capital Gain by Occupation")
boxplot(log_capital_loss ~ occupation, data = analysis_data, main = "Log Capital Loss by Occupation")
boxplot(log_capital_gain ~ income, data = analysis_data, main = "Log Capital Gain by Income")
boxplot(log_capital_loss ~ income, data = analysis_data, main = "Log Capital Loss by Income")
par(mfrow = c(1, 1))