Anggota Kelompok:
- Berlianti Debby Maharani (23031554006)
- Shellomitha Sulvana Dewi (23031554096)
- Nisrina Afaf (23031554165)
Load Data
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
# Load the raw employee performance dataset into `data`.
# NOTE(review): the path below is absolute and machine-specific, while the
# original note asked for the CSV to sit in the same folder as this .Rmd file;
# confirm which location is intended.
raw_csv_path <- "C:/Users/sitiz/Desktop/KULIAH SEMESTER 4/ANALISIS MULTIVARIAT/PROJECT ANMUL/Extended_Employee_Performance_and_Productivity_Data.csv"
data <- read.csv(file = raw_csv_path, stringsAsFactors = FALSE)
# Preview the first rows
head(data)
## Employee_ID Department Gender Age Job_Title Hire_Date
## 1 1 IT Male 55 Specialist 2022-01-19 08:03:05.556036
## 2 2 Finance Male 29 Developer 2024-04-18 08:03:05.556036
## 3 3 Finance Male 55 Specialist 2015-10-26 08:03:05.556036
## 4 4 Customer Support Female 48 Analyst 2016-10-22 08:03:05.556036
## 5 5 Engineering Female 36 Analyst 2021-07-23 08:03:05.556036
## 6 6 IT Male 43 Manager 2016-08-14 08:03:05.556036
## Years_At_Company Education_Level Performance_Score Monthly_Salary
## 1 2 High School 5 6750
## 2 0 High School 5 7500
## 3 8 High School 3 5850
## 4 7 Bachelor 2 4800
## 5 3 Bachelor 2 4800
## 6 8 High School 3 7800
## Work_Hours_Per_Week Projects_Handled Overtime_Hours Sick_Days
## 1 33 32 22 2
## 2 34 34 13 14
## 3 37 27 6 3
## 4 52 10 28 12
## 5 38 11 29 13
## 6 46 31 8 0
## Remote_Work_Frequency Team_Size Training_Hours Promotions
## 1 0 14 66 0
## 2 100 12 61 2
## 3 50 10 1 0
## 4 100 10 0 1
## 5 100 15 9 1
## 6 100 15 95 0
## Employee_Satisfaction_Score Resigned
## 1 2.63 False
## 2 1.72 False
## 3 3.17 False
## 4 1.86 False
## 5 1.25 False
## 6 2.77 False
# Add a synthetic Marital_Status column: every row receives one of three
# labels drawn at random with replacement. The fixed seed keeps the draw
# reproducible between runs.
set.seed(123)
status_options <- c("Single", "Married", "Divorced")
data[["Marital_Status"]] <- sample(
  x = status_options,
  size = nrow(data),
  replace = TRUE
)
# Preview the result, now including the new column
head(data)
## Employee_ID Department Gender Age Job_Title Hire_Date
## 1 1 IT Male 55 Specialist 2022-01-19 08:03:05.556036
## 2 2 Finance Male 29 Developer 2024-04-18 08:03:05.556036
## 3 3 Finance Male 55 Specialist 2015-10-26 08:03:05.556036
## 4 4 Customer Support Female 48 Analyst 2016-10-22 08:03:05.556036
## 5 5 Engineering Female 36 Analyst 2021-07-23 08:03:05.556036
## 6 6 IT Male 43 Manager 2016-08-14 08:03:05.556036
## Years_At_Company Education_Level Performance_Score Monthly_Salary
## 1 2 High School 5 6750
## 2 0 High School 5 7500
## 3 8 High School 3 5850
## 4 7 Bachelor 2 4800
## 5 3 Bachelor 2 4800
## 6 8 High School 3 7800
## Work_Hours_Per_Week Projects_Handled Overtime_Hours Sick_Days
## 1 33 32 22 2
## 2 34 34 13 14
## 3 37 27 6 3
## 4 52 10 28 12
## 5 38 11 29 13
## 6 46 31 8 0
## Remote_Work_Frequency Team_Size Training_Hours Promotions
## 1 0 14 66 0
## 2 100 12 61 2
## 3 50 10 1 0
## 4 100 10 0 1
## 5 100 15 9 1
## 6 100 15 95 0
## Employee_Satisfaction_Score Resigned Marital_Status
## 1 2.63 False Divorced
## 2 1.72 False Divorced
## 3 3.17 False Divorced
## 4 1.86 False Married
## 5 1.25 False Divorced
## 6 2.77 False Married
# Save the augmented dataset to a new CSV file, then load it back from that
# same file so the rest of the analysis works on the stored copy.
augmented_csv_path <- "C:/Users/sitiz/Desktop/KULIAH SEMESTER 4/ANALISIS MULTIVARIAT/PROJECT ANMUL/Employee_Data_with_MaritalStatus.csv"
write.csv(data, file = augmented_csv_path, row.names = FALSE)
data <- read.csv(file = augmented_csv_path, stringsAsFactors = FALSE)
# Inspect the column types and a few leading values
str(data)
## 'data.frame': 100000 obs. of 21 variables:
## $ Employee_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Department : chr "IT" "Finance" "Finance" "Customer Support" ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Age : int 55 29 55 48 36 43 37 55 55 45 ...
## $ Job_Title : chr "Specialist" "Developer" "Specialist" "Analyst" ...
## $ Hire_Date : chr "2022-01-19 08:03:05.556036" "2024-04-18 08:03:05.556036" "2015-10-26 08:03:05.556036" "2016-10-22 08:03:05.556036" ...
## $ Years_At_Company : int 2 0 8 7 3 8 1 9 1 7 ...
## $ Education_Level : chr "High School" "High School" "High School" "Bachelor" ...
## $ Performance_Score : int 5 5 3 2 2 3 5 2 2 1 ...
## $ Monthly_Salary : int 6750 7500 5850 4800 4800 7800 5250 7200 4200 6050 ...
## $ Work_Hours_Per_Week : int 33 34 37 52 38 46 55 42 51 41 ...
## $ Projects_Handled : int 32 34 27 10 11 31 20 46 23 33 ...
## $ Overtime_Hours : int 22 13 6 28 29 8 29 7 21 2 ...
## $ Sick_Days : int 2 14 3 12 13 0 2 8 14 6 ...
## $ Remote_Work_Frequency : int 0 100 50 100 100 100 0 100 0 75 ...
## $ Team_Size : int 14 12 10 10 15 15 16 7 1 4 ...
## $ Training_Hours : int 66 61 1 0 9 95 27 64 0 53 ...
## $ Promotions : int 0 2 0 1 1 0 0 0 1 2 ...
## $ Employee_Satisfaction_Score: num 2.63 1.72 3.17 1.86 1.25 2.77 4.46 2.09 1.44 2.93 ...
## $ Resigned : chr "False" "False" "False" "False" ...
## $ Marital_Status : chr "Divorced" "Divorced" "Divorced" "Married" ...
summary(data)
## Employee_ID Department Gender Age
## Min. : 1 Length:100000 Length:100000 Min. :22.00
## 1st Qu.: 25001 Class :character Class :character 1st Qu.:31.00
## Median : 50001 Mode :character Mode :character Median :41.00
## Mean : 50001 Mean :41.03
## 3rd Qu.: 75000 3rd Qu.:51.00
## Max. :100000 Max. :60.00
## Job_Title Hire_Date Years_At_Company Education_Level
## Length:100000 Length:100000 Min. : 0.000 Length:100000
## Class :character Class :character 1st Qu.: 2.000 Class :character
## Mode :character Mode :character Median : 4.000 Mode :character
## Mean : 4.476
## 3rd Qu.: 7.000
## Max. :10.000
## Performance_Score Monthly_Salary Work_Hours_Per_Week Projects_Handled
## Min. :1.000 Min. :3850 Min. :30.00 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:5250 1st Qu.:37.00 1st Qu.:12.00
## Median :3.000 Median :6500 Median :45.00 Median :24.00
## Mean :2.995 Mean :6403 Mean :44.96 Mean :24.43
## 3rd Qu.:4.000 3rd Qu.:7500 3rd Qu.:53.00 3rd Qu.:37.00
## Max. :5.000 Max. :9000 Max. :60.00 Max. :49.00
## Overtime_Hours Sick_Days Remote_Work_Frequency Team_Size
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.00
## 1st Qu.: 7.00 1st Qu.: 3.000 1st Qu.: 25.00 1st Qu.: 5.00
## Median :15.00 Median : 7.000 Median : 50.00 Median :10.00
## Mean :14.51 Mean : 7.009 Mean : 50.09 Mean :10.01
## 3rd Qu.:22.00 3rd Qu.:11.000 3rd Qu.: 75.00 3rd Qu.:15.00
## Max. :29.00 Max. :14.000 Max. :100.00 Max. :19.00
## Training_Hours Promotions Employee_Satisfaction_Score
## Min. : 0.00 Min. :0.0000 Min. :1.000
## 1st Qu.:25.00 1st Qu.:0.0000 1st Qu.:2.010
## Median :49.00 Median :1.0000 Median :3.000
## Mean :49.51 Mean :0.9997 Mean :2.999
## 3rd Qu.:75.00 3rd Qu.:2.0000 3rd Qu.:3.990
## Max. :99.00 Max. :2.0000 Max. :5.000
## Resigned Marital_Status
## Length:100000 Length:100000
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Preprocessing
# Count the missing values (NA) in every column
vapply(data, function(column) sum(is.na(column)), numeric(1))
## Employee_ID Department
## 0 0
## Gender Age
## 0 0
## Job_Title Hire_Date
## 0 0
## Years_At_Company Education_Level
## 0 0
## Performance_Score Monthly_Salary
## 0 0
## Work_Hours_Per_Week Projects_Handled
## 0 0
## Overtime_Hours Sick_Days
## 0 0
## Remote_Work_Frequency Team_Size
## 0 0
## Training_Hours Promotions
## 0 0
## Employee_Satisfaction_Score Resigned
## 0 0
## Marital_Status
## 0
# Encode the categorical columns as factors
categorical_cols <- c(
  "Job_Title", "Department", "Gender", "Education_Level", "Marital_Status"
)
data[categorical_cols] <- lapply(data[categorical_cols], as.factor)
# Standardise the numeric columns with scale() (default centring and scaling).
# The columns are overwritten in place, so the raw values are no longer
# available in `data` after this step.
num_cols <- c(
  "Monthly_Salary", "Employee_Satisfaction_Score", "Age", "Years_At_Company"
)
data[num_cols] <- scale(data[num_cols])
Eksplorasi Data dan Visualisasi
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Descriptive statistics for the columns listed in num_cols
describe(data[num_cols])
## vars n mean sd median trimmed mad min max
## Monthly_Salary 1 1e+05 0 1 0.07 -0.01 1.19 -1.86 1.89
## Employee_Satisfaction_Score 2 1e+05 0 1 0.00 0.00 1.28 -1.74 1.74
## Age 3 1e+05 0 1 0.00 0.00 1.32 -1.69 1.69
## Years_At_Company 4 1e+05 0 1 -0.17 0.00 1.03 -1.56 1.93
## range skew kurtosis se
## Monthly_Salary 3.75 0.10 -0.88 0
## Employee_Satisfaction_Score 3.48 0.00 -1.19 0
## Age 3.38 -0.01 -1.20 0
## Years_At_Company 3.49 0.01 -1.22 0
summary(data[, c("Performance_Score", "Employee_Satisfaction_Score", "Age", "Years_At_Company", "Monthly_Salary")])
## Performance_Score Employee_Satisfaction_Score Age
## Min. :1.000 Min. :-1.7372510 Min. :-1.692388
## 1st Qu.:2.000 1st Qu.:-0.8595392 1st Qu.:-0.891969
## Median :3.000 Median : 0.0007922 Median :-0.002616
## Mean :2.995 Mean : 0.0000000 Mean : 0.000000
## 3rd Qu.:4.000 3rd Qu.: 0.8611236 3rd Qu.: 0.886738
## Max. :5.000 Max. : 1.7388354 Max. : 1.687156
## Years_At_Company Monthly_Salary
## Min. :-1.5600 Min. :-1.86025
## 1st Qu.:-0.8629 1st Qu.:-0.84022
## Median :-0.1659 Median : 0.07052
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.8796 3rd Qu.: 0.79911
## Max. : 1.9252 Max. : 1.89200
# Category frequencies: number of rows per Gender level
table(data$Gender)
##
## Female Male Other
## 48001 48031 3968
prop.table(table(data$Marital_Status))
##
## Divorced Married Single
## 0.33294 0.33305 0.33401
# Normality test (Shapiro-Wilk) on Monthly_Salary, restricted to rows 1-500
# NOTE(review): rows 1:500 are taken in file order rather than drawn at
# random; confirm this subset is intended.
shapiro.test(data$Monthly_Salary[1:500])
##
## Shapiro-Wilk normality test
##
## data: data$Monthly_Salary[1:500]
## W = 0.97062, p-value = 1.825e-08
shapiro.test(data$Employee_Satisfaction_Score[1:500])
##
## Shapiro-Wilk normality test
##
## data: data$Employee_Satisfaction_Score[1:500]
## W = 0.95419, p-value = 2.442e-11
shapiro.test(data$Age[1:500])
##
## Shapiro-Wilk normality test
##
## data: data$Age[1:500]
## W = 0.95513, p-value = 3.403e-11
shapiro.test(data$Years_At_Company[1:500])
##
## Shapiro-Wilk normality test
##
## data: data$Years_At_Company[1:500]
## W = 0.94204, p-value = 4.667e-13
# Boxplot: draw one boxplot for each column named in num_cols.
for (col in num_cols) {
  print(
    # `.data[[col]]` looks the column up by its string name. It replaces
    # aes_string(), which has been deprecated since ggplot2 3.0.0.
    ggplot(data, aes(y = .data[[col]])) +
      geom_boxplot(fill = "skyblue") +
      # Keep the plain column name as the axis label
      labs(y = col) +
      ggtitle(paste("Boxplot of", col))
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.




# Histogram: draw one 30-bin histogram for each column named in num_cols.
for (col in num_cols) {
  print(
    # `.data[[col]]` looks the column up by its string name. It replaces
    # aes_string(), which has been deprecated since ggplot2 3.0.0.
    ggplot(data, aes(x = .data[[col]])) +
      geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
      # Keep the plain column name as the axis label
      labs(x = col) +
      ggtitle(paste("Histogram of", col))
  )
}




# Correlation matrix of the columns in num_cols, computed on complete rows
cor_matrix <- cor(x = data[, num_cols], use = "complete.obs")
print(cor_matrix)
## Monthly_Salary Employee_Satisfaction_Score
## Monthly_Salary 1.0000000000 0.0010827334
## Employee_Satisfaction_Score 0.0010827334 1.0000000000
## Age 0.0027567514 -0.0001242389
## Years_At_Company -0.0006446329 -0.0031801797
## Age Years_At_Company
## Monthly_Salary 2.756751e-03 -6.446329e-04
## Employee_Satisfaction_Score -1.242389e-04 -3.180180e-03
## Age 1.000000e+00 7.807067e-05
## Years_At_Company 7.807067e-05 1.000000e+00
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex = 0.8)

Uji Asumsi MANOVA dan MANCOVA
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## ---
## biotools version 4.3
# Draw a reproducible sample of 1000 rows for the assumption checks
set.seed(123)
data_sample <- sample_n(data, 1000)
# Multivariate normality test on the two dependent variables, with a Q-Q plot
mvn(
  data_sample[, c("Monthly_Salary", "Employee_Satisfaction_Score")],
  multivariatePlot = "qq"
)

## $multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 7.40563 0 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Monthly_Salary 6.3440 <0.001 NO
## 2 Anderson-Darling Employee_Satisfaction_Score 11.7781 <0.001 NO
##
## $Descriptives
## n Mean Std.Dev Median Min
## Monthly_Salary 1000 0.028079239 1.018747 0.14337905 -1.860251
## Employee_Satisfaction_Score 1000 0.009612769 1.026405 -0.02527845 -1.737251
## Max 25th 75th Skew Kurtosis
## Monthly_Salary 1.892002 -0.8402213 0.7991126 0.06847576 -0.9013698
## Employee_Satisfaction_Score 1.738835 -0.8943000 0.9328178 0.03102002 -1.2125339
# Correlation between the two dependent variables within the sample
cor(data_sample[, c("Monthly_Salary", "Employee_Satisfaction_Score")])
## Monthly_Salary Employee_Satisfaction_Score
## Monthly_Salary 1.00000000 -0.01189362
## Employee_Satisfaction_Score -0.01189362 1.00000000
# Homogeneity test: Box's M test of the covariance matrices of the two
# dependent variables across the Gender groups of the sample
# NOTE(review): only Gender is used as the grouping variable here; confirm
# whether the other factors used in the models should be checked as well.
boxM(data_sample[, c("Monthly_Salary", "Employee_Satisfaction_Score")], grouping = data_sample$Gender)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data_sample[, c("Monthly_Salary", "Employee_Satisfaction_Score")]
## Chi-Sq (approx.) = 4.0848, df = 6, p-value = 0.6652
MANOVA
# Make sure every grouping variable is stored as a factor before fitting
factor_cols <- c(
  "Job_Title", "Department", "Gender", "Education_Level", "Marital_Status"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
# MANOVA: two responses (satisfaction score and monthly salary) modelled on
# five categorical factors
manova_model <- manova(
  cbind(Employee_Satisfaction_Score, Monthly_Salary) ~
    Department + Job_Title + Gender + Education_Level + Marital_Status,
  data = data
)
# Multivariate test table using Pillai's trace
summary(manova_model, test = "Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## Department 8 0.00051 3.2 16 199956 1.587e-05 ***
## Job_Title 6 0.73332 9646.7 12 199956 < 2.2e-16 ***
## Gender 2 0.00003 0.9 4 199956 0.4923
## Education_Level 3 0.00008 1.4 6 199956 0.2237
## Marital_Status 2 0.00002 0.6 4 199956 0.6684
## Residuals 99978
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Follow-up univariate ANOVA table for each response of the MANOVA fit
summary.aov(manova_model)
## Response Employee_Satisfaction_Score :
## Df Sum Sq Mean Sq F value Pr(>F)
## Department 8 22 2.71027 2.7105 0.00554 **
## Job_Title 6 3 0.57659 0.5766 0.74930
## Gender 2 2 1.12732 1.1274 0.32387
## Education_Level 3 1 0.39323 0.3933 0.75785
## Marital_Status 2 1 0.38402 0.3841 0.68110
## Residuals 99978 99970 0.99992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Monthly_Salary :
## Df Sum Sq Mean Sq F value Pr(>F)
## Department 8 8 1.0 3.6704 0.0002741 ***
## Job_Title 6 73320 12220.0 45811.9130 < 2.2e-16 ***
## Gender 2 0 0.2 0.5726 0.5640354
## Education_Level 3 2 0.6 2.3391 0.0713578 .
## Marital_Status 2 0 0.2 0.8017 0.4485741
## Residuals 99978 26668 0.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MANCOVA
# MANCOVA: the same two responses and five factors as the MANOVA model, plus
# the covariates Years_At_Company and Age.
# manova() tests the terms sequentially, in formula order. The covariates are
# therefore listed first so that every factor is tested after adjusting for
# them; with the covariates placed last, the factor rows were tested without
# any covariate adjustment.
mancova_model <- manova(
  cbind(Employee_Satisfaction_Score, Monthly_Salary) ~
    Years_At_Company + Age +
    Department + Job_Title + Gender + Education_Level + Marital_Status,
  data = data
)
# Multivariate test table using Pillai's trace
summary(mancova_model, test = "Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## Department 8 0.00051 3.2 16 199952 1.587e-05 ***
## Job_Title 6 0.73332 9646.6 12 199952 < 2.2e-16 ***
## Gender 2 0.00003 0.9 4 199952 0.4923
## Education_Level 3 0.00008 1.4 6 199952 0.2237
## Marital_Status 2 0.00002 0.6 4 199952 0.6684
## Years_At_Company 1 0.00001 0.6 2 99975 0.5378
## Age 1 0.00000 0.2 2 99975 0.8502
## Residuals 99976
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Follow-up univariate ANOVA table for each response of the MANCOVA fit
summary.aov(mancova_model)
## Response Employee_Satisfaction_Score :
## Df Sum Sq Mean Sq F value Pr(>F)
## Department 8 22 2.71027 2.7105 0.00554 **
## Job_Title 6 3 0.57659 0.5766 0.74931
## Gender 2 2 1.12732 1.1274 0.32388
## Education_Level 3 1 0.39323 0.3933 0.75786
## Marital_Status 2 1 0.38402 0.3840 0.68110
## Years_At_Company 1 1 0.99174 0.9918 0.31930
## Age 1 0 0.00498 0.0050 0.94373
## Residuals 99976 99969 0.99993
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Monthly_Salary :
## Df Sum Sq Mean Sq F value Pr(>F)
## Department 8 8 1.0 3.6704 0.0002742 ***
## Job_Title 6 73320 12220.0 45811.2561 < 2.2e-16 ***
## Gender 2 0 0.2 0.5726 0.5640400
## Education_Level 3 2 0.6 2.3390 0.0713610 .
## Marital_Status 2 0 0.2 0.8017 0.4485792
## Years_At_Company 1 0 0.1 0.2468 0.6193270
## Age 1 0 0.1 0.3195 0.5719218
## Residuals 99976 26668 0.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Additional tests: report all four multivariate statistics for the MANOVA.
# summary() on a manova fit takes a single `test` value per call; when all
# four names are passed at once only the first one ("Pillai") is used, so
# each statistic is requested in its own call.
for (test_name in c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")) {
  print(summary(manova_model, test = test_name))
}
## Df Pillai approx F num Df den Df Pr(>F)
## Department 8 0.00051 3.2 16 199956 1.587e-05 ***
## Job_Title 6 0.73332 9646.7 12 199956 < 2.2e-16 ***
## Gender 2 0.00003 0.9 4 199956 0.4923
## Education_Level 3 0.00008 1.4 6 199956 0.2237
## Marital_Status 2 0.00002 0.6 4 199956 0.6684
## Residuals 99978
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Report all four multivariate statistics for the MANCOVA.
# summary() on a manova fit takes a single `test` value per call; when all
# four names are passed at once only the first one ("Pillai") is used, so
# each statistic is requested in its own call.
for (test_name in c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")) {
  print(summary(mancova_model, test = test_name))
}
## Df Pillai approx F num Df den Df Pr(>F)
## Department 8 0.00051 3.2 16 199952 1.587e-05 ***
## Job_Title 6 0.73332 9646.6 12 199952 < 2.2e-16 ***
## Gender 2 0.00003 0.9 4 199952 0.4923
## Education_Level 3 0.00008 1.4 6 199952 0.2237
## Marital_Status 2 0.00002 0.6 4 199952 0.6684
## Years_At_Company 1 0.00001 0.6 2 99975 0.5378
## Age 1 0.00000 0.2 2 99975 0.8502
## Residuals 99976
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1