Linear Discriminant Analysis (LDA) adalah sebuah metode statistik supervised yang digunakan untuk dua tujuan utama: klasifikasi dan reduksi dimensi. Sebagai metode klasifikasi, LDA bekerja dengan mencari satu set kombinasi linear dari variabel prediktor, yang dikenal sebagai fungsi diskriminan, yang mampu memaksimalkan pemisahan (separasi) antara dua atau lebih kelas atau kelompok yang telah ditentukan sebelumnya. Sedangkan Ordinal Logistic Regression (OLR), sering juga disebut sebagai Proportional Odds Model, adalah ekstensi dari regresi logistik biner yang dirancang khusus untuk memodelkan variabel dependen yang bersifat ordinal. Variabel ordinal adalah variabel kategorikal di mana terdapat urutan atau tingkatan yang jelas antar kategorinya (contoh: ‘rendah’, ‘sedang’, ‘tinggi’; atau ‘tidak setuju’, ‘netral’, ‘setuju’).
# Install dan load package
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
# Load dataset
library(readr)
data <- read_csv("D:/Tugas SMT 4/analisis multivariat/data.csv")
## Rows: 3338 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (20): GameID, LeagueIndex, Age, HoursPerWeek, TotalHours, APM, SelectByH...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Cek jumlah baris dan kolom
dim(data) # atau dim(data) jika sebelum preprocessing
## [1] 3338 20
# cek duplikat data
sum(duplicated(data))
## [1] 0
# Cek nama kolom
colnames(data)
## [1] "GameID" "LeagueIndex" "Age"
## [4] "HoursPerWeek" "TotalHours" "APM"
## [7] "SelectByHotkeys" "AssignToHotkeys" "UniqueHotkeys"
## [10] "MinimapAttacks" "MinimapRightClicks" "NumberOfPACs"
## [13] "GapBetweenPACs" "ActionLatency" "ActionsInPAC"
## [16] "TotalMapExplored" "WorkersMade" "UniqueUnitsMade"
## [19] "ComplexUnitsMade" "ComplexAbilitiesUsed"
# Cek missing values
colSums(is.na(data))
## GameID LeagueIndex Age
## 0 0 0
## HoursPerWeek TotalHours APM
## 0 0 0
## SelectByHotkeys AssignToHotkeys UniqueHotkeys
## 0 0 0
## MinimapAttacks MinimapRightClicks NumberOfPACs
## 0 0 0
## GapBetweenPACs ActionLatency ActionsInPAC
## 0 0 0
## TotalMapExplored WorkersMade UniqueUnitsMade
## 0 0 0
## ComplexUnitsMade ComplexAbilitiesUsed
## 0 0
# Statistik ringkasan untuk semua kolom
summary(data)
## GameID LeagueIndex Age HoursPerWeek
## Min. : 52 Min. :1.000 Min. :16.00 Min. : 0.00
## 1st Qu.:2423 1st Qu.:3.000 1st Qu.:19.00 1st Qu.: 8.00
## Median :4788 Median :4.000 Median :21.00 Median : 12.00
## Mean :4720 Mean :4.121 Mean :21.65 Mean : 15.91
## 3rd Qu.:6995 3rd Qu.:5.000 3rd Qu.:24.00 3rd Qu.: 20.00
## Max. :9271 Max. :7.000 Max. :44.00 Max. :168.00
## TotalHours APM SelectByHotkeys AssignToHotkeys
## Min. : 3.0 Min. : 22.06 Min. :0.000000 Min. :0.0000000
## 1st Qu.: 300.0 1st Qu.: 79.23 1st Qu.:0.001245 1st Qu.:0.0002017
## Median : 500.0 Median :107.07 Median :0.002445 Median :0.0003487
## Mean : 960.4 Mean :114.58 Mean :0.004023 Mean :0.0003641
## 3rd Qu.: 800.0 3rd Qu.:140.16 3rd Qu.:0.004945 3rd Qu.:0.0004929
## Max. :1000000.0 Max. :389.83 Max. :0.043088 Max. :0.0016483
## UniqueHotkeys MinimapAttacks MinimapRightClicks NumberOfPACs
## Min. : 0.000 Min. :0.000e+00 Min. :0.0000000 Min. :0.000679
## 1st Qu.: 3.000 1st Qu.:0.000e+00 1st Qu.:0.0001388 1st Qu.:0.002743
## Median : 4.000 Median :3.864e-05 Median :0.0002784 Median :0.003376
## Mean : 4.316 Mean :9.378e-05 Mean :0.0003802 Mean :0.003433
## 3rd Qu.: 6.000 3rd Qu.:1.134e-04 3rd Qu.:0.0005076 3rd Qu.:0.004003
## Max. :10.000 Max. :3.019e-03 Max. :0.0036877 Max. :0.007971
## GapBetweenPACs ActionLatency ActionsInPAC TotalMapExplored
## Min. : 6.667 Min. : 24.63 Min. : 2.039 Min. : 5.00
## 1st Qu.: 29.327 1st Qu.: 50.89 1st Qu.: 4.262 1st Qu.:17.00
## Median : 37.059 Median : 61.30 Median : 5.087 Median :22.00
## Mean : 40.714 Mean : 64.21 Mean : 5.267 Mean :22.12
## 3rd Qu.: 48.510 3rd Qu.: 74.03 3rd Qu.: 6.027 3rd Qu.:27.00
## Max. :237.143 Max. :176.37 Max. :18.558 Max. :58.00
## WorkersMade UniqueUnitsMade ComplexUnitsMade ComplexAbilitiesUsed
## Min. :7.698e-05 Min. : 2.000 Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.:6.818e-04 1st Qu.: 5.000 1st Qu.:0.000e+00 1st Qu.:0.000e+00
## Median :9.042e-04 Median : 6.000 Median :0.000e+00 Median :2.043e-05
## Mean :1.031e-03 Mean : 6.541 Mean :5.998e-05 Mean :1.419e-04
## 3rd Qu.:1.258e-03 3rd Qu.: 8.000 3rd Qu.:8.742e-05 3rd Qu.:1.823e-04
## Max. :5.149e-03 Max. :13.000 Max. :9.023e-04 Max. :3.084e-03
# Bar chart LeagueIndex
library(ggplot2)
ggplot(data, aes(x = LeagueIndex)) +
geom_bar(fill = "#2c7fb8") +
labs(title = "Distribusi Tingkatan Liga", x = "LeagueIndex", y = "Jumlah")
# Loop histogram untuk semua variabel numerik
num_cols <- c('GameID','LeagueIndex','Age','HoursPerWeek','TotalHours','APM','SelectByHotkeys','AssignToHotkeys','UniqueHotkeys','MinimapAttacks','MinimapRightClicks','NumberOfPACs','GapBetweenPACs','ActionLatency','ActionsInPAC','TotalMapExplored','WorkersMade','UniqueUnitsMade','ComplexUnitsMade','ComplexAbilitiesUsed')
for (col in num_cols) {
print(
ggplot(data, aes_string(x = col)) +
geom_histogram(bins = 30, fill = "#1a9641", color = "white") +
labs(title = paste("Distribusi", col), x = col, y = "Frekuensi")
) # Add the closing parenthesis here
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Korelasi dan heatmap
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
cor_matrix <- cor(data[, num_cols])
corrplot(cor_matrix, method = "color", tl.cex = 0.8)
# cek outlier menggunakan boxplot untuk semua variabel numerik
for (col in num_cols) {
print(
ggplot(data, aes_string(y = col)) +
geom_boxplot(fill = "#d7191c") +
labs(title = paste("Boxplot", col), y = col)
)
}
# hapus kolom Game_Id
data <- subset(data, select = -GameID)
colnames(data)
## [1] "LeagueIndex" "Age" "HoursPerWeek"
## [4] "TotalHours" "APM" "SelectByHotkeys"
## [7] "AssignToHotkeys" "UniqueHotkeys" "MinimapAttacks"
## [10] "MinimapRightClicks" "NumberOfPACs" "GapBetweenPACs"
## [13] "ActionLatency" "ActionsInPAC" "TotalMapExplored"
## [16] "WorkersMade" "UniqueUnitsMade" "ComplexUnitsMade"
## [19] "ComplexAbilitiesUsed"
# Handling Outlier
Q1 <- quantile(data$MinimapRightClicks, 0.25)
Q3 <- quantile(data$MinimapRightClicks, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
data_clean <- subset(data, MinimapRightClicks >= lower_bound & MinimapRightClicks <= upper_bound)
boxplot(data_clean$MinimapRightClicks, col = "red", main = "Boxplot setelah Outlier Dihapus")
#handling outlier
data_clean <- subset(data_clean, TotalHours <= 25000)
boxplot(data_clean$TotalHours, col = "blue", main = "Boxplot TotalHours setelah Outlier > 25000 Dihapus")
# Menstandarkan variabel numerik
num_cols <- c('HoursPerWeek', 'TotalHours','SelectByHotkeys',
'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks',
'MinimapRightClicks', 'GapBetweenPACs',
'ActionLatency', 'ActionsInPAC', 'TotalMapExplored',
'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade',
'ComplexAbilitiesUsed')
# Create data_scale as a copy of the original data frame
data_scaled <- data
data_scaled[num_cols] <- scale(data[num_cols])
# Cek hasil
head(data_scaled)
## # A tibble: 6 × 19
## LeagueIndex Age HoursPerWeek TotalHours APM SelectByHotkeys
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 27 -0.494 0.118 144. -0.108
## 2 5 23 -0.494 0.233 129. -0.152
## 3 4 30 -0.494 -0.0439 70.0 -0.618
## 4 3 19 0.342 -0.0324 108. -0.633
## 5 3 32 -0.494 -0.0266 123. -0.611
## 6 2 27 -0.828 -0.0514 44.5 -0.644
## # ℹ 13 more variables: AssignToHotkeys <dbl>, UniqueHotkeys <dbl>,
## # MinimapAttacks <dbl>, MinimapRightClicks <dbl>, NumberOfPACs <dbl>,
## # GapBetweenPACs <dbl>, ActionLatency <dbl>, ActionsInPAC <dbl>,
## # TotalMapExplored <dbl>, WorkersMade <dbl>, UniqueUnitsMade <dbl>,
## # ComplexUnitsMade <dbl>, ComplexAbilitiesUsed <dbl>
summary(data_scaled)
## LeagueIndex Age HoursPerWeek TotalHours
## Min. :1.000 Min. :16.00 Min. :-1.3297 Min. :-0.05528
## 1st Qu.:3.000 1st Qu.:19.00 1st Qu.:-0.6611 1st Qu.:-0.03813
## Median :4.000 Median :21.00 Median :-0.3268 Median :-0.02659
## Mean :4.121 Mean :21.65 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.:5.000 3rd Qu.:24.00 3rd Qu.: 0.3419 3rd Qu.:-0.00926
## Max. :7.000 Max. :44.00 Max. :12.7118 Max. :57.68748
## APM SelectByHotkeys AssignToHotkeys UniqueHotkeys
## Min. : 22.06 Min. :-0.8512 Min. :-1.73386 Min. :-1.8499
## 1st Qu.: 79.23 1st Qu.:-0.5879 1st Qu.:-0.77334 1st Qu.:-0.5642
## Median :107.07 Median :-0.3339 Median :-0.07359 Median :-0.1356
## Mean :114.58 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:140.16 3rd Qu.: 0.1950 3rd Qu.: 0.61285 3rd Qu.: 0.7216
## Max. :389.83 Max. : 8.2653 Max. : 6.11437 Max. : 2.4359
## MinimapAttacks MinimapRightClicks NumberOfPACs GapBetweenPACs
## Min. :-0.5899 Min. :-1.0577 Min. :0.000679 Min. :-1.9961
## 1st Qu.:-0.5899 1st Qu.:-0.6716 1st Qu.:0.002743 1st Qu.:-0.6676
## Median :-0.3468 Median :-0.2833 Median :0.003376 Median :-0.2143
## Mean : 0.0000 Mean : 0.0000 Mean :0.003433 Mean : 0.0000
## 3rd Qu.: 0.1237 3rd Qu.: 0.3542 3rd Qu.:0.004003 3rd Qu.: 0.4571
## Max. :18.4020 Max. : 9.2003 Max. :0.007971 Max. :11.5159
## ActionLatency ActionsInPAC TotalMapExplored WorkersMade
## Min. :-2.0789 Min. :-2.1512 Min. :-2.3004 Min. :-1.8334
## 1st Qu.:-0.6998 1st Qu.:-0.6700 1st Qu.:-0.6877 1st Qu.:-0.6710
## Median :-0.1530 Median :-0.1199 Median :-0.0157 Median :-0.2436
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5160 3rd Qu.: 0.5067 3rd Qu.: 0.6563 3rd Qu.: 0.4371
## Max. : 5.8917 Max. : 8.8572 Max. : 4.8224 Max. : 7.9142
## UniqueUnitsMade ComplexUnitsMade ComplexAbilitiesUsed
## Min. :-2.4427 Min. :-0.5379 Min. :-0.5343
## 1st Qu.:-0.8289 1st Qu.:-0.5379 1st Qu.:-0.5343
## Median :-0.2910 Median :-0.5379 Median :-0.4574
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7848 3rd Qu.: 0.2460 3rd Qu.: 0.1520
## Max. : 3.4743 Max. : 7.5535 Max. :11.0740
head(data_scaled)
## # A tibble: 6 × 19
## LeagueIndex Age HoursPerWeek TotalHours APM SelectByHotkeys
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 27 -0.494 0.118 144. -0.108
## 2 5 23 -0.494 0.233 129. -0.152
## 3 4 30 -0.494 -0.0439 70.0 -0.618
## 4 3 19 0.342 -0.0324 108. -0.633
## 5 3 32 -0.494 -0.0266 123. -0.611
## 6 2 27 -0.828 -0.0514 44.5 -0.644
## # ℹ 13 more variables: AssignToHotkeys <dbl>, UniqueHotkeys <dbl>,
## # MinimapAttacks <dbl>, MinimapRightClicks <dbl>, NumberOfPACs <dbl>,
## # GapBetweenPACs <dbl>, ActionLatency <dbl>, ActionsInPAC <dbl>,
## # TotalMapExplored <dbl>, WorkersMade <dbl>, UniqueUnitsMade <dbl>,
## # ComplexUnitsMade <dbl>, ComplexAbilitiesUsed <dbl>
summary(data_scaled)
## LeagueIndex Age HoursPerWeek TotalHours
## Min. :1.000 Min. :16.00 Min. :-1.3297 Min. :-0.05528
## 1st Qu.:3.000 1st Qu.:19.00 1st Qu.:-0.6611 1st Qu.:-0.03813
## Median :4.000 Median :21.00 Median :-0.3268 Median :-0.02659
## Mean :4.121 Mean :21.65 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.:5.000 3rd Qu.:24.00 3rd Qu.: 0.3419 3rd Qu.:-0.00926
## Max. :7.000 Max. :44.00 Max. :12.7118 Max. :57.68748
## APM SelectByHotkeys AssignToHotkeys UniqueHotkeys
## Min. : 22.06 Min. :-0.8512 Min. :-1.73386 Min. :-1.8499
## 1st Qu.: 79.23 1st Qu.:-0.5879 1st Qu.:-0.77334 1st Qu.:-0.5642
## Median :107.07 Median :-0.3339 Median :-0.07359 Median :-0.1356
## Mean :114.58 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:140.16 3rd Qu.: 0.1950 3rd Qu.: 0.61285 3rd Qu.: 0.7216
## Max. :389.83 Max. : 8.2653 Max. : 6.11437 Max. : 2.4359
## MinimapAttacks MinimapRightClicks NumberOfPACs GapBetweenPACs
## Min. :-0.5899 Min. :-1.0577 Min. :0.000679 Min. :-1.9961
## 1st Qu.:-0.5899 1st Qu.:-0.6716 1st Qu.:0.002743 1st Qu.:-0.6676
## Median :-0.3468 Median :-0.2833 Median :0.003376 Median :-0.2143
## Mean : 0.0000 Mean : 0.0000 Mean :0.003433 Mean : 0.0000
## 3rd Qu.: 0.1237 3rd Qu.: 0.3542 3rd Qu.:0.004003 3rd Qu.: 0.4571
## Max. :18.4020 Max. : 9.2003 Max. :0.007971 Max. :11.5159
## ActionLatency ActionsInPAC TotalMapExplored WorkersMade
## Min. :-2.0789 Min. :-2.1512 Min. :-2.3004 Min. :-1.8334
## 1st Qu.:-0.6998 1st Qu.:-0.6700 1st Qu.:-0.6877 1st Qu.:-0.6710
## Median :-0.1530 Median :-0.1199 Median :-0.0157 Median :-0.2436
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5160 3rd Qu.: 0.5067 3rd Qu.: 0.6563 3rd Qu.: 0.4371
## Max. : 5.8917 Max. : 8.8572 Max. : 4.8224 Max. : 7.9142
## UniqueUnitsMade ComplexUnitsMade ComplexAbilitiesUsed
## Min. :-2.4427 Min. :-0.5379 Min. :-0.5343
## 1st Qu.:-0.8289 1st Qu.:-0.5379 1st Qu.:-0.5343
## Median :-0.2910 Median :-0.5379 Median :-0.4574
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7848 3rd Qu.: 0.2460 3rd Qu.: 0.1520
## Max. : 3.4743 Max. : 7.5535 Max. :11.0740
str(data_scaled)
## tibble [3,338 × 19] (S3: tbl_df/tbl/data.frame)
## $ LeagueIndex : num [1:3338] 5 5 4 3 3 2 1 7 4 4 ...
## $ Age : num [1:3338] 27 23 30 19 32 27 21 17 20 18 ...
## $ HoursPerWeek : num [1:3338] -0.494 -0.494 -0.494 0.342 -0.494 ...
## $ TotalHours : num [1:3338] 0.1178 0.2333 -0.0439 -0.0324 -0.0266 ...
## $ APM : num [1:3338] 144 129 70 108 123 ...
## $ SelectByHotkeys : num [1:3338] -0.108 -0.152 -0.618 -0.633 -0.611 ...
## $ AssignToHotkeys : num [1:3338] -0.688 -0.498 -0.136 -0.719 -0.175 ...
## $ UniqueHotkeys : num [1:3338] 1.15 -0.136 -0.136 -1.421 -0.993 ...
## $ MinimapAttacks : num [1:3338] 0.101 1.26 1.257 -0.255 -0.59 ...
## $ MinimapRightClicks : num [1:3338] 0.0336 0.1452 0.2258 0.4539 2.6379 ...
## $ NumberOfPACs : num [1:3338] 0.00485 0.00431 0.00293 0.00378 0.00237 ...
## $ GapBetweenPACs : num [1:3338] -0.472 -0.457 0.231 -0.674 -1.057 ...
## $ ActionLatency : num [1:3338] -1.226 -1.148 0.585 -0.55 -0.112 ...
## $ ActionsInPAC : num [1:3338] -0.344 -0.282 -0.816 -0.234 2.737 ...
## $ TotalMapExplored : num [1:3338] 0.7907 -0.0157 -0.0157 -0.4189 -0.9565 ...
## $ WorkersMade : num [1:3338] 0.703 0.312 -0.55 -1.162 0.276 ...
## $ UniqueUnitsMade : num [1:3338] -0.291 -0.829 -0.291 0.247 -1.367 ...
## $ ComplexUnitsMade : num [1:3338] -0.538 -0.538 -0.538 -0.538 -0.538 ...
## $ ComplexAbilitiesUsed: num [1:3338] -0.534 0.247 0.176 0.91 -0.462 ...
prop.table(table(data_scaled$LeagueIndex))
##
## 1 2 3 4 5 6 7
## 0.05002996 0.10395446 0.16566806 0.24295986 0.24086279 0.18603954 0.01048532
#3Uji Asumsi
#LDA 1. Uji Multikolinearitas
# Pastikan dplyr sudah aktif
library(MASS)
# Pisahkan fitur numerik (selain LeagueIndex)
features <- data_scaled %>% dplyr::select(-LeagueIndex)
# Bagi data berdasarkan LeagueIndex
grouped_data <- split(data_scaled, data_scaled$LeagueIndex)
# Uji normalitas Shapiro-Wilk untuk tiap fitur per grup LeagueIndex
normality_results <- lapply(names(features), function(feat) {
sapply(grouped_data, function(group) {
# Pastikan kolom fitur ada dan tidak semua NA
if (nrow(group) >= 3 && all(!is.na(group[[feat]]))) {
tryCatch(
shapiro.test(group[[feat]])$p.value,
error = function(e) NA
)
} else {
NA
}
})
})
# Ubah hasil ke data.frame
normality_results_df <- as.data.frame(do.call(rbind, normality_results))
rownames(normality_results_df) <- names(features)
# Tampilkan hasil
normality_results_df
## 1 2 3 4
## Age 1.564557e-08 1.857704e-13 4.895264e-18 4.525719e-18
## HoursPerWeek 8.376279e-13 2.926029e-19 2.101601e-23 4.456902e-29
## TotalHours 1.145293e-13 1.645084e-18 1.452047e-38 2.135359e-46
## APM 1.769937e-08 1.186078e-07 6.062001e-13 1.743495e-13
## SelectByHotkeys 5.168161e-20 1.737062e-24 3.876843e-31 2.230163e-36
## AssignToHotkeys 3.227129e-09 1.991833e-10 9.923346e-11 1.045519e-16
## UniqueHotkeys 3.667612e-05 1.298086e-08 9.136284e-12 5.467023e-12
## MinimapAttacks 3.968334e-19 2.334363e-29 6.704285e-36 5.448659e-36
## MinimapRightClicks 7.911731e-14 6.904780e-22 1.554658e-25 2.262078e-31
## NumberOfPACs 6.618546e-03 1.504687e-01 1.022977e-02 1.053482e-01
## GapBetweenPACs 6.058565e-10 8.187020e-11 5.428698e-12 1.457525e-13
## ActionLatency 1.646470e-05 2.258143e-10 8.977229e-11 2.034559e-14
## ActionsInPAC 7.587403e-09 1.852385e-16 1.475459e-13 8.107886e-21
## TotalMapExplored 5.882548e-07 6.726613e-03 1.652148e-08 2.913655e-12
## WorkersMade 6.280443e-09 2.563764e-15 7.368747e-21 2.449994e-24
## UniqueUnitsMade 3.530102e-04 1.799087e-08 1.158809e-09 7.256471e-12
## ComplexUnitsMade 1.167487e-23 2.850728e-32 2.113443e-35 1.405968e-38
## ComplexAbilitiesUsed 7.930522e-22 8.495354e-32 8.935050e-37 2.579207e-39
## 5 6 7
## Age 2.035832e-15 1.472301e-11 5.791262e-02
## HoursPerWeek 3.141134e-27 1.019794e-27 4.682444e-03
## TotalHours 4.552724e-53 1.788244e-42 7.550364e-09
## APM 4.098781e-19 1.012480e-10 2.025266e-01
## SelectByHotkeys 9.654193e-35 6.154131e-27 5.888732e-03
## AssignToHotkeys 1.322127e-07 9.953129e-10 1.487034e-01
## UniqueHotkeys 1.223919e-10 1.314679e-10 8.702914e-02
## MinimapAttacks 5.461243e-41 2.895374e-32 2.358348e-04
## MinimapRightClicks 1.887322e-27 5.392883e-25 2.658611e-02
## NumberOfPACs 6.848102e-02 4.220428e-05 3.940142e-01
## GapBetweenPACs 1.615802e-10 1.695354e-12 7.404780e-01
## ActionLatency 5.157857e-09 2.596012e-08 2.166930e-01
## ActionsInPAC 5.698484e-27 5.045023e-17 3.226406e-01
## TotalMapExplored 4.572844e-07 3.417213e-09 9.784375e-02
## WorkersMade 2.227548e-24 5.800788e-22 1.743396e-04
## UniqueUnitsMade 6.811580e-12 3.372936e-09 2.678919e-01
## ComplexUnitsMade 9.849624e-36 1.721935e-32 3.018311e-07
## ComplexAbilitiesUsed 2.132116e-37 1.223818e-33 9.687512e-10
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
library(MASS)
boxM_result <- boxM(data_scaled[, -which(names(data_scaled) == "LeagueIndex")],
grouping = data_scaled$LeagueIndex)
boxM_result
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data_scaled[, -which(names(data_scaled) == "LeagueIndex")]
## Chi-Sq (approx.) = 22901, df = 1026, p-value < 2.2e-16
#Olr Uji Multikonileritas
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(dplyr)
# Fit model OLS sementara untuk cek VIF
model_ols <- lm(as.numeric(LeagueIndex) ~ ., data = data_scaled)
vif(model_ols)
## Age HoursPerWeek TotalHours
## 1.125685 1.098924 1.008281
## APM SelectByHotkeys AssignToHotkeys
## 36.723621 12.258322 1.612387
## UniqueHotkeys MinimapAttacks MinimapRightClicks
## 1.309584 1.136730 1.292985
## NumberOfPACs GapBetweenPACs ActionLatency
## 13.724444 2.252419 5.332298
## ActionsInPAC TotalMapExplored WorkersMade
## 8.026527 1.855805 1.307615
## UniqueUnitsMade ComplexUnitsMade ComplexAbilitiesUsed
## 1.673609 1.819523 1.674409
#Jika VIF > 5 atau 10, ada indikasi multikolinearitas
#Reduksi Dimensi PCA
# PCA tanpa variabel target
pca_result <- prcomp(data_scaled[, -which(names(data_scaled) == "LeagueIndex")], center = TRUE, scale. = TRUE)
# Ringkasan hasil PCA
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1948 1.4365 1.2297 1.12599 1.02865 0.99614 0.99178
## Proportion of Variance 0.2676 0.1146 0.0840 0.07044 0.05878 0.05513 0.05465
## Cumulative Proportion 0.2676 0.3823 0.4663 0.53670 0.59548 0.65061 0.70525
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.91031 0.87131 0.85881 0.84404 0.77100 0.71518 0.64969
## Proportion of Variance 0.04604 0.04218 0.04098 0.03958 0.03302 0.02842 0.02345
## Cumulative Proportion 0.75129 0.79347 0.83444 0.87402 0.90704 0.93546 0.95891
## PC15 PC16 PC17 PC18
## Standard deviation 0.59764 0.50516 0.33377 0.12585
## Proportion of Variance 0.01984 0.01418 0.00619 0.00088
## Cumulative Proportion 0.97875 0.99293 0.99912 1.00000
# Proporsi varian kumulatif
cum_var <- cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2))
plot(cum_var, type = "b", xlab = "Jumlah Komponen", ylab = "Proporsi Varian Kumulatif",
main = "Scree Plot PCA", col = "blue", pch = 19)
abline(h = 0.9, col = "red", lty = 2) # misalnya threshold 90% varian
Factor Analysis
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Menentukan jumlah faktor dengan scree plot
fa.parallel(data_scaled[, -which(names(data_scaled) == "LeagueIndex")], fa = "fa")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 7 and the number of components = NA
# Lakukan FA dengan jumlah faktor misalnya 5
fa_result <- fa(data_scaled[, -which(names(data_scaled) == "LeagueIndex")], nfactors = 5, rotate = "varimax")
print(fa_result)
## Factor Analysis using method = minres
## Call: fa(r = data_scaled[, -which(names(data_scaled) == "LeagueIndex")],
## nfactors = 5, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 MR4 MR5 MR3 MR2 h2 u2 com
## Age -0.27 -0.12 0.10 0.02 -0.11 0.1050 0.895 2.1
## HoursPerWeek 0.16 0.20 0.00 0.06 0.07 0.0724 0.928 2.4
## TotalHours 0.02 0.08 0.00 0.00 -0.01 0.0076 0.992 1.1
## APM 0.49 0.74 0.14 0.42 0.07 0.9865 0.014 2.5
## SelectByHotkeys 0.14 0.91 0.04 0.11 0.03 0.8684 0.132 1.1
## AssignToHotkeys 0.31 0.43 0.25 0.16 0.08 0.3710 0.629 2.9
## UniqueHotkeys 0.15 0.27 0.37 0.09 0.02 0.2411 0.759 2.4
## MinimapAttacks 0.05 0.10 0.23 0.27 -0.04 0.1437 0.856 2.4
## MinimapRightClicks 0.09 0.04 0.20 0.47 0.03 0.2670 0.733 1.5
## NumberOfPACs 0.76 0.28 0.50 -0.11 0.04 0.9196 0.080 2.1
## GapBetweenPACs -0.59 -0.17 -0.09 -0.39 0.03 0.5388 0.461 2.0
## ActionLatency -0.85 -0.25 -0.28 -0.19 -0.07 0.9025 0.097 1.5
## ActionsInPAC 0.00 0.10 -0.32 0.84 0.09 0.8313 0.169 1.4
## TotalMapExplored 0.12 0.04 0.71 0.03 0.20 0.5586 0.441 1.2
## WorkersMade 0.25 0.11 0.08 0.29 0.14 0.1838 0.816 3.0
## UniqueUnitsMade 0.02 -0.03 0.66 0.05 0.29 0.5261 0.474 1.4
## ComplexUnitsMade 0.09 0.01 0.19 0.06 0.87 0.8092 0.191 1.1
## ComplexAbilitiesUsed 0.09 0.02 0.16 0.05 0.65 0.4603 0.540 1.2
##
## MR1 MR4 MR5 MR3 MR2
## SS loadings 2.24 1.90 1.77 1.51 1.37
## Proportion Var 0.12 0.11 0.10 0.08 0.08
## Cumulative Var 0.12 0.23 0.33 0.41 0.49
## Proportion Explained 0.25 0.22 0.20 0.17 0.16
## Cumulative Proportion 0.25 0.47 0.67 0.84 1.00
##
## Mean item complexity = 1.8
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 153 with the objective function = 8.92 with Chi Square = 29690.4
## df of the model are 73 and the objective function was 1
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic n.obs is 3338 with the empirical chi square 609.46 with prob < 1.3e-85
## The total n.obs was 3338 with Likelihood Chi Square = 3311.6 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.77
## RMSEA index = 0.115 and the 90 % confidence intervals are 0.112 0.119
## BIC = 2719.35
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## MR1 MR4 MR5 MR3 MR2
## Correlation of (regression) scores with factors 0.94 0.97 0.85 0.93 0.90
## Multiple R square of scores with factors 0.88 0.94 0.73 0.86 0.81
## Minimum correlation of possible factor scores 0.76 0.88 0.46 0.73 0.62
Visualaisasi PCA
#2D Scatter Plot (2 Komponen)
# Ambil dua komponen utama
pca_df <- as.data.frame(pca_result$x[, 1:2])
pca_df$LeagueIndex <- data_scaled$LeagueIndex
# install.packages("ggplot2") jika belum
library(ggplot2)
ggplot(pca_df, aes(x = PC1, y = PC2, color = LeagueIndex)) +
geom_point(alpha = 0.7) +
labs(title = "Visualisasi PCA (2D)", x = "PC1", y = "PC2") +
theme_minimal()
#3D Plot
#3D plot
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
pca_3d <- as.data.frame(pca_result$x[, 1:3])
pca_3d$LeagueIndex <- data_scaled$LeagueIndex
plot_ly(pca_3d, x = ~PC1, y = ~PC2, z = ~PC3, color = ~LeagueIndex, colors = "Set1", type = "scatter3d", mode = "markers") %>%
layout(title = "Visualisasi PCA (3D)")
#Perceptual Mapping Multidimensional Scaling (MDS)
# Jarak antar observasi berdasarkan fitur numerik
dist_matrix <- dist(data_scaled[, -which(names(data_scaled) == "LeagueIndex")])
# MDS klasik (metric MDS) ke 2 dimensi
mds_result <- cmdscale(dist_matrix, k = 2)
mds_df <- as.data.frame(mds_result)
colnames(mds_df) <- c("Dim1", "Dim2")
mds_df$LeagueIndex <- data_scaled$LeagueIndex
# Visualisasi hasil MDS
library(ggplot2)
ggplot(mds_df, aes(x = Dim1, y = Dim2, color = LeagueIndex)) +
geom_point(alpha = 0.7) +
labs(title = "Perceptual Mapping - MDS", x = "Dimensi 1", y = "Dimensi 2") +
theme_minimal()
Biplot PCA
# PCA telah dilakukan sebelumnya (prcomp)
biplot(pca_result, scale = 0, cex = 0.6, main = "Biplot dari PCA")
# Biplot dengan ggplot (lebih rapi)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_pca_biplot(pca_result, label = "var", habillage = data_scaled$LeagueIndex,
addEllipses = TRUE, repel = TRUE,
col.var = "steelblue", col.ind = "gray30",
title = "Biplot PCA dengan LeagueIndex")
#Pemodelan 1. LDA
library(MASS)
library(caret)
# Pastikan LeagueIndex adalah faktor
data_scaled$LeagueIndex <- as.factor(data_scaled$LeagueIndex)
# Bagi data (misalnya 70:30)
set.seed(123)
train_index <- createDataPartition(data_scaled$LeagueIndex, p = 0.7, list = FALSE)
train_data <- data_scaled[train_index, ]
test_data <- data_scaled[-train_index, ]
# LDA
lda_model <- lda(LeagueIndex ~ ., data = train_data)
lda_pred <- predict(lda_model, test_data)
# Pastikan faktor dengan level identik
ref_levels <- levels(data_scaled$LeagueIndex)
predicted <- factor(lda_pred$class, levels = ref_levels)
actual <- factor(test_data$LeagueIndex, levels = ref_levels)
# Confusion Matrix
confusionMatrix(predicted, actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7
## 1 25 17 20 3 0 0 0
## 2 10 16 11 5 0 0 0
## 3 7 31 37 20 10 1 0
## 4 8 37 79 137 68 28 0
## 5 0 2 15 61 108 71 0
## 6 0 1 3 16 51 77 7
## 7 0 0 0 1 4 9 3
##
## Overall Statistics
##
## Accuracy : 0.4034
## 95% CI : (0.3728, 0.4346)
## No Information Rate : 0.2432
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2513
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.50000 0.15385 0.22424 0.5638 0.4481 0.41398
## Specificity 0.95785 0.97095 0.91727 0.7090 0.8034 0.90406
## Pos Pred Value 0.38462 0.38095 0.34906 0.3838 0.4202 0.49677
## Neg Pred Value 0.97323 0.90805 0.85666 0.8349 0.8208 0.87085
## Prevalence 0.05005 0.10410 0.16517 0.2432 0.2412 0.18619
## Detection Rate 0.02503 0.01602 0.03704 0.1371 0.1081 0.07708
## Detection Prevalence 0.06507 0.04204 0.10611 0.3574 0.2573 0.15516
## Balanced Accuracy 0.72893 0.56240 0.57075 0.6364 0.6258 0.65902
## Class: 7
## Sensitivity 0.300000
## Specificity 0.985844
## Pos Pred Value 0.176471
## Neg Pred Value 0.992872
## Prevalence 0.010010
## Detection Rate 0.003003
## Detection Prevalence 0.017017
## Balanced Accuracy 0.642922
# Pastikan LeagueIndex adalah ordered factor
data_scaled$LeagueIndex <- factor(data_scaled$LeagueIndex, ordered = TRUE)
# Model ordinal logistic regression
library(MASS)
olr_model <- polr(LeagueIndex ~ ., data = train_data, Hess = TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Prediksi
olr_pred <- predict(olr_model, newdata = test_data)
# Evaluasi
confusionMatrix(olr_pred, test_data$LeagueIndex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7
## 1 10 9 5 1 0 0 0
## 2 25 24 29 8 2 0 0
## 3 10 37 42 31 7 1 0
## 4 5 31 72 127 79 17 0
## 5 0 3 13 63 101 82 0
## 6 0 0 4 12 50 85 10
## 7 0 0 0 1 2 1 0
##
## Overall Statistics
##
## Accuracy : 0.3894
## 95% CI : (0.359, 0.4204)
## No Information Rate : 0.2432
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2315
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.20000 0.23077 0.25455 0.5226 0.4191 0.45699
## Specificity 0.98419 0.92849 0.89688 0.7302 0.7876 0.90652
## Pos Pred Value 0.40000 0.27273 0.32812 0.3837 0.3855 0.52795
## Neg Pred Value 0.95893 0.91218 0.85878 0.8263 0.8100 0.87947
## Prevalence 0.05005 0.10410 0.16517 0.2432 0.2412 0.18619
## Detection Rate 0.01001 0.02402 0.04204 0.1271 0.1011 0.08509
## Detection Prevalence 0.02503 0.08809 0.12813 0.3313 0.2623 0.16116
## Balanced Accuracy 0.59210 0.57963 0.57571 0.6264 0.6033 0.68175
## Class: 7
## Sensitivity 0.000000
## Specificity 0.995956
## Pos Pred Value 0.000000
## Neg Pred Value 0.989950
## Prevalence 0.010010
## Detection Rate 0.000000
## Detection Prevalence 0.004004
## Balanced Accuracy 0.497978
#Evaluasi
# Asumsikan sudah ada lda_pred, olr_pred, dan test_data$LeagueIndex
# Samakan tipe data: ubah ke karakter agar bisa dibandingkan
lda_pred_char <- as.character(lda_pred$class)
olr_pred_char <- as.character(olr_pred)
true_char <- as.character(test_data$LeagueIndex)
# Hitung confusion matrix untuk LDA
conf_matrix_lda <- table(Predicted = lda_pred_char, Actual = true_char)
print(conf_matrix_lda)
## Actual
## Predicted 1 2 3 4 5 6 7
## 1 25 17 20 3 0 0 0
## 2 10 16 11 5 0 0 0
## 3 7 31 37 20 10 1 0
## 4 8 37 79 137 68 28 0
## 5 0 2 15 61 108 71 0
## 6 0 1 3 16 51 77 7
## 7 0 0 0 1 4 9 3
# Hitung confusion matrix untuk OLR
conf_matrix_olr <- table(Predicted = olr_pred_char, Actual = true_char)
print(conf_matrix_olr)
## Actual
## Predicted 1 2 3 4 5 6 7
## 1 10 9 5 1 0 0 0
## 2 25 24 29 8 2 0 0
## 3 10 37 42 31 7 1 0
## 4 5 31 72 127 79 17 0
## 5 0 3 13 63 101 82 0
## 6 0 0 4 12 50 85 10
## 7 0 0 0 1 2 1 0
# Fungsi hitung metrik evaluasi: akurasi, precision, recall (per kelas)
eval_metrics <- function(conf_matrix) {
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- diag(conf_matrix) / rowSums(conf_matrix)
recall <- diag(conf_matrix) / colSums(conf_matrix)
list(accuracy = accuracy, precision = precision, recall = recall)
}
metrics_lda <- eval_metrics(conf_matrix_lda)
metrics_olr <- eval_metrics(conf_matrix_olr)
print("LDA Metrics:")
## [1] "LDA Metrics:"
print(metrics_lda)
## $accuracy
## [1] 0.4034034
##
## $precision
## 1 2 3 4 5 6 7
## 0.3846154 0.3809524 0.3490566 0.3837535 0.4202335 0.4967742 0.1764706
##
## $recall
## 1 2 3 4 5 6 7
## 0.5000000 0.1538462 0.2242424 0.5637860 0.4481328 0.4139785 0.3000000
print("OLR Metrics:")
## [1] "OLR Metrics:"
print(metrics_olr)
## $accuracy
## [1] 0.3893894
##
## $precision
## 1 2 3 4 5 6 7
## 0.4000000 0.2727273 0.3281250 0.3836858 0.3854962 0.5279503 0.0000000
##
## $recall
## 1 2 3 4 5 6 7
## 0.2000000 0.2307692 0.2545455 0.5226337 0.4190871 0.4569892 0.0000000
# Buat tabel perbandingan benar/tidak prediksi per model
lda_correct <- lda_pred_char == true_char
olr_correct <- olr_pred_char == true_char
# Contingency table untuk uji McNemar
comparison_table <- table(LDA_correct = lda_correct, OLR_correct = olr_correct)
print(comparison_table)
## OLR_correct
## LDA_correct FALSE TRUE
## FALSE 501 95
## TRUE 109 294
# Uji McNemar untuk signifikansi perbedaan prediksi
mcnemar_result <- mcnemar.test(comparison_table)
print(mcnemar_result)
##
## McNemar's Chi-squared test with continuity correction
##
## data: comparison_table
## McNemar's chi-squared = 0.82843, df = 1, p-value = 0.3627
# Gabungkan metrik
metrics_df <- data.frame(
Metric = rep(c("Accuracy", "Precision", "Recall"), each = length(metrics_lda$precision)),
Model = rep(c("LDA", "OLR"), times = c(length(metrics_lda$precision) * 3, length(metrics_olr$precision) * 3)),
Value = c(
rep(metrics_lda$accuracy, length(metrics_lda$precision)),
metrics_lda$precision,
metrics_lda$recall,
rep(metrics_olr$accuracy, length(metrics_olr$precision)),
metrics_olr$precision,
metrics_olr$recall
),
Class = rep(names(metrics_lda$precision), times = 6)
)
# Plot barplot
ggplot(metrics_df, aes(x = Class, y = Value, fill = Model)) +
geom_bar(stat = "identity", position = position_dodge()) +
facet_wrap(~ Metric, scales = "free_y") +
labs(title = "Perbandingan Akurasi, Precision, dan Recall per Kelas",
y = "Value", x = "Kelas") +
theme_minimal()
accuracy_df <- data.frame(
Model = c("LDA", "OLR"),
Accuracy = c(metrics_lda$accuracy, metrics_olr$accuracy)
)
ggplot(accuracy_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity", width = 0.5) +
ylim(0, 1) +
labs(title = "Perbandingan Akurasi Model", y = "Akurasi", x = "") +
theme_minimal()