Analisis Metode 1: Discriminant Analysis

Statistika Multivariat - Project

Referensi: PENGARUH FINANCIAL INDICATOR DAN NON-FINANCIAL INDICATOR TERHADAP FINANCIAL DISTRESS DENGAN DISCRIMINANT ANALYSIS DAN LOGISTIC REGRESSION (Studi Pada Perusahaan Sektor Energi yang Terdaftar Di Bursa Efek Indonesia Tahun 2018-2021)

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(corrplot)

## corrplot 0.95 loaded

library(biotools)

## ---
## biotools version 4.3

library(ggplot2)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

library(GGally)
library(knitr)
library(nortest)

1 Dataset

fin <- readxl::read_excel("C:/Users/DIVNA WIDYASTUTI/Documents/KULIAH/SEM 5/STATMUL 2/projek/datafinansial.xlsx")
glimpse(fin)

## Rows: 200
## Columns: 10
## $ Perusahaan <chr> "ADRO", "ADRO", "ADRO", "ADRO", "AKRA", "AKRA", "AKRA", "AK…
## $ Tahun      <dbl> 2018, 2019, 2020, 2021, 2018, 2019, 2020, 2021, 2018, 2019,…
## $ ROA        <dbl> 0.0676, 0.0603, 0.0248, 0.1356, 0.0333, 0.0327, 0.0515, 0.0…
## $ DER        <dbl> 0.6410, 0.8118, 0.6149, 0.7017, 1.0088, 1.1267, 0.7699, 1.0…
## $ OCFR       <dbl> 0.3285, 0.2837, 0.3040, 0.4591, -0.0448, 0.0607, 0.1313, 0.…
## $ AGE        <dbl> 14, 15, 16, 17, 41, 42, 43, 44, 34, 35, 36, 37, 37, 37, 37,…
## $ SIZE       <dbl> 32.2592, 32.2386, 32.1295, 32.3167, 30.6238, 30.6948, 30.55…
## $ INST_OWN   <dbl> 43.91120, 43.91120, 43.91120, 43.91120, 59.59670, 59.59670,…
## $ MNJ_OWN    <dbl> 12.4001, 12.4034, 12.3956, 12.3865, 0.6755, 0.6755, 0.6619,…
## $ Y          <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,…

summary(fin)

##   Perusahaan            Tahun           ROA                DER          
##  Length:200         Min.   :2018   Min.   :-1.53830   Min.   :-43.0864  
##  Class :character   1st Qu.:2019   1st Qu.:-0.01142   1st Qu.:  0.4656  
##  Mode  :character   Median :2020   Median : 0.02640   Median :  0.9827  
##                     Mean   :2020   Mean   : 0.01774   Mean   :  1.3858  
##                     3rd Qu.:2020   3rd Qu.: 0.06985   3rd Qu.:  1.8298  
##                     Max.   :2021   Max.   : 0.55260   Max.   : 34.0556  
##                                                                         
##       OCFR               AGE             SIZE          INST_OWN    
##  Min.   :-0.34300   Min.   :14.00   Min.   :24.77   Min.   :10.11  
##  1st Qu.: 0.05872   1st Qu.:37.00   1st Qu.:27.84   1st Qu.:43.91  
##  Median : 0.16095   Median :37.00   Median :28.84   Median :68.44  
##  Mean   : 0.30064   Mean   :36.65   Mean   :28.93   Mean   :63.31  
##  3rd Qu.: 0.35018   3rd Qu.:37.00   3rd Qu.:30.08   3rd Qu.:81.82  
##  Max.   : 2.77350   Max.   :44.00   Max.   :32.32   Max.   :97.50  
##                                                     NA's   :1      
##     MNJ_OWN              Y        
##  Min.   : 0.0000   Min.   :0.000  
##  1st Qu.: 0.0000   1st Qu.:0.000  
##  Median : 0.0111   Median :1.000  
##  Mean   : 3.7853   Mean   :1.105  
##  3rd Qu.: 0.6755   3rd Qu.:2.000  
##  Max.   :61.4791   Max.   :2.000  
##

2 Preprocessing

vars_predictor <- c("ROA","DER","OCFR","AGE","SIZE","INST_OWN","MNJ_OWN")

2.1 Identifikasi Missing Value

# cek missing value
colSums(is.na(fin))

## Perusahaan      Tahun        ROA        DER       OCFR        AGE       SIZE 
##          0          0          0          0          0          0          0 
##   INST_OWN    MNJ_OWN          Y 
##          1          0          0

2.2 Penanganan Missing Value

# Imputasi median
if(any(is.na(fin$INST_OWN))){
  fin$INST_OWN[is.na(fin$INST_OWN)] <- median(fin$INST_OWN, na.rm = TRUE)
}
# cek ulang 
colSums(is.na(fin))

## Perusahaan      Tahun        ROA        DER       OCFR        AGE       SIZE 
##          0          0          0          0          0          0          0 
##   INST_OWN    MNJ_OWN          Y 
##          0          0          0

2.3 Transformasi Variabel Respons dan Seleksi Fitur Prediktor

# Konversi Y ke faktor
fin$Y <- factor(fin$Y,
                levels = sort(unique(fin$Y)),
                labels = c("Distress","Grey","NonDistress")[1:length(unique(fin$Y))])

# Pilih kolom final
fin <- fin[, c(vars_predictor, "Y")]

2.4 Visualisasi Korelasi antar Variabel

numeric_vars <- fin[, vars_predictor]
corr_matrix <- cor(numeric_vars, use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black")

2.5 Deteksi Outlier

z <- scale(fin[, vars_predictor])
outlier_rows <- unique(which(abs(z) > 3, arr.ind = TRUE)[,1])
outlier_rows

##  [1]  18  19 126 134  13  43  65  66  48 116 120 129   1   2   3   4  46  47  77
## [20]  78  79  80 133 135

boxplot(fin[, vars_predictor], 
        main = "Boxplot Outlier (Z-score Method)")

2.6 Penanganan Outlier

fin_final <- fin[-outlier_rows, ]
nrow(fin_final)

## [1] 176

boxplot(fin_final[, vars_predictor],
        main = "Boxplot Setelah Penanganan Outlier (Z-score)")

z2 <- scale(fin_final[, vars_predictor])
outlier_rows_after <- unique(which(abs(z2) > 3, arr.ind = TRUE)[,1])
outlier_rows_after

##  [1]  29  33  37  38  67  88  10  13  25  54  83  91  97 111   1   2   3   4   5
## [20]   9  11  93  94 169 170 171

length(outlier_rows_after)

## [1] 26

3 Discriminant Analysis Assumption Testing

3.1 Normalitas (Kolmogorov-Smirnov)

ks_by_group <- by(fin_final[, vars_predictor], fin_final$Y, function(sub){
  sapply(sub, function(x){
    x2 <- na.omit(x)
    if(length(unique(x2)) >= 3){
      round(ks.test(x2, "pnorm", mean=mean(x2), sd=sd(x2))$p.value,4)
    } else NA
  })
})

## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(x2, "pnorm", mean = mean(x2), sd = sd(x2)): ties
## should not be present for the one-sample Kolmogorov-Smirnov test

ks_by_group

## fin_final$Y: Distress
##      ROA      DER     OCFR      AGE     SIZE INST_OWN  MNJ_OWN 
##   0.0061   0.0033   0.0040   0.0000   0.8709   0.0456   0.0000 
## ------------------------------------------------------------ 
## fin_final$Y: Grey
##      ROA      DER     OCFR      AGE     SIZE INST_OWN  MNJ_OWN 
##   0.9406   0.0141   0.1837       NA   0.8283   0.0409   0.0000 
## ------------------------------------------------------------ 
## fin_final$Y: NonDistress
##      ROA      DER     OCFR      AGE     SIZE INST_OWN  MNJ_OWN 
##   0.0923   0.0174   0.1498   0.0000   0.7562   0.4207   0.0000

3.2 Homogenitas matriks kovarians (Box’s M)

boxm_res <- boxM(fin_final[, vars_predictor], grouping = fin_final$Y)
boxm_res

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  fin_final[, vars_predictor]
## Chi-Sq (approx.) = Inf, df = 56, p-value < 2.2e-16

3.3 Multikolinearitas (VIF)

vif_mod <- lm(ROA ~ ., data = fin_final[, vars_predictor])
car::vif(vif_mod)

##      DER     OCFR      AGE     SIZE INST_OWN  MNJ_OWN 
## 1.176032 1.111543 1.028787 1.095973 1.159639 1.172900

3.4 Outlier multivariat (Mahalanobis)

center <- colMeans(fin_final[, vars_predictor])
S <- cov(fin_final[, vars_predictor])
m_dist <- mahalanobis(fin_final[, vars_predictor], center, S)
p <- length(vars_predictor)
cutoff <- qchisq(0.975, df = p)
sum(m_dist > cutoff)

## [1] 16

which_outliers_maha <- which(m_dist > cutoff)

3.5 MANOVA (perbedaan rata-rata multivariat antar kelas)

manova_model <- manova(as.matrix(fin_final[, vars_predictor]) ~ Y, data = fin_final)
summary(manova_model, test = "Wilks")

##            Df Wilks approx F num Df den Df    Pr(>F)    
## Y           2 0.523   9.1318     14    334 < 2.2e-16 ***
## Residuals 173                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4 Discriminant Analysis

summarize.class <- function(original, classify) {
  class.table <- table(original, classify)
  numb <- rowSums(class.table)
  prop <- round(class.table / numb, 4)
  overall <- round(sum(diag(class.table)) / sum(class.table), 4)
  list(class.table = class.table, prop = prop, overall.correct = overall)
}

4.1 Linear Discriminant Analysis (LDA)

#Model LDA
lda_model <- lda(Y ~ ., data = fin_final)
pred_lda <- predict(lda_model, newdata = fin_final)
res_lda <- summarize.class(fin_final$Y, pred_lda$class)
res_lda$class.table

##              classify
## original      Distress Grey NonDistress
##   Distress          47    5           8
##   Grey              16    6          16
##   NonDistress        7    5          66

res_lda$overall.correct

## [1] 0.6761

#Struktur dan Fungsi Diskriminan LDA
ld_scores <- pred_lda$x
structure_mat <- cor(fin_final[, vars_predictor], ld_scores)
structure_mat

##                 LD1        LD2
## ROA       0.7251231 -0.0592660
## DER      -0.3483893 -0.1434445
## OCFR      0.6892072  0.5481679
## AGE       0.2927508  0.1036356
## SIZE      0.3129089 -0.3851916
## INST_OWN  0.5376004 -0.2717051
## MNJ_OWN  -0.3459319  0.4978753

# Visualisasi LDA
ld_vals <- as.data.frame(ld_scores)
ld_vals$Y <- fin_final$Y
if("LD2" %in% names(ld_vals)){
  ggplot(ld_vals, aes(x=LD1, y=LD2, color=Y)) +
    geom_point(alpha=0.8, size=2) +
    stat_ellipse(aes(fill=Y), geom="polygon", alpha=0.15, show.legend=FALSE) +
    labs(title="Proyeksi LDA", x="LD1", y="LD2") + theme_minimal()
} else {
  ggplot(ld_vals, aes(x=LD1, fill=Y)) +
    geom_density(alpha=0.6) +
    labs(title="Distribusi LD1 per kelas") + theme_minimal()
}

4.2 Quadratic Discriminant Analysis (QDA)

#Reduksi Dimensi Menggunakan PCA
pca <- prcomp(fin_final[, vars_predictor], scale.=TRUE)
fin_pca <- data.frame(Y=fin_final$Y, pca$x[,1:3])
#Model QDA
qda_model <- qda(Y ~ ., data=fin_pca)
pred_qda <- predict(qda_model, newdata=fin_pca)
res_qda <- summarize.class(fin_pca$Y, pred_qda$class)
res_qda$class.table

##              classify
## original      Distress Grey NonDistress
##   Distress          32   18          10
##   Grey               2   27           9
##   NonDistress        2   28          48

res_qda$overall.correct

## [1] 0.608

# Visualisasi QDA
qdavals <- data.frame(PC1=fin_pca$PC1, PC2=fin_pca$PC2, Y=fin_pca$Y)
ggplot(qdavals, aes(x=PC1, y=PC2, color=Y)) +
  geom_point(alpha=0.8, size=2) +
  stat_ellipse(aes(fill=Y), geom="polygon", alpha=0.15, show.legend=FALSE) +
  labs(title="Proyeksi QDA berbasis PCA", x="PC1", y="PC2") + theme_minimal()

4.3 Perbandingan LDA dan QDA

res_lda$overall.correct

## [1] 0.6761

res_qda$overall.correct

## [1] 0.608