#kELOPMOK 5
#Amaany (23031554038)
#Nurin Nasi'ah Salsabila (23031554148)

##LOAD DAN EXPLOR DATA##
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3
## Loading required package: ggplot2
df_MLR <- read.csv("C:\\Users\\ASUS\\Downloads\\cumulative.csv (1)\\cumulative.csv", header = TRUE, sep = ",")
head(df_MLR, n=5)
##   rowid    kepid kepoi_name  kepler_name koi_disposition koi_pdisposition
## 1     1 10797460  K00752.01 Kepler-227 b       CONFIRMED        CANDIDATE
## 2     2 10797460  K00752.02 Kepler-227 c       CONFIRMED        CANDIDATE
## 3     3 10811496  K00753.01               FALSE POSITIVE   FALSE POSITIVE
## 4     4 10848459  K00754.01               FALSE POSITIVE   FALSE POSITIVE
## 5     5 10854555  K00755.01 Kepler-664 b       CONFIRMED        CANDIDATE
##   koi_score koi_fpflag_nt koi_fpflag_ss koi_fpflag_co koi_fpflag_ec koi_period
## 1     1.000             0             0             0             0   9.488036
## 2     0.969             0             0             0             0  54.418383
## 3     0.000             0             1             0             0  19.899140
## 4     0.000             0             1             0             0   1.736952
## 5     1.000             0             0             0             0   2.525592
##   koi_period_err1 koi_period_err2 koi_time0bk koi_time0bk_err1 koi_time0bk_err2
## 1       2.775e-05      -2.775e-05    170.5387         0.002160        -0.002160
## 2       2.479e-04      -2.479e-04    162.5138         0.003520        -0.003520
## 3       1.494e-05      -1.494e-05    175.8503         0.000581        -0.000581
## 4       2.630e-07      -2.630e-07    170.3076         0.000115        -0.000115
## 5       3.761e-06      -3.761e-06    171.5956         0.001130        -0.001130
##   koi_impact koi_impact_err1 koi_impact_err2 koi_duration koi_duration_err1
## 1      0.146           0.318          -0.146      2.95750           0.08190
## 2      0.586           0.059          -0.443      4.50700           0.11600
## 3      0.969           5.126          -0.077      1.78220           0.03410
## 4      1.276           0.115          -0.092      2.40641           0.00537
## 5      0.701           0.235          -0.478      1.65450           0.04200
##   koi_duration_err2 koi_depth koi_depth_err1 koi_depth_err2 koi_prad
## 1          -0.08190     615.8           19.5          -19.5     2.26
## 2          -0.11600     874.8           35.5          -35.5     2.83
## 3          -0.03410   10829.0          171.0         -171.0    14.60
## 4          -0.00537    8079.2           12.8          -12.8    33.46
## 5          -0.04200     603.3           16.9          -16.9     2.75
##   koi_prad_err1 koi_prad_err2 koi_teq koi_teq_err1 koi_teq_err2 koi_insol
## 1          0.26         -0.15     793           NA           NA     93.59
## 2          0.32         -0.19     443           NA           NA      9.11
## 3          3.92         -1.31     638           NA           NA     39.30
## 4          8.50         -2.83    1395           NA           NA    891.96
## 5          0.88         -0.35    1406           NA           NA    926.16
##   koi_insol_err1 koi_insol_err2 koi_model_snr koi_tce_plnt_num
## 1          29.45         -16.65          35.8                1
## 2           2.87          -1.62          25.8                2
## 3          31.04         -10.49          76.3                1
## 4         668.95        -230.35         505.6                1
## 5         874.33        -314.24          40.9                1
##   koi_tce_delivname koi_steff koi_steff_err1 koi_steff_err2 koi_slogg
## 1   q1_q17_dr25_tce      5455             81            -81     4.467
## 2   q1_q17_dr25_tce      5455             81            -81     4.467
## 3   q1_q17_dr25_tce      5853            158           -176     4.544
## 4   q1_q17_dr25_tce      5805            157           -174     4.564
## 5   q1_q17_dr25_tce      6031            169           -211     4.438
##   koi_slogg_err1 koi_slogg_err2 koi_srad koi_srad_err1 koi_srad_err2       ra
## 1          0.064         -0.096    0.927         0.105        -0.061 291.9342
## 2          0.064         -0.096    0.927         0.105        -0.061 291.9342
## 3          0.044         -0.176    0.868         0.233        -0.078 297.0048
## 4          0.053         -0.168    0.791         0.201        -0.067 285.5346
## 5          0.070         -0.210    1.046         0.334        -0.133 288.7549
##        dec koi_kepmag
## 1 48.14165     15.347
## 2 48.14165     15.347
## 3 48.13413     15.436
## 4 48.28521     15.597
## 5 48.22620     15.509
names(df_MLR)
##  [1] "rowid"             "kepid"             "kepoi_name"       
##  [4] "kepler_name"       "koi_disposition"   "koi_pdisposition" 
##  [7] "koi_score"         "koi_fpflag_nt"     "koi_fpflag_ss"    
## [10] "koi_fpflag_co"     "koi_fpflag_ec"     "koi_period"       
## [13] "koi_period_err1"   "koi_period_err2"   "koi_time0bk"      
## [16] "koi_time0bk_err1"  "koi_time0bk_err2"  "koi_impact"       
## [19] "koi_impact_err1"   "koi_impact_err2"   "koi_duration"     
## [22] "koi_duration_err1" "koi_duration_err2" "koi_depth"        
## [25] "koi_depth_err1"    "koi_depth_err2"    "koi_prad"         
## [28] "koi_prad_err1"     "koi_prad_err2"     "koi_teq"          
## [31] "koi_teq_err1"      "koi_teq_err2"      "koi_insol"        
## [34] "koi_insol_err1"    "koi_insol_err2"    "koi_model_snr"    
## [37] "koi_tce_plnt_num"  "koi_tce_delivname" "koi_steff"        
## [40] "koi_steff_err1"    "koi_steff_err2"    "koi_slogg"        
## [43] "koi_slogg_err1"    "koi_slogg_err2"    "koi_srad"         
## [46] "koi_srad_err1"     "koi_srad_err2"     "ra"               
## [49] "dec"               "koi_kepmag"
nrow(df_MLR)
## [1] 9564
ncol(df_MLR)
## [1] 50
str(df_MLR)
## 'data.frame':    9564 obs. of  50 variables:
##  $ rowid            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ kepid            : int  10797460 10797460 10811496 10848459 10854555 10872983 10872983 10872983 6721123 10910878 ...
##  $ kepoi_name       : chr  "K00752.01" "K00752.02" "K00753.01" "K00754.01" ...
##  $ kepler_name      : chr  "Kepler-227 b" "Kepler-227 c" "" "" ...
##  $ koi_disposition  : chr  "CONFIRMED" "CONFIRMED" "FALSE POSITIVE" "FALSE POSITIVE" ...
##  $ koi_pdisposition : chr  "CANDIDATE" "CANDIDATE" "FALSE POSITIVE" "FALSE POSITIVE" ...
##  $ koi_score        : num  1 0.969 0 0 1 1 1 0.992 0 1 ...
##  $ koi_fpflag_nt    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ koi_fpflag_ss    : int  0 0 1 1 0 0 0 0 1 0 ...
##  $ koi_fpflag_co    : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ koi_fpflag_ec    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ koi_period       : num  9.49 54.42 19.9 1.74 2.53 ...
##  $ koi_period_err1  : num  2.78e-05 2.48e-04 1.49e-05 2.63e-07 3.76e-06 ...
##  $ koi_period_err2  : num  -2.78e-05 -2.48e-04 -1.49e-05 -2.63e-07 -3.76e-06 ...
##  $ koi_time0bk      : num  171 163 176 170 172 ...
##  $ koi_time0bk_err1 : num  0.00216 0.00352 0.000581 0.000115 0.00113 0.00141 0.0019 0.00461 0.00253 0.000517 ...
##  $ koi_time0bk_err2 : num  -0.00216 -0.00352 -0.000581 -0.000115 -0.00113 -0.00141 -0.0019 -0.00461 -0.00253 -0.000517 ...
##  $ koi_impact       : num  0.146 0.586 0.969 1.276 0.701 ...
##  $ koi_impact_err1  : num  0.318 0.059 5.126 0.115 0.235 ...
##  $ koi_impact_err2  : num  -0.146 -0.443 -0.077 -0.092 -0.478 -0.428 -0.532 -0.523 -0.044 -0.052 ...
##  $ koi_duration     : num  2.96 4.51 1.78 2.41 1.65 ...
##  $ koi_duration_err1: num  0.0819 0.116 0.0341 0.00537 0.042 0.061 0.0673 0.165 0.136 0.0241 ...
##  $ koi_duration_err2: num  -0.0819 -0.116 -0.0341 -0.00537 -0.042 -0.061 -0.0673 -0.165 -0.136 -0.0241 ...
##  $ koi_depth        : num  616 875 10829 8079 603 ...
##  $ koi_depth_err1   : num  19.5 35.5 171 12.8 16.9 24.2 18.7 16.8 5.8 33.3 ...
##  $ koi_depth_err2   : num  -19.5 -35.5 -171 -12.8 -16.9 -24.2 -18.7 -16.8 -5.8 -33.3 ...
##  $ koi_prad         : num  2.26 2.83 14.6 33.46 2.75 ...
##  $ koi_prad_err1    : num  0.26 0.32 3.92 8.5 0.88 1.27 0.9 0.52 6.45 0.22 ...
##  $ koi_prad_err2    : num  -0.15 -0.19 -1.31 -2.83 -0.35 -0.42 -0.3 -0.17 -9.67 -0.49 ...
##  $ koi_teq          : num  793 443 638 1395 1406 ...
##  $ koi_teq_err1     : logi  NA NA NA NA NA NA ...
##  $ koi_teq_err2     : logi  NA NA NA NA NA NA ...
##  $ koi_insol        : num  93.59 9.11 39.3 891.96 926.16 ...
##  $ koi_insol_err1   : num  29.45 2.87 31.04 668.95 874.33 ...
##  $ koi_insol_err2   : num  -16.65 -1.62 -10.49 -230.35 -314.24 ...
##  $ koi_model_snr    : num  35.8 25.8 76.3 505.6 40.9 ...
##  $ koi_tce_plnt_num : int  1 2 1 1 1 1 2 3 1 1 ...
##  $ koi_tce_delivname: chr  "q1_q17_dr25_tce" "q1_q17_dr25_tce" "q1_q17_dr25_tce" "q1_q17_dr25_tce" ...
##  $ koi_steff        : num  5455 5455 5853 5805 6031 ...
##  $ koi_steff_err1   : num  81 81 158 157 169 189 189 189 111 75 ...
##  $ koi_steff_err2   : num  -81 -81 -176 -174 -211 -232 -232 -232 -124 -83 ...
##  $ koi_slogg        : num  4.47 4.47 4.54 4.56 4.44 ...
##  $ koi_slogg_err1   : num  0.064 0.064 0.044 0.053 0.07 0.054 0.054 0.054 0.182 0.083 ...
##  $ koi_slogg_err2   : num  -0.096 -0.096 -0.176 -0.168 -0.21 -0.229 -0.229 -0.229 -0.098 -0.028 ...
##  $ koi_srad         : num  0.927 0.927 0.868 0.791 1.046 ...
##  $ koi_srad_err1    : num  0.105 0.105 0.233 0.201 0.334 0.315 0.315 0.315 0.322 0.033 ...
##  $ koi_srad_err2    : num  -0.061 -0.061 -0.078 -0.067 -0.133 -0.105 -0.105 -0.105 -0.483 -0.072 ...
##  $ ra               : num  292 292 297 286 289 ...
##  $ dec              : num  48.1 48.1 48.1 48.3 48.2 ...
##  $ koi_kepmag       : num  15.3 15.3 15.4 15.6 15.5 ...
summary(df_MLR)
##      rowid          kepid           kepoi_name        kepler_name       
##  Min.   :   1   Min.   :  757450   Length:9564        Length:9564       
##  1st Qu.:2392   1st Qu.: 5556034   Class :character   Class :character  
##  Median :4782   Median : 7906892   Mode  :character   Mode  :character  
##  Mean   :4782   Mean   : 7690628                                        
##  3rd Qu.:7173   3rd Qu.: 9873066                                        
##  Max.   :9564   Max.   :12935144                                        
##                                                                         
##  koi_disposition    koi_pdisposition     koi_score      koi_fpflag_nt   
##  Length:9564        Length:9564        Min.   :0.0000   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.3340   Median :0.0000  
##                                        Mean   :0.4808   Mean   :0.1882  
##                                        3rd Qu.:0.9980   3rd Qu.:0.0000  
##                                        Max.   :1.0000   Max.   :1.0000  
##                                        NA's   :1510                     
##  koi_fpflag_ss    koi_fpflag_co    koi_fpflag_ec    koi_period       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00   Min.   :     0.24  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:     2.73  
##  Median :0.0000   Median :0.0000   Median :0.00   Median :     9.75  
##  Mean   :0.2316   Mean   :0.1949   Mean   :0.12   Mean   :    75.67  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00   3rd Qu.:    40.72  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00   Max.   :129995.78  
##                                                                      
##  koi_period_err1  koi_period_err2    koi_time0bk     koi_time0bk_err1
##  Min.   :0.0000   Min.   :-0.1725   Min.   : 120.5   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:-0.0003   1st Qu.: 132.8   1st Qu.:0.0012  
##  Median :0.0000   Median : 0.0000   Median : 137.2   Median :0.0041  
##  Mean   :0.0021   Mean   :-0.0021   Mean   : 166.2   Mean   :0.0099  
##  3rd Qu.:0.0003   3rd Qu.: 0.0000   3rd Qu.: 170.7   3rd Qu.:0.0105  
##  Max.   :0.1725   Max.   : 0.0000   Max.   :1472.5   Max.   :0.5690  
##  NA's   :454      NA's   :454                        NA's   :454     
##  koi_time0bk_err2    koi_impact       koi_impact_err1  koi_impact_err2   
##  Min.   :-0.5690   Min.   :  0.0000   Min.   : 0.000   Min.   :-59.3200  
##  1st Qu.:-0.0105   1st Qu.:  0.1970   1st Qu.: 0.040   1st Qu.: -0.4450  
##  Median :-0.0041   Median :  0.5370   Median : 0.193   Median : -0.2070  
##  Mean   :-0.0099   Mean   :  0.7351   Mean   : 1.960   Mean   : -0.3326  
##  3rd Qu.:-0.0012   3rd Qu.:  0.8890   3rd Qu.: 0.378   3rd Qu.: -0.0460  
##  Max.   : 0.0000   Max.   :100.8060   Max.   :85.540   Max.   :  0.0000  
##  NA's   :454       NA's   :363        NA's   :454      NA's   :454       
##   koi_duration     koi_duration_err1 koi_duration_err2    koi_depth        
##  Min.   :  0.052   Min.   : 0.0000   Min.   :-20.2000   Min.   :      0.0  
##  1st Qu.:  2.438   1st Qu.: 0.0508   1st Qu.: -0.3500   1st Qu.:    159.9  
##  Median :  3.793   Median : 0.1420   Median : -0.1420   Median :    421.1  
##  Mean   :  5.622   Mean   : 0.3399   Mean   : -0.3399   Mean   :  23791.3  
##  3rd Qu.:  6.277   3rd Qu.: 0.3500   3rd Qu.: -0.0508   3rd Qu.:   1473.4  
##  Max.   :138.540   Max.   :20.2000   Max.   :  0.0000   Max.   :1541400.0  
##                    NA's   :454       NA's   :454        NA's   :363        
##  koi_depth_err1     koi_depth_err2         koi_prad         koi_prad_err1     
##  Min.   :     0.0   Min.   :-388600.0   Min.   :     0.08   Min.   :    0.00  
##  1st Qu.:     9.6   1st Qu.:    -49.5   1st Qu.:     1.40   1st Qu.:    0.23  
##  Median :    20.8   Median :    -20.8   Median :     2.39   Median :    0.52  
##  Mean   :   123.2   Mean   :   -123.2   Mean   :   102.89   Mean   :   17.66  
##  3rd Qu.:    49.5   3rd Qu.:     -9.6   3rd Qu.:    14.93   3rd Qu.:    2.32  
##  Max.   :388600.0   Max.   :      0.0   Max.   :200346.00   Max.   :21640.00  
##  NA's   :454        NA's   :454         NA's   :363         NA's   :363       
##  koi_prad_err2          koi_teq      koi_teq_err1   koi_teq_err2  
##  Min.   :-77180.00   Min.   :   25   Mode:logical   Mode:logical  
##  1st Qu.:    -1.94   1st Qu.:  539   NA's:9564      NA's:9564     
##  Median :    -0.30   Median :  878                                
##  Mean   :   -33.02   Mean   : 1085                                
##  3rd Qu.:    -0.14   3rd Qu.: 1379                                
##  Max.   :     0.00   Max.   :14667                                
##  NA's   :363         NA's   :363                                  
##    koi_insol        koi_insol_err1    koi_insol_err2     koi_model_snr   
##  Min.   :       0   Min.   :      0   Min.   :-5600031   Min.   :   0.0  
##  1st Qu.:      20   1st Qu.:      9   1st Qu.:    -287   1st Qu.:  12.0  
##  Median :     142   Median :     73   Median :     -40   Median :  23.0  
##  Mean   :    7746   Mean   :   3751   Mean   :   -4044   Mean   : 259.9  
##  3rd Qu.:     870   3rd Qu.:    519   3rd Qu.:      -5   3rd Qu.:  78.0  
##  Max.   :10947555   Max.   :3617133   Max.   :       0   Max.   :9054.7  
##  NA's   :321        NA's   :321       NA's   :321        NA's   :363     
##  koi_tce_plnt_num koi_tce_delivname    koi_steff     koi_steff_err1 
##  Min.   :1.000    Length:9564        Min.   : 2661   Min.   :  0.0  
##  1st Qu.:1.000    Class :character   1st Qu.: 5310   1st Qu.:106.0  
##  Median :1.000    Mode  :character   Median : 5767   Median :157.0  
##  Mean   :1.244                       Mean   : 5707   Mean   :144.6  
##  3rd Qu.:1.000                       3rd Qu.: 6112   3rd Qu.:174.0  
##  Max.   :8.000                       Max.   :15896   Max.   :676.0  
##  NA's   :346                         NA's   :363     NA's   :468    
##  koi_steff_err2      koi_slogg     koi_slogg_err1   koi_slogg_err2   
##  Min.   :-1762.0   Min.   :0.047   Min.   :0.0000   Min.   :-1.2070  
##  1st Qu.: -198.0   1st Qu.:4.218   1st Qu.:0.0420   1st Qu.:-0.1960  
##  Median : -160.0   Median :4.438   Median :0.0700   Median :-0.1280  
##  Mean   : -162.3   Mean   :4.310   Mean   :0.1207   Mean   :-0.1432  
##  3rd Qu.: -114.0   3rd Qu.:4.543   3rd Qu.:0.1490   3rd Qu.:-0.0880  
##  Max.   :    0.0   Max.   :5.364   Max.   :1.4720   Max.   : 0.0000  
##  NA's   :483       NA's   :363     NA's   :468      NA's   :468      
##     koi_srad       koi_srad_err1     koi_srad_err2             ra       
##  Min.   :  0.109   Min.   : 0.0000   Min.   :-116.1370   Min.   :279.9  
##  1st Qu.:  0.829   1st Qu.: 0.1290   1st Qu.:  -0.2500   1st Qu.:288.7  
##  Median :  1.000   Median : 0.2510   Median :  -0.1110   Median :292.3  
##  Mean   :  1.729   Mean   : 0.3623   Mean   :  -0.3948   Mean   :292.1  
##  3rd Qu.:  1.345   3rd Qu.: 0.3640   3rd Qu.:  -0.0690   3rd Qu.:295.9  
##  Max.   :229.908   Max.   :33.0910   Max.   :   0.0000   Max.   :301.7  
##  NA's   :363       NA's   :468       NA's   :468                        
##       dec          koi_kepmag    
##  Min.   :36.58   Min.   : 6.966  
##  1st Qu.:40.78   1st Qu.:13.440  
##  Median :43.68   Median :14.520  
##  Mean   :43.81   Mean   :14.265  
##  3rd Qu.:46.71   3rd Qu.:15.322  
##  Max.   :52.34   Max.   :20.003  
##                  NA's   :1
#colSums(is.na(df_MLR))

na_counts <- sort(colSums(is.na(df_MLR)), decreasing = TRUE)
na_df <- data.frame(Missing_Values = na_counts)
print(na_df)
##                   Missing_Values
## koi_teq_err1                9564
## koi_teq_err2                9564
## koi_score                   1510
## koi_steff_err2               483
## koi_steff_err1               468
## koi_slogg_err1               468
## koi_slogg_err2               468
## koi_srad_err1                468
## koi_srad_err2                468
## koi_period_err1              454
## koi_period_err2              454
## koi_time0bk_err1             454
## koi_time0bk_err2             454
## koi_impact_err1              454
## koi_impact_err2              454
## koi_duration_err1            454
## koi_duration_err2            454
## koi_depth_err1               454
## koi_depth_err2               454
## koi_impact                   363
## koi_depth                    363
## koi_prad                     363
## koi_prad_err1                363
## koi_prad_err2                363
## koi_teq                      363
## koi_model_snr                363
## koi_steff                    363
## koi_slogg                    363
## koi_srad                     363
## koi_tce_plnt_num             346
## koi_insol                    321
## koi_insol_err1               321
## koi_insol_err2               321
## koi_kepmag                     1
## rowid                          0
## kepid                          0
## kepoi_name                     0
## kepler_name                    0
## koi_disposition                0
## koi_pdisposition               0
## koi_fpflag_nt                  0
## koi_fpflag_ss                  0
## koi_fpflag_co                  0
## koi_fpflag_ec                  0
## koi_period                     0
## koi_time0bk                    0
## koi_duration                   0
## koi_tce_delivname              0
## ra                             0
## dec                            0
#hapus variabel yang tidak relevan dan bernilai NaN semua
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data_new <- df_MLR %>%
  dplyr::select(-koi_teq_err1,-koi_teq_err2, -kepler_name, -koi_tce_delivname, -kepoi_name, -kepid, -rowid)
head(data_new)
##   koi_disposition koi_pdisposition koi_score koi_fpflag_nt koi_fpflag_ss
## 1       CONFIRMED        CANDIDATE     1.000             0             0
## 2       CONFIRMED        CANDIDATE     0.969             0             0
## 3  FALSE POSITIVE   FALSE POSITIVE     0.000             0             1
## 4  FALSE POSITIVE   FALSE POSITIVE     0.000             0             1
## 5       CONFIRMED        CANDIDATE     1.000             0             0
## 6       CONFIRMED        CANDIDATE     1.000             0             0
##   koi_fpflag_co koi_fpflag_ec koi_period koi_period_err1 koi_period_err2
## 1             0             0   9.488036       2.775e-05      -2.775e-05
## 2             0             0  54.418383       2.479e-04      -2.479e-04
## 3             0             0  19.899140       1.494e-05      -1.494e-05
## 4             0             0   1.736952       2.630e-07      -2.630e-07
## 5             0             0   2.525592       3.761e-06      -3.761e-06
## 6             0             0  11.094321       2.036e-05      -2.036e-05
##   koi_time0bk koi_time0bk_err1 koi_time0bk_err2 koi_impact koi_impact_err1
## 1    170.5387         0.002160        -0.002160      0.146           0.318
## 2    162.5138         0.003520        -0.003520      0.586           0.059
## 3    175.8503         0.000581        -0.000581      0.969           5.126
## 4    170.3076         0.000115        -0.000115      1.276           0.115
## 5    171.5956         0.001130        -0.001130      0.701           0.235
## 6    171.2012         0.001410        -0.001410      0.538           0.030
##   koi_impact_err2 koi_duration koi_duration_err1 koi_duration_err2 koi_depth
## 1          -0.146      2.95750           0.08190          -0.08190     615.8
## 2          -0.443      4.50700           0.11600          -0.11600     874.8
## 3          -0.077      1.78220           0.03410          -0.03410   10829.0
## 4          -0.092      2.40641           0.00537          -0.00537    8079.2
## 5          -0.478      1.65450           0.04200          -0.04200     603.3
## 6          -0.428      4.59450           0.06100          -0.06100    1517.5
##   koi_depth_err1 koi_depth_err2 koi_prad koi_prad_err1 koi_prad_err2 koi_teq
## 1           19.5          -19.5     2.26          0.26         -0.15     793
## 2           35.5          -35.5     2.83          0.32         -0.19     443
## 3          171.0         -171.0    14.60          3.92         -1.31     638
## 4           12.8          -12.8    33.46          8.50         -2.83    1395
## 5           16.9          -16.9     2.75          0.88         -0.35    1406
## 6           24.2          -24.2     3.90          1.27         -0.42     835
##   koi_insol koi_insol_err1 koi_insol_err2 koi_model_snr koi_tce_plnt_num
## 1     93.59          29.45         -16.65          35.8                1
## 2      9.11           2.87          -1.62          25.8                2
## 3     39.30          31.04         -10.49          76.3                1
## 4    891.96         668.95        -230.35         505.6                1
## 5    926.16         874.33        -314.24          40.9                1
## 6    114.81         112.85         -36.70          66.5                1
##   koi_steff koi_steff_err1 koi_steff_err2 koi_slogg koi_slogg_err1
## 1      5455             81            -81     4.467          0.064
## 2      5455             81            -81     4.467          0.064
## 3      5853            158           -176     4.544          0.044
## 4      5805            157           -174     4.564          0.053
## 5      6031            169           -211     4.438          0.070
## 6      6046            189           -232     4.486          0.054
##   koi_slogg_err2 koi_srad koi_srad_err1 koi_srad_err2       ra      dec
## 1         -0.096    0.927         0.105        -0.061 291.9342 48.14165
## 2         -0.096    0.927         0.105        -0.061 291.9342 48.14165
## 3         -0.176    0.868         0.233        -0.078 297.0048 48.13413
## 4         -0.168    0.791         0.201        -0.067 285.5346 48.28521
## 5         -0.210    1.046         0.334        -0.133 288.7549 48.22620
## 6         -0.229    0.972         0.315        -0.105 296.2861 48.22467
##   koi_kepmag
## 1     15.347
## 2     15.347
## 3     15.436
## 4     15.597
## 5     15.509
## 6     15.714
str(data_new)
## 'data.frame':    9564 obs. of  43 variables:
##  $ koi_disposition  : chr  "CONFIRMED" "CONFIRMED" "FALSE POSITIVE" "FALSE POSITIVE" ...
##  $ koi_pdisposition : chr  "CANDIDATE" "CANDIDATE" "FALSE POSITIVE" "FALSE POSITIVE" ...
##  $ koi_score        : num  1 0.969 0 0 1 1 1 0.992 0 1 ...
##  $ koi_fpflag_nt    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ koi_fpflag_ss    : int  0 0 1 1 0 0 0 0 1 0 ...
##  $ koi_fpflag_co    : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ koi_fpflag_ec    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ koi_period       : num  9.49 54.42 19.9 1.74 2.53 ...
##  $ koi_period_err1  : num  2.78e-05 2.48e-04 1.49e-05 2.63e-07 3.76e-06 ...
##  $ koi_period_err2  : num  -2.78e-05 -2.48e-04 -1.49e-05 -2.63e-07 -3.76e-06 ...
##  $ koi_time0bk      : num  171 163 176 170 172 ...
##  $ koi_time0bk_err1 : num  0.00216 0.00352 0.000581 0.000115 0.00113 0.00141 0.0019 0.00461 0.00253 0.000517 ...
##  $ koi_time0bk_err2 : num  -0.00216 -0.00352 -0.000581 -0.000115 -0.00113 -0.00141 -0.0019 -0.00461 -0.00253 -0.000517 ...
##  $ koi_impact       : num  0.146 0.586 0.969 1.276 0.701 ...
##  $ koi_impact_err1  : num  0.318 0.059 5.126 0.115 0.235 ...
##  $ koi_impact_err2  : num  -0.146 -0.443 -0.077 -0.092 -0.478 -0.428 -0.532 -0.523 -0.044 -0.052 ...
##  $ koi_duration     : num  2.96 4.51 1.78 2.41 1.65 ...
##  $ koi_duration_err1: num  0.0819 0.116 0.0341 0.00537 0.042 0.061 0.0673 0.165 0.136 0.0241 ...
##  $ koi_duration_err2: num  -0.0819 -0.116 -0.0341 -0.00537 -0.042 -0.061 -0.0673 -0.165 -0.136 -0.0241 ...
##  $ koi_depth        : num  616 875 10829 8079 603 ...
##  $ koi_depth_err1   : num  19.5 35.5 171 12.8 16.9 24.2 18.7 16.8 5.8 33.3 ...
##  $ koi_depth_err2   : num  -19.5 -35.5 -171 -12.8 -16.9 -24.2 -18.7 -16.8 -5.8 -33.3 ...
##  $ koi_prad         : num  2.26 2.83 14.6 33.46 2.75 ...
##  $ koi_prad_err1    : num  0.26 0.32 3.92 8.5 0.88 1.27 0.9 0.52 6.45 0.22 ...
##  $ koi_prad_err2    : num  -0.15 -0.19 -1.31 -2.83 -0.35 -0.42 -0.3 -0.17 -9.67 -0.49 ...
##  $ koi_teq          : num  793 443 638 1395 1406 ...
##  $ koi_insol        : num  93.59 9.11 39.3 891.96 926.16 ...
##  $ koi_insol_err1   : num  29.45 2.87 31.04 668.95 874.33 ...
##  $ koi_insol_err2   : num  -16.65 -1.62 -10.49 -230.35 -314.24 ...
##  $ koi_model_snr    : num  35.8 25.8 76.3 505.6 40.9 ...
##  $ koi_tce_plnt_num : int  1 2 1 1 1 1 2 3 1 1 ...
##  $ koi_steff        : num  5455 5455 5853 5805 6031 ...
##  $ koi_steff_err1   : num  81 81 158 157 169 189 189 189 111 75 ...
##  $ koi_steff_err2   : num  -81 -81 -176 -174 -211 -232 -232 -232 -124 -83 ...
##  $ koi_slogg        : num  4.47 4.47 4.54 4.56 4.44 ...
##  $ koi_slogg_err1   : num  0.064 0.064 0.044 0.053 0.07 0.054 0.054 0.054 0.182 0.083 ...
##  $ koi_slogg_err2   : num  -0.096 -0.096 -0.176 -0.168 -0.21 -0.229 -0.229 -0.229 -0.098 -0.028 ...
##  $ koi_srad         : num  0.927 0.927 0.868 0.791 1.046 ...
##  $ koi_srad_err1    : num  0.105 0.105 0.233 0.201 0.334 0.315 0.315 0.315 0.322 0.033 ...
##  $ koi_srad_err2    : num  -0.061 -0.061 -0.078 -0.067 -0.133 -0.105 -0.105 -0.105 -0.483 -0.072 ...
##  $ ra               : num  292 292 297 286 289 ...
##  $ dec              : num  48.1 48.1 48.1 48.3 48.2 ...
##  $ koi_kepmag       : num  15.3 15.3 15.4 15.6 15.5 ...
#penanganan missing value dengan kNN
#install.packages("VIM")

library(VIM)
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
imputed_data <- kNN(data_new, k=5, imp_var = FALSE)
head(imputed_data)
##   koi_disposition koi_pdisposition koi_score koi_fpflag_nt koi_fpflag_ss
## 1       CONFIRMED        CANDIDATE     1.000             0             0
## 2       CONFIRMED        CANDIDATE     0.969             0             0
## 3  FALSE POSITIVE   FALSE POSITIVE     0.000             0             1
## 4  FALSE POSITIVE   FALSE POSITIVE     0.000             0             1
## 5       CONFIRMED        CANDIDATE     1.000             0             0
## 6       CONFIRMED        CANDIDATE     1.000             0             0
##   koi_fpflag_co koi_fpflag_ec koi_period koi_period_err1 koi_period_err2
## 1             0             0   9.488036       2.775e-05      -2.775e-05
## 2             0             0  54.418383       2.479e-04      -2.479e-04
## 3             0             0  19.899140       1.494e-05      -1.494e-05
## 4             0             0   1.736952       2.630e-07      -2.630e-07
## 5             0             0   2.525592       3.761e-06      -3.761e-06
## 6             0             0  11.094321       2.036e-05      -2.036e-05
##   koi_time0bk koi_time0bk_err1 koi_time0bk_err2 koi_impact koi_impact_err1
## 1    170.5387         0.002160        -0.002160      0.146           0.318
## 2    162.5138         0.003520        -0.003520      0.586           0.059
## 3    175.8503         0.000581        -0.000581      0.969           5.126
## 4    170.3076         0.000115        -0.000115      1.276           0.115
## 5    171.5956         0.001130        -0.001130      0.701           0.235
## 6    171.2012         0.001410        -0.001410      0.538           0.030
##   koi_impact_err2 koi_duration koi_duration_err1 koi_duration_err2 koi_depth
## 1          -0.146      2.95750           0.08190          -0.08190     615.8
## 2          -0.443      4.50700           0.11600          -0.11600     874.8
## 3          -0.077      1.78220           0.03410          -0.03410   10829.0
## 4          -0.092      2.40641           0.00537          -0.00537    8079.2
## 5          -0.478      1.65450           0.04200          -0.04200     603.3
## 6          -0.428      4.59450           0.06100          -0.06100    1517.5
##   koi_depth_err1 koi_depth_err2 koi_prad koi_prad_err1 koi_prad_err2 koi_teq
## 1           19.5          -19.5     2.26          0.26         -0.15     793
## 2           35.5          -35.5     2.83          0.32         -0.19     443
## 3          171.0         -171.0    14.60          3.92         -1.31     638
## 4           12.8          -12.8    33.46          8.50         -2.83    1395
## 5           16.9          -16.9     2.75          0.88         -0.35    1406
## 6           24.2          -24.2     3.90          1.27         -0.42     835
##   koi_insol koi_insol_err1 koi_insol_err2 koi_model_snr koi_tce_plnt_num
## 1     93.59          29.45         -16.65          35.8                1
## 2      9.11           2.87          -1.62          25.8                2
## 3     39.30          31.04         -10.49          76.3                1
## 4    891.96         668.95        -230.35         505.6                1
## 5    926.16         874.33        -314.24          40.9                1
## 6    114.81         112.85         -36.70          66.5                1
##   koi_steff koi_steff_err1 koi_steff_err2 koi_slogg koi_slogg_err1
## 1      5455             81            -81     4.467          0.064
## 2      5455             81            -81     4.467          0.064
## 3      5853            158           -176     4.544          0.044
## 4      5805            157           -174     4.564          0.053
## 5      6031            169           -211     4.438          0.070
## 6      6046            189           -232     4.486          0.054
##   koi_slogg_err2 koi_srad koi_srad_err1 koi_srad_err2       ra      dec
## 1         -0.096    0.927         0.105        -0.061 291.9342 48.14165
## 2         -0.096    0.927         0.105        -0.061 291.9342 48.14165
## 3         -0.176    0.868         0.233        -0.078 297.0048 48.13413
## 4         -0.168    0.791         0.201        -0.067 285.5346 48.28521
## 5         -0.210    1.046         0.334        -0.133 288.7549 48.22620
## 6         -0.229    0.972         0.315        -0.105 296.2861 48.22467
##   koi_kepmag
## 1     15.347
## 2     15.347
## 3     15.436
## 4     15.597
## 5     15.509
## 6     15.714
na_counts <- sort(colSums(is.na(imputed_data)), decreasing = TRUE)
na_df <- data.frame(Missing_Values = na_counts)
print(na_df)
##                   Missing_Values
## koi_disposition                0
## koi_pdisposition               0
## koi_score                      0
## koi_fpflag_nt                  0
## koi_fpflag_ss                  0
## koi_fpflag_co                  0
## koi_fpflag_ec                  0
## koi_period                     0
## koi_period_err1                0
## koi_period_err2                0
## koi_time0bk                    0
## koi_time0bk_err1               0
## koi_time0bk_err2               0
## koi_impact                     0
## koi_impact_err1                0
## koi_impact_err2                0
## koi_duration                   0
## koi_duration_err1              0
## koi_duration_err2              0
## koi_depth                      0
## koi_depth_err1                 0
## koi_depth_err2                 0
## koi_prad                       0
## koi_prad_err1                  0
## koi_prad_err2                  0
## koi_teq                        0
## koi_insol                      0
## koi_insol_err1                 0
## koi_insol_err2                 0
## koi_model_snr                  0
## koi_tce_plnt_num               0
## koi_steff                      0
## koi_steff_err1                 0
## koi_steff_err2                 0
## koi_slogg                      0
## koi_slogg_err1                 0
## koi_slogg_err2                 0
## koi_srad                       0
## koi_srad_err1                  0
## koi_srad_err2                  0
## ra                             0
## dec                            0
## koi_kepmag                     0
#standarisasi
numeric_data <- imputed_data[, sapply(imputed_data, is.numeric)]
categorical_data <- imputed_data[, sapply(imputed_data, Negate(is.numeric))]

binary_vars <- names(numeric_data)[sapply(numeric_data, function(x) length(unique(x)) == 2)]

numeric_vars <- setdiff(names(numeric_data), binary_vars)

scaled_numeric <- scale(numeric_data[, numeric_vars])

# Gabungkan kembali: scaled + biner + kategorikal
final_data <- cbind(as.data.frame(scaled_numeric),
                    numeric_data[, binary_vars, drop = FALSE],
                    categorical_data)

# ENCODING LABEL
final_data$koi_disposition_enc <- as.integer(factor(final_data$koi_disposition,
                                                    levels = c("FALSE POSITIVE", "CANDIDATE", "CONFIRMED"))) - 1

final_data$koi_pdisposition_enc <- as.integer(factor(final_data$koi_pdisposition,
                                                     levels = c("FALSE POSITIVE", "CANDIDATE", "CONFIRMED"))) - 1

table(final_data$koi_disposition, final_data$koi_disposition_enc)
##                 
##                     0    1    2
##   CANDIDATE         0 2248    0
##   CONFIRMED         0    0 2293
##   FALSE POSITIVE 5023    0    0
table(final_data$koi_pdisposition, final_data$koi_pdisposition_enc)
##                 
##                     0    1
##   CANDIDATE         0 4496
##   FALSE POSITIVE 5068    0
head(final_data[, c("koi_disposition", "koi_disposition_enc",
                    "koi_pdisposition", "koi_pdisposition_enc")])
##   koi_disposition koi_disposition_enc koi_pdisposition koi_pdisposition_enc
## 1       CONFIRMED                   2        CANDIDATE                    1
## 2       CONFIRMED                   2        CANDIDATE                    1
## 3  FALSE POSITIVE                   0   FALSE POSITIVE                    0
## 4  FALSE POSITIVE                   0   FALSE POSITIVE                    0
## 5       CONFIRMED                   2        CANDIDATE                    1
## 6       CONFIRMED                   2        CANDIDATE                    1
final_data <- final_data%>%
  dplyr::select(-koi_disposition, -koi_pdisposition, -koi_pdisposition_enc)

#install.packages('MASS')

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
outlier <- mahalanobis(scaled_numeric, colMeans(scaled_numeric), cov(scaled_numeric))
threshold <- qchisq(p = 0.99, df = ncol(scaled_numeric))
sum(outlier > threshold)
## [1] 600
outlier_index <- which(outlier > threshold)
head(final_data[outlier_index, ])
##      koi_score  koi_period koi_period_err1 koi_period_err2 koi_time0bk
## 12   1.1724578 -0.05504173      -0.2573629       0.2575222 -0.65997344
## 30  -0.9577444  0.44575900      -0.1038511       0.1040320  0.63105059
## 202  1.1724578 -0.05353116      -0.2570476       0.2572069  0.06535999
## 230  1.1724578 -0.04715416      -0.2568226       0.2569819  0.15836231
## 250  1.1724578 -0.04856580      -0.2553441       0.2555037  0.08278379
## 251  1.1724578 -0.05169757      -0.2530879       0.2532478 -0.42320099
##     koi_time0bk_err1 koi_time0bk_err2   koi_impact koi_impact_err1
## 12        -0.4328947        0.4331666 -0.152137993      -0.1865190
## 30        -0.3642798        0.3645617 -0.006944545      -0.2033544
## 202       -0.4119216        0.4121967 -0.005726990      -0.1971633
## 230       -0.4215472        0.4218208  0.026538220      -0.2016165
## 250       -0.3797336        0.3800133 -0.156399435      -0.1802193
## 251       -0.2547786        0.2550765 -0.024294706      -0.1705525
##     koi_impact_err2 koi_duration koi_duration_err1 koi_duration_err2
## 12       0.08853701   -0.2677821        -0.5099980         0.5101836
## 30       0.26137959    7.3846862        -0.3149687         0.3151865
## 202      0.01071690   -0.3919007        -0.4155304         0.4157316
## 230      0.23844314   -0.5274323        -0.4477404         0.4479363
## 250      0.09345197   -0.4052051        -0.4478916         0.4480875
## 251     -0.09905043   -0.5727537        -0.3134565         0.3136745
##       koi_depth koi_depth_err1 koi_depth_err2    koi_prad koi_prad_err1
## 12  -0.20579166    -0.03056899     0.03058046 -0.02756301   -0.04216644
## 30  -0.06525387    -0.02264661     0.02265810 -0.02821230   -0.03872587
## 202 -0.26013783    -0.02653306     0.02654454 -0.03202848   -0.04276593
## 230 -0.20836451    -0.02077813     0.02078962 -0.03128314   -0.04174940
## 250 -0.26733195    -0.02381753     0.02382901 -0.03212124   -0.04313084
## 251 -0.28349904    -0.02349366     0.02350514 -0.03250551   -0.04373033
##     koi_prad_err2    koi_teq   koi_insol koi_insol_err1 koi_insol_err2
## 12     0.02636101  1.1232139 -0.02219936    -0.05901247     0.03809786
## 30     0.02653185 -1.0556407 -0.04870545    -0.07104560     0.04562884
## 202    0.02664290 -0.5862222 -0.04851805    -0.07028697     0.04544482
## 230    0.02631829 -0.7779896 -0.04865517    -0.07083022     0.04557924
## 250    0.02677103 -0.6991650 -0.04861402    -0.07062925     0.04553470
## 251    0.02695041 -0.5956341 -0.04852840    -0.07024578     0.04544816
##     koi_model_snr koi_tce_plnt_num  koi_steff koi_steff_err1 koi_steff_err2
## 12     7.28284217       -0.3612793  0.9343576     -1.4923649      1.0279538
## 30     0.49975558       -0.3612793 -0.2425452     -0.9743788      0.7192296
## 202   -0.12045839       -0.3612793 -2.3955097      7.2486508     -1.3576423
## 230   -0.05305217       -0.3612793 -2.2509553      8.9536885     -1.9470248
## 250   -0.23979425       -0.3612793 -1.9298656     11.4572881     -3.5608104
## 251   -0.30413074        1.1622107 -1.9298656     11.4572881     -3.5608104
##      koi_slogg koi_slogg_err1 koi_slogg_err2    koi_srad koi_srad_err1
## 12  -0.6964601     -0.6575892      1.3904090  0.04174523    -0.2849172
## 30   0.3552572     -0.6652649      0.2949655 -0.11944897    -0.2309724
## 202  1.0164039     -0.1049396      0.6012185 -0.19696857    -0.2684035
## 230  0.9552302     -0.1126153      0.7072292 -0.19297615    -0.2761099
## 250  0.8634696     -0.2354263      0.8603557 -0.18465859    -0.2739081
## 251  0.8634696     -0.2354263      0.8603557 -0.18465859    -0.2739081
##     koi_srad_err2          ra         dec koi_kepmag koi_fpflag_nt
## 12      0.1281354  0.03925539  1.15490354 -2.7441164             0
## 30      0.1569711  0.97310271  1.52892943  0.6802056             1
## 202     0.1262445  0.97212299 -0.32886512  1.0302891             0
## 230     0.1295535 -1.44602029  0.01830088  1.0721548             0
## 250     0.1286081  0.73829245  0.20424225  0.5726542             0
## 251     0.1286081  0.73829245  0.20424225  0.5726542             0
##     koi_fpflag_ss koi_fpflag_co koi_fpflag_ec koi_disposition_enc
## 12              1             0             0                   2
## 30              0             0             0                   0
## 202             0             0             0                   1
## 230             0             0             0                   1
## 250             0             0             0                   2
## 251             0             0             0                   2
final_selected <- final_data[-outlier_index, ]

### 4. FEATURE SELECTION - STEPWISE AIC ###

model_full <- lm(koi_disposition_enc ~ ., data = final_selected)
model_step <- stepAIC(model_full, direction = "both", trace = FALSE)

selected_vars <- names(coef(model_step))[-1]
print("Fitur terpilih:")
## [1] "Fitur terpilih:"
print(selected_vars)
##  [1] "koi_score"         "koi_period"        "koi_period_err1"  
##  [4] "koi_time0bk"       "koi_time0bk_err1"  "koi_time0bk_err2" 
##  [7] "koi_impact"        "koi_impact_err1"   "koi_impact_err2"  
## [10] "koi_duration"      "koi_duration_err2" "koi_depth"        
## [13] "koi_prad"          "koi_prad_err1"     "koi_teq"          
## [16] "koi_insol_err1"    "koi_tce_plnt_num"  "koi_steff"        
## [19] "koi_steff_err1"    "koi_slogg"         "koi_slogg_err1"   
## [22] "koi_slogg_err2"    "koi_srad"          "koi_srad_err1"    
## [25] "koi_srad_err2"     "ra"                "dec"              
## [28] "koi_fpflag_nt"     "koi_fpflag_ss"     "koi_fpflag_co"
final_selected_data <- final_selected[, c("koi_disposition_enc", selected_vars)]

library(tidyr)
library(ggplot2)
library(dplyr)

final_selected_data <- final_selected_data %>%
  mutate(Row = row_number())

long_data <- final_selected_data %>%
  pivot_longer(cols = all_of(selected_vars), names_to = "Variable", values_to = "Value") %>%
  mutate(Outlier = ifelse(Row %in% outlier_index, "Outlier", "Normal"))

# Buat boxplot
ggplot(long_data, aes(x = Variable, y = Value, fill = Outlier)) +
  geom_boxplot(outlier.shape = NA, position = position_dodge(width = 0.75)) +
  geom_jitter(aes(color = Outlier), width = 0.2, alpha = 0.4) +
  scale_fill_manual(values = c("Normal" = "grey80", "Outlier" = "red")) +
  scale_color_manual(values = c("Normal" = "black", "Outlier" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Boxplot per Variabel dengan Highlight Outlier (Mahalanobis)",
       y = "Value", x = "Variabel")

# Visualisasi boxplot
ggplot(long_data, aes(x = Variable, y = Value, fill = Outlier)) +
  geom_boxplot(outlier.shape = NA, position = position_dodge(width = 0.75)) +
  geom_jitter(aes(color = Outlier), width = 0.2, alpha = 0.4) +
  scale_fill_manual(values = c("Normal" = "grey80", "Outlier" = "red")) +
  scale_color_manual(values = c("Normal" = "black", "Outlier" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Boxplot per Variabel dengan Highlight Outlier (Mahalanobis)",
       y = "Value", x = "Variabel")

# PCA sebelum outlier dihapus
pca_before <- prcomp(scaled_numeric)
pca_before_df <- as.data.frame(pca_before$x[, 1:2])
pca_before_df$outlier <- outlier > threshold

# PCA sesudah outlier dihapus
scaled_numeric_clean <- final_selected_data[, sapply(final_selected_data, is.numeric)]

pca_after <- prcomp(scaled_numeric_clean)
pca_after_df <- as.data.frame(pca_after$x[, 1:2])

# Visualisasi
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
p1 <- ggplot(pca_before_df, aes(x = PC1, y = PC2, color = outlier)) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("black", "red")) +
  labs(title = "Sebelum Outlier Dihapus", color = "Outlier") +
  theme_minimal()

p2 <- ggplot(pca_after_df, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.6, color = "blue") +
  labs(title = "Setelah Outlier Dihapus") +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, nrow = 1)

ncol(final_selected_data)
## [1] 32
names(final_selected_data)
##  [1] "koi_disposition_enc" "koi_score"           "koi_period"         
##  [4] "koi_period_err1"     "koi_time0bk"         "koi_time0bk_err1"   
##  [7] "koi_time0bk_err2"    "koi_impact"          "koi_impact_err1"    
## [10] "koi_impact_err2"     "koi_duration"        "koi_duration_err2"  
## [13] "koi_depth"           "koi_prad"            "koi_prad_err1"      
## [16] "koi_teq"             "koi_insol_err1"      "koi_tce_plnt_num"   
## [19] "koi_steff"           "koi_steff_err1"      "koi_slogg"          
## [22] "koi_slogg_err1"      "koi_slogg_err2"      "koi_srad"           
## [25] "koi_srad_err1"       "koi_srad_err2"       "ra"                 
## [28] "dec"                 "koi_fpflag_nt"       "koi_fpflag_ss"      
## [31] "koi_fpflag_co"       "Row"
final_selected_data$kelas <- final_selected_data$koi_disposition_enc
length(final_selected_data$kelas)  # Harus 8964
## [1] 8964
#IQR
iqr_zero <- sapply(final_selected_data[, 1:32], function(x) IQR(x) == 0)
which(iqr_zero)
## koi_tce_plnt_num    koi_fpflag_nt    koi_fpflag_ss    koi_fpflag_co 
##               18               29               30               31
datacek <- final_selected_data[, !iqr_zero]

ncol(datacek)
## [1] 29
names(datacek)
##  [1] "koi_disposition_enc" "koi_score"           "koi_period"         
##  [4] "koi_period_err1"     "koi_time0bk"         "koi_time0bk_err1"   
##  [7] "koi_time0bk_err2"    "koi_impact"          "koi_impact_err1"    
## [10] "koi_impact_err2"     "koi_duration"        "koi_duration_err2"  
## [13] "koi_depth"           "koi_prad"            "koi_prad_err1"      
## [16] "koi_teq"             "koi_insol_err1"      "koi_steff"          
## [19] "koi_steff_err1"      "koi_slogg"           "koi_slogg_err1"     
## [22] "koi_slogg_err2"      "koi_srad"            "koi_srad_err1"      
## [25] "koi_srad_err2"       "ra"                  "dec"                
## [28] "Row"                 "kelas"
#install.packages("caret")
#install.packages("future")
#installed.packages()["caret", ]

#FEATURE SELECTION dengan Korelasi

library(caret, verbose = TRUE)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
cor_matrix <- cor(datacek)
high_corr <- findCorrelation(cor_matrix, cutoff = 0.9)
data_final_for_mvn <- datacek[, -high_corr]

ncol(data_final_for_mvn)
## [1] 26
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
current_data <- data_final_for_mvn

repeat {
  model <- lm(kelas ~ ., data = current_data)

  vif_values <- vif(model)

  max_vif <- max(vif_values)
  if (max_vif <= 5) break

  feature_to_remove <- names(which.max(vif_values))
  cat("Menghapus:", feature_to_remove, "dengan VIF =", max_vif, "\n")

  current_data <- current_data[, !(names(current_data) %in% feature_to_remove)]
}
## Menghapus: koi_srad dengan VIF = 22.78738 
## Menghapus: koi_slogg dengan VIF = 5.583075
#install.packages("rrcov")
#install.packages("biotools")

library(rrcov)
## Warning: package 'rrcov' was built under R version 4.4.3
## Loading required package: robustbase
## Warning: package 'robustbase' was built under R version 4.4.3
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
length(data_final_for_mvn)
## [1] 26
ncol(data_final_for_mvn)
## [1] 26
length(data_final_for_mvn$kelas)     # Cek jumlah baris dari kolom `kelas`
## [1] 8964
"kelas" %in% names(data_final_for_mvn)  # Apakah kolom `kelas` ada?
## [1] TRUE
str(data_final_for_mvn$kelas)        # Lihat struktur kolom `kelas`
##  num [1:8964] 2 2 0 0 2 2 2 2 0 2 ...
"kelas" %in% names(current_data)  # Harus TRUE
## [1] TRUE
str(current_data)
## 'data.frame':    8964 obs. of  24 variables:
##  $ koi_score        : num  1.172 1.106 -0.958 -0.958 1.172 ...
##  $ koi_period       : num  -0.0496 -0.0159 -0.0418 -0.0554 -0.0548 ...
##  $ koi_period_err1  : num  -0.254 -0.227 -0.256 -0.257 -0.257 ...
##  $ koi_time0bk      : num  0.0641 -0.054 0.1423 0.0607 0.0797 ...
##  $ koi_time0bk_err2 : num  0.339 0.278 0.408 0.429 0.384 ...
##  $ koi_impact       : num  -0.17588 -0.04195 0.07463 0.16808 -0.00694 ...
##  $ koi_impact_err1  : num  -0.169 -0.197 0.353 -0.191 -0.178 ...
##  $ koi_impact_err2  : num  0.1459 -0.0974 0.2024 0.1901 -0.1261 ...
##  $ koi_duration     : num  -0.412 -0.172 -0.593 -0.497 -0.613 ...
##  $ koi_duration_err2: num  0.389 0.338 0.462 0.505 0.45 ...
##  $ koi_depth        : num  -0.28 -0.277 -0.155 -0.189 -0.281 ...
##  $ koi_prad_err1    : num  -0.0436 -0.0434 -0.0341 -0.0221 -0.042 ...
##  $ koi_teq          : num  -0.353 -0.765 -0.536 0.355 0.368 ...
##  $ koi_insol_err1   : num  -0.0705 -0.071 -0.0705 -0.0587 -0.0549 ...
##  $ koi_steff        : num  -0.326 -0.326 0.183 0.122 0.411 ...
##  $ koi_steff_err1   : num  -1.384 -1.384 0.277 0.256 0.515 ...
##  $ koi_slogg_err1   : num  -0.42 -0.42 -0.573 -0.504 -0.374 ...
##  $ koi_slogg_err2   : num  0.578 0.578 -0.365 -0.27 -0.765 ...
##  $ koi_srad_err1    : num  -0.2783 -0.2783 -0.1374 -0.1726 -0.0262 ...
##  $ koi_srad_err2    : num  0.151 0.151 0.143 0.148 0.117 ...
##  $ ra               : num  -0.0264 -0.0264 1.0373 -1.369 -0.6934 ...
##  $ dec              : num  1.2 1.2 1.2 1.24 1.23 ...
##  $ Row              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ kelas            : num  2 2 0 0 2 2 2 2 0 2 ...
ncol(current_data)
## [1] 24
#ga transformed
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
mvn_result <- mvn(current_data[, -which(names(current_data) == "kelas")], mvnTest = "hz")
print(mvn_result$multivariateNormality)
##            Test       HZ p value MVN
## 1 Henze-Zirkler 10.46139       0  NO
print(mvn_result$univariateNormality)
##                Test          Variable Statistic   p value Normality
## 1  Anderson-Darling     koi_score     1221.0027  <0.001      NO    
## 2  Anderson-Darling    koi_period     1674.9582  <0.001      NO    
## 3  Anderson-Darling  koi_period_err1  2419.1548  <0.001      NO    
## 4  Anderson-Darling    koi_time0bk    1398.9249  <0.001      NO    
## 5  Anderson-Darling koi_time0bk_err2   895.9801  <0.001      NO    
## 6  Anderson-Darling    koi_impact      203.4474  <0.001      NO    
## 7  Anderson-Darling  koi_impact_err1  2802.2317  <0.001      NO    
## 8  Anderson-Darling  koi_impact_err2   240.7686  <0.001      NO    
## 9  Anderson-Darling   koi_duration     696.0201  <0.001      NO    
## 10 Anderson-Darling koi_duration_err2 1000.8796  <0.001      NO    
## 11 Anderson-Darling     koi_depth     2495.1307  <0.001      NO    
## 12 Anderson-Darling   koi_prad_err1   2037.9700  <0.001      NO    
## 13 Anderson-Darling      koi_teq       235.5159  <0.001      NO    
## 14 Anderson-Darling  koi_insol_err1   2215.4798  <0.001      NO    
## 15 Anderson-Darling     koi_steff      131.2111  <0.001      NO    
## 16 Anderson-Darling  koi_steff_err1    244.3561  <0.001      NO    
## 17 Anderson-Darling  koi_slogg_err1    769.9105  <0.001      NO    
## 18 Anderson-Darling  koi_slogg_err2     59.1611  <0.001      NO    
## 19 Anderson-Darling   koi_srad_err1    592.1514  <0.001      NO    
## 20 Anderson-Darling   koi_srad_err2   1527.2079  <0.001      NO    
## 21 Anderson-Darling        ra           43.3719  <0.001      NO    
## 22 Anderson-Darling        dec          58.5571  <0.001      NO    
## 23 Anderson-Darling        Row          99.6392  <0.001      NO
#install.packages("bestNormalize")

library(bestNormalize)
## Warning: package 'bestNormalize' was built under R version 4.4.3
## 
## Attaching package: 'bestNormalize'
## The following object is masked from 'package:MASS':
## 
##     boxcox
# Pisahkan kolom kelas
kelas_col <- current_data$kelas
data_to_transform <- current_data[, names(current_data) != "kelas"]
data_transformed_part <- as.data.frame(lapply(data_to_transform, function(x) bestNormalize(x)$x.t))
## Warning: `progress_estimated()` was deprecated in dplyr 1.0.0.
## ℹ The deprecated feature was likely used in the bestNormalize package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
data_transformed <- cbind(data_transformed_part, kelas = kelas_col)

ncol(data_transformed)
## [1] 24
#uji multivariate normalitas HZ test
library(MVN)

mvn_result <- mvn(data_transformed[, -which(names(data_transformed) == "kelas")], mvnTest = "hz")
print(mvn_result$multivariateNormality)
##            Test       HZ p value MVN
## 1 Henze-Zirkler 1.838561       0  NO
print(mvn_result$univariateNormality)
##                Test          Variable Statistic   p value Normality
## 1  Anderson-Darling     koi_score      553.7566  <0.001      NO    
## 2  Anderson-Darling    koi_period        0.0002     1        YES   
## 3  Anderson-Darling  koi_period_err1     0.0091     1        YES   
## 4  Anderson-Darling    koi_time0bk       0.0002     1        YES   
## 5  Anderson-Darling koi_time0bk_err2     0.0129     1        YES   
## 6  Anderson-Darling    koi_impact        0.0651  0.9996      YES   
## 7  Anderson-Darling  koi_impact_err1     0.2552  0.7273      YES   
## 8  Anderson-Darling  koi_impact_err2     0.2658  0.6918      YES   
## 9  Anderson-Darling   koi_duration       0.0003     1        YES   
## 10 Anderson-Darling koi_duration_err2    0.0743  0.9992      YES   
## 11 Anderson-Darling     koi_depth        0.0059     1        YES   
## 12 Anderson-Darling   koi_prad_err1      5.5044  <0.001      NO    
## 13 Anderson-Darling      koi_teq         0.0111     1        YES   
## 14 Anderson-Darling  koi_insol_err1      0.2591  0.7145      YES   
## 15 Anderson-Darling     koi_steff        0.3902  0.3821      YES   
## 16 Anderson-Darling  koi_steff_err1      0.6482  0.0908      YES   
## 17 Anderson-Darling  koi_slogg_err1      0.7421  0.0532      YES   
## 18 Anderson-Darling  koi_slogg_err2      0.5005   0.208      YES   
## 19 Anderson-Darling   koi_srad_err1      0.0541  0.9998      YES   
## 20 Anderson-Darling   koi_srad_err2      0.1462  0.9676      YES   
## 21 Anderson-Darling        ra            0.0005     1        YES   
## 22 Anderson-Darling        dec           0.0004     1        YES   
## 23 Anderson-Darling        Row           0.0002     1        YES
#UJI HOMOGEN
library(biotools)

# kelas hanya sebagai grouping
boxM(data_transformed[, -which(names(data_transformed) == "kelas")],
     grouping = data_transformed$kelas)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_transformed[, -which(names(data_transformed) == "kelas")]
## Chi-Sq (approx.) = 40304, df = 552, p-value < 2.2e-16
hasil_boxM <- boxM(current_data[, 1:22], grouping = current_data$kelas)

print(hasil_boxM)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  current_data[, 1:22]
## Chi-Sq (approx.) = 99113, df = 506, p-value < 2.2e-16
hasil_boxM$p.value
## [1] 0
###########################ANALISIS DISKRIMINAN#########################
set.seed(123)
train_index <- sample(nrow(data_transformed), size = floor(0.75 * nrow(data_transformed)), replace = FALSE)
training_data <- data_transformed[train_index, ]
test_data <- data_transformed[-train_index, ]

#masuk fungsi diskriminan
linearDA <- lda(formula = kelas ~., data = training_data)
linearDA
## Call:
## lda(kelas ~ ., data = training_data)
## 
## Prior probabilities of groups:
##         0         1         2 
## 0.5061728 0.2411126 0.2527146 
## 
## Group means:
##    koi_score  koi_period koi_period_err1 koi_time0bk koi_time0bk_err2
## 0 -0.6839720 -0.15086598    -0.195069269  -0.1400124       0.07781727
## 1  0.5762906  0.24135010     0.408379972   0.1280168      -0.43851852
## 2  0.9312293  0.07605598     0.009295873   0.1392560       0.23311219
##   koi_impact koi_impact_err1 koi_impact_err2 koi_duration koi_duration_err2
## 0  0.2740853     -0.01269439       0.2015503   0.11276005        0.04458291
## 1 -0.2207740      0.04194091      -0.2171955  -0.13410525       -0.41033633
## 2 -0.3432967     -0.04676345      -0.2192614  -0.07238414        0.28586718
##      koi_depth koi_prad_err1    koi_teq koi_insol_err1   koi_steff
## 0  0.164591477     0.2882493  0.2308983      0.2641781  0.19841377
## 1 -0.366797448    -0.2386574 -0.2485578     -0.2202241 -0.06448707
## 2 -0.009891373    -0.3578376 -0.2044547     -0.3040914 -0.31627542
##   koi_steff_err1 koi_slogg_err1 koi_slogg_err2 koi_srad_err1 koi_srad_err2
## 0      0.3677895     0.12609774    -0.25145637    0.29465025   -0.18160134
## 1     -0.1464673    -0.02689074     0.04295088   -0.02652089    0.01545131
## 2     -0.6061518    -0.22920823     0.49715852   -0.56120799    0.34682786
##            ra         dec        Row
## 0  0.12853300 -0.09118499  0.3624977
## 1 -0.04068517 -0.00388492  0.1154796
## 2 -0.24374048  0.16420387 -0.8167461
## 
## Coefficients of linear discriminants:
##                           LD1         LD2
## koi_score          2.47576492 -0.69217873
## koi_period         0.11483091 -0.44913051
## koi_period_err1    0.14808596  0.75787795
## koi_time0bk       -0.10317538  0.01315609
## koi_time0bk_err2   0.06114391  0.76896924
## koi_impact        -0.17997726 -0.46808872
## koi_impact_err1    0.04412994  0.23074621
## koi_impact_err2   -0.18151638 -0.56664690
## koi_duration       0.20400784  0.64587005
## koi_duration_err2  0.26032490  0.48654323
## koi_depth         -0.13983395  0.35350133
## koi_prad_err1      0.01383702 -0.21484327
## koi_teq            0.06195695  0.26077490
## koi_insol_err1     0.03944878  0.13631152
## koi_steff          0.06031380  0.02007047
## koi_steff_err1    -0.12580034 -0.15429933
## koi_slogg_err1     0.16123525  0.14942134
## koi_slogg_err2     0.03902269  0.03015204
## koi_srad_err1     -0.21517507  0.10358647
## koi_srad_err2      0.09891657  0.34129737
## ra                -0.02043286 -0.12930683
## dec                0.07391155  0.14058807
## Row               -0.16808103 -0.57376638
## 
## Proportion of trace:
##   LD1   LD2 
## 0.952 0.048
plot(linearDA, col = as.integer(training_data$kelas))

#ggplot
library(MASS)
library(ggplot2)

lda_model <- lda(kelas ~ ., data = training_data)
lda_pred <- predict(lda_model, test_data)

plot_df <- data.frame(lda_pred$x, kelas = as.factor(test_data$kelas))
ggplot(plot_df, aes(x = LD1, y = LD2, color = kelas)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(title = "LDA Plot", x = "LD1", y = "LD2")

predicted <- predict(object = linearDA, newdata = test_data)
predicted$class
##    [1] 2 2 0 2 0 2 0 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 2 2 2 0 0
##   [38] 2 0 2 2 2 2 2 0 2 2 1 0 2 2 2 0 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2
##   [75] 0 0 0 2 2 2 2 0 0 2 0 0 0 0 2 2 0 0 2 2 2 0 2 0 0 0 0 1 2 0 2 0 2 2 2 0 0
##  [112] 0 2 1 2 2 0 2 0 0 2 2 2 0 0 2 2 2 0 2 0 0 2 2 2 0 1 0 2 2 0 0 0 1 2 2 2 2
##  [149] 1 0 2 2 2 2 0 0 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 0 2 0 2 2 0 2 0 0 2 0 0 0 0
##  [186] 2 0 2 2 0 0 2 2 0 0 0 2 2 0 0 0 2 2 2 2 2 0 2 2 0 2 2 2 0 0 2 2 1 1 2 2 0
##  [223] 0 2 0 0 0 2 2 2 0 0 0 2 0 0 0 0 2 1 2 2 2 0 2 2 1 2 2 2 2 2 2 2 2 0 0 0 2
##  [260] 2 1 2 0 2 2 0 2 0 2 2 2 2 1 2 2 2 2 2 2 0 2 1 2 1 0 0 2 2 2 0 0 2 0 0 2 0
##  [297] 2 2 2 1 2 2 2 0 2 2 1 0 2 2 0 2 2 2 2 0 0 0 1 2 0 2 0 0 1 2 2 2 2 2 2 2 1
##  [334] 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 2 2 2 2 2
##  [371] 2 2 2 2 2 2 2 2 0 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 0 2 2
##  [408] 1 2 2 2 1 1 0 2 2 2 2 2 2 2 2 2 1 2 2 2 0 2 2 2 2 1 2 2 2 2 0 2 2 2 2 2 2
##  [445] 2 1 2 2 2 2 2 1 2 2 1 2 2 2 1 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 0 2 2
##  [482] 2 1 2 2 0 2 2 2 0 2 1 2 2 1 1 2 2 2 2 1 0 1 1 2 2 0 0 2 0 2 2 2 1 2 1 2 1
##  [519] 0 0 2 2 2 2 2 1 2 0 2 2 2 2 2 0 0 1 2 1 2 0 0 0 2 0 2 0 2 0 0 2 2 2 0 2 2
##  [556] 2 2 2 1 1 2 0 2 2 2 1 1 2 0 0 1 1 2 0 2 0 2 0 1 2 1 0 0 2 0 0 0 0 1 0 2 1
##  [593] 2 2 2 2 2 0 2 1 2 2 0 1 2 1 2 1 2 0 0 2 1 2 2 0 0 0 2 2 2 0 0 1 2 2 2 2 1
##  [630] 2 1 2 2 2 2 2 2 2 0 0 0 0 1 1 2 2 2 2 2 0 1 2 2 2 0 2 2 2 0 2 2 1 2 0 1 0
##  [667] 2 2 0 1 0 2 1 0 2 1 1 0 2 1 2 2 2 1 1 2 0 0 2 1 2 1 1 2 2 2 0 2 2 2 0 1 2
##  [704] 2 1 2 2 0 2 2 2 0 2 2 0 2 2 0 0 2 2 2 2 0 2 2 1 2 1 2 2 0 2 2 0 2 0 1 2 2
##  [741] 2 2 2 2 2 0 2 0 1 2 2 1 2 2 2 2 2 2 1 0 2 2 2 1 2 2 2 0 0 2 0 2 2 2 0 0 2
##  [778] 2 0 0 2 2 0 1 2 2 2 0 2 2 2 2 1 0 0 2 2 2 1 2 2 0 0 2 1 0 1 1 2 2 0 2 2 2
##  [815] 2 2 1 2 1 2 2 2 0 0 1 0 1 0 2 1 2 2 2 2 1 0 2 1 0 2 0 2 2 2 0 2 2 2 0 2 2
##  [852] 2 1 0 2 0 2 1 2 2 2 2 2 2 2 0 2 2 2 1 2 2 2 2 2 0 0 2 2 1 0 1 0 1 1 0 1 2
##  [889] 0 2 0 0 0 0 1 1 0 0 1 0 1 1 1 1 0 1 2 0 0 0 0 0 0 0 0 2 1 1 0 0 0 0 2 0 0
##  [926] 2 0 2 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 2 2 1 0 0 2 0 0 1 1 1 1 0 0 0 1 1 0 2
##  [963] 1 0 1 1 1 2 1 0 0 0 0 2 1 2 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0
## [1000] 1 1 0 0 0 1 1 0 0 0 2 1 1 1 2 0 0 2 0 0 1 1 1 0 0 0 0 0 1 1 1 2 0 0 1 1 0
## [1037] 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 2 0 1 1 0 1 0 0 1 0 0 0 1 0 0
## [1074] 0 1 0 1 0 2 2 0 0 1 0 0 0 1 1 0 0 0 1 1 1 1 1 2 1 1 1 0 1 0 2 0 1 0 2 1 0
## [1111] 0 1 0 0 2 0 0 1 1 0 0 1 1 0 0 1 2 1 0 1 0 1 0 1 0 0 0 1 1 2 1 0 0 0 0 1 0
## [1148] 1 1 1 1 2 1 0 1 1 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 2 1 1 0 0 1 0 0 0 0 0 0 0
## [1185] 1 0 2 0 1 0 1 1 1 0 0 0 2 0 1 2 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
## [1222] 1 1 1 0 0 1 2 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0 1 0 1 1 0 0
## [1259] 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 2 0 0 0 0 0 2 0 1 0 0 2 0 0 0 0 1 0 1 1 0 0
## [1296] 0 2 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 2 0 0 0 0 0 1 0 0 0 0 2 2 0 0 0 0 0 0 0
## [1333] 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 1 2 0 0 0 0 0
## [1370] 0 0 0 0 0 2 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 2 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
## [1407] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 2 0 1 0 0 0 1 0 0 0 1 0 0 0
## [1444] 1 0 0 0 0 0 0 0 0 2 0 1 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 1 0 1
## [1481] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0
## [1518] 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0
## [1555] 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 1 1 2 0 0 0 0 0 1 0 0 0
## [1592] 1 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0
## [1629] 1 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
## [1666] 0 0 0 1 1 1 1 0 1 0 0 2 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
## [1703] 0 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1
## [1740] 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1
## [1777] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0
## [1814] 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1851] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## [1888] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 2 0 1 0 2 0
## [1925] 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## [1962] 0 0 0 0 0 0 0 1 0 0 0 2 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0
## [1999] 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
## [2036] 1 1 0 0 1 1 0 0 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [2073] 0 2 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1
## [2110] 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0
## [2147] 1 0 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1
## [2184] 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0
## [2221] 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 1
## Levels: 0 1 2
table(actual = test_data$kelas, predicted = predicted$class)
##       predicted
## actual    0    1    2
##      0 1107   36    0
##      1   31  348  142
##      2   10  103  464
conf_matrix <- table(actual = test_data$kelas, predicted = predicted$class)
print(conf_matrix)
##       predicted
## actual    0    1    2
##      0 1107   36    0
##      1   31  348  142
##      2   10  103  464
# Hit ratio (akurasi)
hit_ratio <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Hit Ratio (Akurasi):", hit_ratio, "\n")
## Hit Ratio (Akurasi): 0.8563141
# === BIPLOT ===

scaling_df <- as.data.frame(lda_model$scaling)
scaling_df$variable <- rownames(scaling_df)
scaling_df$contribution <- sqrt(scaling_df$LD1^2 + ifelse("LD2" %in% colnames(scaling_df), scaling_df$LD2^2, 0))

scaling_sorted <- scaling_df[order(-scaling_df$contribution), ]

top_scaling_df <- scaling_sorted[1:5, ]
top_scaling_df$LD1 <- top_scaling_df$LD1 * 5  # scaling biar panah terlihat
top_scaling_df$LD2 <- ifelse("LD2" %in% colnames(top_scaling_df), top_scaling_df$LD2 * 5, 0)

ggplot(plot_df, aes(x = LD1, y = LD2, color = kelas)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_segment(data = top_scaling_df,
               aes(x = 0, y = 0, xend = LD1, yend = LD2),
               arrow = arrow(length = unit(0.2, "cm")),
               color = "black") +
  geom_text_repel(data = top_scaling_df,
                  aes(x = LD1, y = LD2, label = variable),
                  size = 3.5,
                  color = "black") +
  theme_minimal() +
  labs(title = "LDA Biplot (Top 5 Fitur)", x = "LD1", y = "LD2")