# ==========================================================
# 1. INSTALL & LOAD PACKAGES
# ==========================================================
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 4.5.3
pacman::p_load(tidyverse, nnet, car, broom)

# ==========================================================
# 2. LOAD DATASET
# ==========================================================
# Memastikan kolom teks dibaca sebagai factor
df_pm <- read.csv("predictive_maintenance (1).csv", stringsAsFactors = TRUE)

# Membersihkan nama kolom agar tidak ada spasi atau karakter spesial
colnames(df_pm) <- make.names(colnames(df_pm))

# Menampilkan struktur data
str(df_pm)
## 'data.frame':    10000 obs. of  10 variables:
##  $ UDI                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Product.ID             : Factor w/ 10000 levels "H29424","H29425",..: 7004 1004 1005 1006 1007 7005 1008 1009 7006 7007 ...
##  $ Type                   : Factor w/ 3 levels "H","L","M": 3 2 2 2 2 3 2 2 3 3 ...
##  $ Air.temperature..K.    : num  298 298 298 298 298 ...
##  $ Process.temperature..K.: num  309 309 308 309 309 ...
##  $ Rotational.speed..rpm. : int  1551 1408 1498 1433 1408 1425 1558 1527 1667 1741 ...
##  $ Torque..Nm.            : num  42.8 46.3 49.4 39.5 40 41.9 42.4 40.2 28.6 28 ...
##  $ Tool.wear..min.        : int  0 3 5 7 9 11 14 16 18 21 ...
##  $ Target                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Failure.Type           : Factor w/ 6 levels "Heat Dissipation Failure",..: 2 2 2 2 2 2 2 2 2 2 ...
# ==========================================================
# 3. PENGECEKAN TARGET (Failure.Type)
# ==========================================================
# Menghapus baris jika ada Failure.Type yang tidak jelas (jika ada)
# Menampilkan jumlah kategori (Nominal > 2)
table(df_pm$Failure.Type)
## 
## Heat Dissipation Failure               No Failure       Overstrain Failure 
##                      112                     9652                       78 
##            Power Failure          Random Failures        Tool Wear Failure 
##                       95                       18                       45
# ==========================================================
# 4. UJI ASUMSI MULTIKOLINEARITAS (VIF)
# ==========================================================
# Kita menggunakan model linear bantuan untuk mengecek nilai VIF
# Mengecek apakah variabel suhu, kecepatan, torsi, dll saling tumpang tindih
vif_model <- lm(as.numeric(Failure.Type) ~ Type + Air.temperature..K. + 
                Process.temperature..K. + Rotational.speed..rpm. + 
                Torque..Nm. + Tool.wear..min., data = df_pm)

print("--- Hasil Uji VIF ---")
## [1] "--- Hasil Uji VIF ---"
vif(vif_model)
##                             GVIF Df GVIF^(1/(2*Df))
## Type                    1.000842  2        1.000211
## Air.temperature..K.     4.304971  1        2.074842
## Process.temperature..K. 4.302882  1        2.074339
## Rotational.speed..rpm.  4.270917  1        2.066620
## Torque..Nm.             4.269660  1        2.066316
## Tool.wear..min.         1.000306  1        1.000153
# Interpretasi: Jika VIF < 5 atau 10, maka asumsi terpenuhi.

# ==========================================================
# 5. UJI LINEARITAS LOGIT (BOX-TIDWELL)
# ==========================================================
# Mengecek apakah variabel numerik linear dengan logitnya
# Kita buat variabel interaksi: Variabel * log(Variabel)
df_linear <- df_pm %>%
  # Filter nilai 0 pada Tool.wear agar tidak error saat di-log
  filter(Tool.wear..min. > 0) %>% 
  mutate(
    air_temp_log = Air.temperature..K. * log(Air.temperature..K.),
    torque_log = Torque..Nm. * log(Torque..Nm.),
    tool_wear_log = Tool.wear..min. * log(Tool.wear..min.)
  )

# Lakukan cek pada salah satu kategori utama kegagalan (misal: Power Failure)
# Jika p-value variabel *_log > 0.05, maka asumsi linearitas terpenuhi
model_linear_check <- glm(I(Failure.Type == "Power Failure") ~ 
                          Air.temperature..K. + air_temp_log + 
                          Torque..Nm. + torque_log, 
                          data = df_linear, family = binomial)

print("--- Hasil Uji Linearitas (Box-Tidwell) ---")
## [1] "--- Hasil Uji Linearitas (Box-Tidwell) ---"
summary(model_linear_check)
## 
## Call:
## glm(formula = I(Failure.Type == "Power Failure") ~ Air.temperature..K. + 
##     air_temp_log + Torque..Nm. + torque_log, family = binomial, 
##     data = df_linear)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         4649.15939 6867.45180   0.677    0.498    
## Air.temperature..K. -102.40052  153.42395  -0.667    0.504    
## air_temp_log          15.25347   22.88522   0.667    0.505    
## Torque..Nm.           -5.24687    0.41740 -12.570   <2e-16 ***
## torque_log             1.14412    0.09098  12.576   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1043.61  on 9879  degrees of freedom
## Residual deviance:  253.92  on 9875  degrees of freedom
## AIC: 263.92
## 
## Number of Fisher Scoring iterations: 11
# ==========================================================
# 6. UJI OUTLIER (Z-SCORE)
# ==========================================================
# Mengecek apakah ada data ekstrem pada variabel numerik utama (misal: Torque)
mean_torque <- mean(df_pm$Torque..Nm., na.rm = TRUE)
sd_torque <- sd(df_pm$Torque..Nm., na.rm = TRUE)

outliers <- which(abs(df_pm$Torque..Nm. - mean_torque) > 3 * sd_torque)

cat("Jumlah outlier yang ditemukan pada variabel Torque:", length(outliers), "\n")
## Jumlah outlier yang ditemukan pada variabel Torque: 25
# Opsional: Visualisasi Boxplot untuk melihat outlier secara manual
boxplot(df_pm$Torque..Nm., main="Boxplot Torque untuk Deteksi Outlier", col="orange")

# ==========================================================
# 1. TRANSFORMASI DATA (LOG TRANSFORMATION)
# ==========================================================

# Kita buat kolom baru 'Log_Torque'
# Catatan: Torsi di dataset ini tidak ada yang nol, jadi aman untuk di-log
df_pm <- df_pm %>%
  mutate(Log_Torque = log(Torque..Nm.))

# ==========================================================
# 2. CEK ULANG LINEARITAS (BOX-TIDWELL) SETELAH TRANSFORMASI
# ==========================================================

# Kita buat variabel interaksi baru untuk Log_Torque
df_linear_check <- df_pm %>%
  mutate(log_torque_interaction = Log_Torque * log(Log_Torque))

# Jalankan model pengecekan pada kategori yang sama (Power Failure)
model_transformed_check <- glm(I(Failure.Type == "Power Failure") ~ 
                               Log_Torque + log_torque_interaction, 
                               data = df_linear_check, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print("--- Hasil Uji Linearitas Setelah Transformasi ---")
## [1] "--- Hasil Uji Linearitas Setelah Transformasi ---"
summary(model_transformed_check)
## 
## Call:
## glm(formula = I(Failure.Type == "Power Failure") ~ Log_Torque + 
##     log_torque_interaction, family = binomial, data = df_linear_check)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              506.48      41.67   12.15   <2e-16 ***
## Log_Torque              -343.83      28.23  -12.18   <2e-16 ***
## log_torque_interaction   155.58      12.78   12.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1073.8  on 9999  degrees of freedom
## Residual deviance:  255.4  on 9997  degrees of freedom
## AIC: 261.4
## 
## Number of Fisher Scoring iterations: 11
# ==========================================================
# 3. VISUALISASI PERBANDINGAN (OPSIONAL)
# ==========================================================
par(mfrow=c(1,2))
hist(df_pm$Torque..Nm., main="Original Torque", col="gray")
hist(df_pm$Log_Torque, main="Log Transformed Torque", col="lightblue")

# ==========================================================
# 7. PEMODELAN MULTINOMIAL LOGISTIC REGRESSION (FINAL)
# ==========================================================

# Set Referensi ke "No Failure"
df_pm$Failure.Type <- relevel(df_pm$Failure.Type, ref = "No Failure")

# Menjalankan Model dengan Log_Torque
model_final <- multinom(Failure.Type ~ Type + Air.temperature..K. + 
                        Process.temperature..K. + Rotational.speed..rpm. + 
                        Log_Torque + Tool.wear..min., data = df_pm)
## # weights:  54 (40 variable)
## initial  value 17917.594692 
## iter  10 value 2915.954527
## iter  20 value 2296.310647
## iter  30 value 1429.837113
## iter  40 value 1075.766345
## iter  50 value 868.208668
## iter  60 value 846.334052
## iter  70 value 838.201515
## iter  80 value 833.045876
## iter  90 value 831.207365
## iter 100 value 830.551558
## final  value 830.551558 
## stopped after 100 iterations
# Menghitung P-Value
summary_mnl <- summary(model_final)
z_stats <- summary_mnl$coefficients / summary_mnl$standard.errors
p_values <- (1 - pnorm(abs(z_stats), 0, 1)) * 2

# Menghitung Odds Ratio (OR)
OR <- exp(coef(model_final))

# ==========================================================
# 8. OUTPUT UNTUK LAPORAN
# ==========================================================
print("--- HASIL ODDS RATIO ---")
## [1] "--- HASIL ODDS RATIO ---"
round(OR, 4)
##                          (Intercept)   TypeL  TypeM Air.temperature..K.
## Heat Dissipation Failure      0.0000  1.3282 1.0615            503.8088
## Overstrain Failure            0.0000 78.0099 0.9511              0.7548
## Power Failure                 0.0000  3.2690 3.3032              1.1100
## Random Failures               0.0000  0.5040 0.1626              1.0089
## Tool Wear Failure             5.0446  0.6869 0.9427              1.1442
##                          Process.temperature..K. Rotational.speed..rpm.
## Heat Dissipation Failure                  0.0033                 0.9493
## Overstrain Failure                        1.1433                 0.9956
## Power Failure                             0.8373                 1.0355
## Random Failures                           1.1483                 0.9979
## Tool Wear Failure                         0.8503                 0.9967
##                            Log_Torque Tool.wear..min.
## Heat Dissipation Failure 1.800000e-02          0.9989
## Overstrain Failure       8.318628e+09          1.1528
## Power Failure            4.053522e+10          1.0016
## Random Failures          1.322000e+00          1.0035
## Tool Wear Failure        7.730000e-02          1.0968
print("--- HASIL P-VALUE ---")
## [1] "--- HASIL P-VALUE ---"
round(p_values, 4)
##                          (Intercept)  TypeL  TypeM Air.temperature..K.
## Heat Dissipation Failure           0 0.0930 0.7094              0.0000
## Overstrain Failure                 0 0.0000 0.8885              0.1264
## Power Failure                      0 0.0000 0.0000              0.4118
## Random Failures                    0 0.1053 0.0000              0.9697
## Tool Wear Failure                  0 0.0411 0.7291              0.3539
##                          Process.temperature..K. Rotational.speed..rpm.
## Heat Dissipation Failure                  0.0000                 0.0000
## Overstrain Failure                        0.4537                 0.2281
## Power Failure                             0.1488                 0.0000
## Random Failures                           0.5389                 0.2598
## Tool Wear Failure                         0.2502                 0.0003
##                          Log_Torque Tool.wear..min.
## Heat Dissipation Failure          0          0.6302
## Overstrain Failure                0          0.0000
## Power Failure                     0          0.4237
## Random Failures                   0          0.3560
## Tool Wear Failure                 0          0.0000
# Menghitung Pseudo R-Square
log_null <- multinom(Failure.Type ~ 1, data = df_pm)$Deviance
## # weights:  12 (5 variable)
## initial  value 17917.594692 
## iter  10 value 2637.138759
## iter  20 value 2033.611318
## iter  30 value 2022.831790
## iter  30 value 2022.831790
## iter  30 value 2022.831790
## final  value 2022.831790 
## converged
log_model <- model_final$Deviance
pseudo_r2 <- 1 - (log_model / log_null)
cat("McFadden Pseudo R-Square:", round(pseudo_r2, 4))
## McFadden Pseudo R-Square:
# 1. Bagi data menjadi 80% Latih, 20% Uji
# ==========================================================
# 9. PEMBAGIAN DATA & CEK AKURASI (DATA SPLITTING)
# ==========================================================

# Memuat paket yang diperlukan
if (!require("caret")) install.packages("caret")
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(caret)

# Membagi data: 80% Latih (Train), 20% Uji (Test)
set.seed(123) # Agar hasil pembagian konsisten jika dirunning ulang
train_index <- createDataPartition(df_pm$Failure.Type, p = 0.8, list = FALSE)
train_data  <- df_pm[train_index, ]
test_data   <- df_pm[-train_index, ]

# Latih ulang model menggunakan train_data saja
model_train <- nnet::multinom(Failure.Type ~ Type + Air.temperature..K. + 
                             Process.temperature..K. + Rotational.speed..rpm. + 
                             Log_Torque + Tool.wear..min., data = train_data, trace = FALSE)

# Melakukan prediksi pada "New Data" (test_data)
prediksi_uji <- predict(model_train, newdata = test_data)

# Membuat Confusion Matrix
cm <- confusionMatrix(prediksi_uji, test_data$Failure.Type)

# Menampilkan Hasil Akurasi
print(cm)
## Confusion Matrix and Statistics
## 
##                           Reference
## Prediction                 No Failure Heat Dissipation Failure
##   No Failure                     1927                       11
##   Heat Dissipation Failure          2                       10
##   Overstrain Failure                0                        1
##   Power Failure                     1                        0
##   Random Failures                   0                        0
##   Tool Wear Failure                 0                        0
##                           Reference
## Prediction                 Overstrain Failure Power Failure Random Failures
##   No Failure                                6             8               3
##   Heat Dissipation Failure                  0             2               0
##   Overstrain Failure                        9             3               0
##   Power Failure                             0             6               0
##   Random Failures                           0             0               0
##   Tool Wear Failure                         0             0               0
##                           Reference
## Prediction                 Tool Wear Failure
##   No Failure                               9
##   Heat Dissipation Failure                 0
##   Overstrain Failure                       0
##   Power Failure                            0
##   Random Failures                          0
##   Tool Wear Failure                        0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.977           
##                  95% CI : (0.9694, 0.9831)
##     No Information Rate : 0.966           
##     P-Value [Acc > NIR] : 0.002643        
##                                           
##                   Kappa : 0.5424          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: No Failure Class: Heat Dissipation Failure
## Sensitivity                     0.9984                        0.454545
## Specificity                     0.4559                        0.997976
## Pos Pred Value                  0.9812                        0.714286
## Neg Pred Value                  0.9118                        0.993952
## Prevalence                      0.9660                        0.011011
## Detection Rate                  0.9645                        0.005005
## Detection Prevalence            0.9830                        0.007007
## Balanced Accuracy               0.7272                        0.726261
##                      Class: Overstrain Failure Class: Power Failure
## Sensitivity                           0.600000             0.315789
## Specificity                           0.997983             0.999495
## Pos Pred Value                        0.692308             0.857143
## Neg Pred Value                        0.996977             0.993471
## Prevalence                            0.007508             0.009510
## Detection Rate                        0.004505             0.003003
## Detection Prevalence                  0.006507             0.003504
## Balanced Accuracy                     0.798991             0.657642
##                      Class: Random Failures Class: Tool Wear Failure
## Sensitivity                        0.000000                 0.000000
## Specificity                        1.000000                 1.000000
## Pos Pred Value                          NaN                      NaN
## Neg Pred Value                     0.998498                 0.995495
## Prevalence                         0.001502                 0.004505
## Detection Rate                     0.000000                 0.000000
## Detection Prevalence               0.000000                 0.000000
## Balanced Accuracy                  0.500000                 0.500000