1. Import dan Eksplorasi Data
# 1. Import Data
data <- read_excel("C:/Analisis Data Multivariat/Zhongshan January Air Quality.xlsx")

# 2. Cek Struktur dan Ringkasan Data
str(data)
## tibble [5,844 × 26] (S3: tbl_df/tbl/data.frame)
##  $ date     : POSIXct[1:5844], format: "2024-08-31" "2024-08-31" ...
##  $ Time     : POSIXct[1:5844], format: "1899-12-31 23:00:00" "1899-12-31 22:00:00" ...
##  $ sitename : chr [1:5844] "Zhongshan" "Zhongshan" "Zhongshan" "Zhongshan" ...
##  $ county   : chr [1:5844] "Taipei City" "Taipei City" "Taipei City" "Taipei City" ...
##  $ aqi      : chr [1:5844] "64.0" "65.0" "67.0" "64.0" ...
##  $ pollutant: chr [1:5844] "PM2.5" "PM2.5" "PM2.5" "PM2.5" ...
##  $ status   : chr [1:5844] "Moderate" "Moderate" "Moderate" "Moderate" ...
##  $ so2      : chr [1:5844] "1.1" "1.2" "1.2" "1.4" ...
##  $ co       : chr [1:5844] "0.44" "0.38" "0.37" "0.37" ...
##  $ o3       : chr [1:5844] "28.0" "37.7" "40.8" "43.5" ...
##  $ o3_8hr   : chr [1:5844] "44.2" "46.0" "47.5" "51.5" ...
##  $ pm10     : chr [1:5844] "29.0" "31.0" "30.0" "30.0" ...
##  $ pm2.5    : chr [1:5844] "20.0" "18.0" "26.0" "22.0" ...
##  $ no2      : chr [1:5844] "19.5" "15.7" "15.7" "15.5" ...
##  $ nox      : chr [1:5844] "20.2" "16.5" "16.6" "16.5" ...
##  $ no       : chr [1:5844] "0.6" "0.7" "0.8" "0.9" ...
##  $ windspeed: chr [1:5844] "1.5" "1" "2.1" "2.1" ...
##  $ winddirec: num [1:5844] 260 80 73 84 82 71 78 85 88 85 ...
##  $ unit     : logi [1:5844] NA NA NA NA NA NA ...
##  $ co_8hr   : chr [1:5844] "0.3" "0.3" "0.3" "0.3" ...
##  $ pm2.5_avg: chr [1:5844] "20.9" "21.0" "21.9" "20.6" ...
##  $ pm10_avg : chr [1:5844] "30.0" "31.0" "31.0" "32.0" ...
##  $ so2_avg  : chr [1:5844] "1.0" "1.0" "1.0" "1.0" ...
##  $ longitude: num [1:5844] 1.22e+08 1.22e+08 1.22e+08 1.22e+08 1.22e+08 ...
##  $ latitude : num [1:5844] 25062361 25062361 25062361 25062361 25062361 ...
##  $ siteid   : chr [1:5844] "12.0" "12.0" "12.0" "12.0" ...
summary(data)
##       date                          Time                       sitename        
##  Min.   :2024-01-01 00:00:00   Min.   :1899-12-31 00:00:00   Length:5844       
##  1st Qu.:2024-03-02 00:00:00   1st Qu.:1899-12-31 05:00:00   Class :character  
##  Median :2024-05-01 00:00:00   Median :1899-12-31 12:00:00   Mode  :character  
##  Mean   :2024-05-01 12:18:43   Mean   :1899-12-31 11:30:24                     
##  3rd Qu.:2024-07-02 00:00:00   3rd Qu.:1899-12-31 18:00:00                     
##  Max.   :2024-08-31 00:00:00   Max.   :1899-12-31 23:00:00                     
##                                                                                
##     county              aqi             pollutant            status         
##  Length:5844        Length:5844        Length:5844        Length:5844       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      so2                 co                 o3               o3_8hr         
##  Length:5844        Length:5844        Length:5844        Length:5844       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      pm10              pm2.5               no2                nox           
##  Length:5844        Length:5844        Length:5844        Length:5844       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##       no             windspeed           winddirec       unit        
##  Length:5844        Length:5844        Min.   :  0.0   Mode:logical  
##  Class :character   Class :character   1st Qu.:105.0   NA's:5844     
##  Mode  :character   Mode  :character   Median :121.0                 
##                                        Mean   :157.9                 
##                                        3rd Qu.:208.5                 
##                                        Max.   :360.0                 
##                                        NA's   :17                    
##     co_8hr           pm2.5_avg           pm10_avg           so2_avg         
##  Length:5844        Length:5844        Length:5844        Length:5844       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    longitude            latitude           siteid         
##  Min.   :121526528   Min.   :25062361   Length:5844       
##  1st Qu.:121526528   1st Qu.:25062361   Class :character  
##  Median :121526528   Median :25062361   Mode  :character  
##  Mean   :121526528   Mean   :25062361                     
##  3rd Qu.:121526528   3rd Qu.:25062361                     
##  Max.   :121526528   Max.   :25062361                     
## 

Data Cleaning

# 3. Memilih variabel numerik & hapus kolom tidak relevan
target_vars <- c("so2", "co", "o3", "pm10", "pm2.5", "no2", "nox", "no", "windspeed", "winddirec", "aqi")
df_raw <- as.data.frame(data[, intersect(target_vars, colnames(data))])

# Paksa numerik (Variabel non-numerik otomatis jadi NA dan ditangani di step berikutnya)
df_raw[] <- lapply(df_raw, function(x) as.numeric(as.character(x)))

# 4. Cek Missing Value per Variabel
cat("--- Jumlah Missing Value per Variabel ---\n")
## --- Jumlah Missing Value per Variabel ---
print(colSums(is.na(df_raw)))
##       so2        co        o3      pm10     pm2.5       no2       nox        no 
##       106        65        75        58        67        96        96        96 
## windspeed winddirec       aqi 
##        17        17         0
# 5. Cek Skewness (Kemiringan Data)
cat("\n--- Skewness Variabel ---\n")
## 
## --- Skewness Variabel ---
print(apply(df_raw, 2, e1071::skewness, na.rm = TRUE))
##       so2        co        o3      pm10     pm2.5       no2       nox        no 
## 2.2795157 1.7820152 0.5524136 1.3726201 1.2899806 1.1129996 2.2160163 4.4884860 
## windspeed winddirec       aqi 
## 0.5873836 0.7939939 1.0253725
# 6. Tangani Missing Value dengan Median Imputation
df_imputed <- df_raw
for(i in 1:ncol(df_imputed)){
  df_imputed[is.na(df_imputed[,i]), i] <- median(df_imputed[,i], na.rm = TRUE)
}

# 7. Cek ulang missing value
sum(is.na(df_imputed))
## [1] 0

Analisis Skewness

apply(df_raw,2,e1071::skewness,na.rm=TRUE)
##       so2        co        o3      pm10     pm2.5       no2       nox        no 
## 2.2795157 1.7820152 0.5524136 1.3726201 1.2899806 1.1129996 2.2160163 4.4884860 
## windspeed winddirec       aqi 
## 0.5873836 0.7939939 1.0253725

Analisis dan Penanganan Outlier

# 8. Visualisasi Outlier awal
boxplot(df_imputed,
        main="Boxplot Sebelum Penanganan Outlier",
        col="orange",
        las=2)

# 9. Handle Outlier (Metode Winsorizing)
handle_outliers <- function(x){
  
  qnt <- quantile(x, probs=c(.25,.75))
  caps <- quantile(x, probs=c(.05,.95))
  
  H <- 1.5 * IQR(x)
  
  x[x < (qnt[1] - H)] <- caps[1]
  x[x > (qnt[2] + H)] <- caps[2]
  
  return(x)
}

# 10. Cek Outlier Setelah dihandle
df_clean <- as.data.frame(lapply(df_imputed, handle_outliers))

boxplot(df_clean,
        main="Boxplot Setelah Penanganan Outlier",
        col="lightgreen",
        las=2)

Matriks Korelasi

## Karakteristik Data dan Standarisasi

# 2. Menghitung matriks korelasi dari data yang sudah dibersihkan (df_clean)
# Kita gunakan df_clean karena sudah melalui tahap penanganan outlier
cor_matrix <- cor(df_clean)

round(cor_matrix,2)
##             so2    co    o3 pm10 pm2.5   no2   nox    no windspeed winddirec
## so2        1.00  0.22  0.22 0.32  0.34  0.25  0.23  0.13      0.12     -0.02
## co         0.22  1.00 -0.27 0.36  0.46  0.87  0.90  0.70     -0.34      0.17
## o3         0.22 -0.27  1.00 0.25  0.16 -0.25 -0.35 -0.42      0.51     -0.04
## pm10       0.32  0.36  0.25 1.00  0.83  0.35  0.30  0.14      0.00      0.05
## pm2.5      0.34  0.46  0.16 0.83  1.00  0.40  0.35  0.17     -0.12      0.07
## no2        0.25  0.87 -0.25 0.35  0.40  1.00  0.94  0.60     -0.30      0.13
## nox        0.23  0.90 -0.35 0.30  0.35  0.94  1.00  0.82     -0.30      0.16
## no         0.13  0.70 -0.42 0.14  0.17  0.60  0.82  1.00     -0.20      0.16
## windspeed  0.12 -0.34  0.51 0.00 -0.12 -0.30 -0.30 -0.20      1.00     -0.16
## winddirec -0.02  0.17 -0.04 0.05  0.07  0.13  0.16  0.16     -0.16      1.00
## aqi        0.30  0.48  0.19 0.76  0.84  0.45  0.38  0.16     -0.11      0.04
##             aqi
## so2        0.30
## co         0.48
## o3         0.19
## pm10       0.76
## pm2.5      0.84
## no2        0.45
## nox        0.38
## no         0.16
## windspeed -0.11
## winddirec  0.04
## aqi        1.00
corrplot(cor_matrix,
         method="color",
         type="upper",
         addCoef.col="black",
         tl.col="black",
         tl.srt=45,
         diag=FALSE,
         title="Matriks Korelasi Kualitas Udara Zhongshan",
         mar=c(0,0,1,0))

# 11. Statistik Deskriptif (Karakteristik Data)
cat("Jumlah Observasi:", nrow(df_clean))
## Jumlah Observasi: 5844
cat("Jumlah Variabel:", ncol(df_clean))
## Jumlah Variabel: 11
stats_desc <- describe(df_clean)

kable(stats_desc, digits=2,
      caption="Statistik Deskriptif") %>%
  kable_styling(full_width=FALSE)
Statistik Deskriptif
vars n mean sd median trimmed mad min max range skew kurtosis se
so2 1 5844 1.01 0.50 0.90 0.97 0.44 0.00 2.20 2.20 0.56 -0.25 0.01
co 2 5844 0.44 0.18 0.41 0.42 0.16 0.07 0.88 0.81 0.64 -0.18 0.00
o3 3 5844 24.98 16.28 23.30 24.02 18.83 0.00 75.30 75.30 0.44 -0.62 0.21
pm10 4 5844 25.91 12.69 24.00 25.04 11.86 0.00 57.00 57.00 0.56 -0.27 0.17
pm2.5 5 5844 13.77 7.27 12.00 13.17 5.93 0.00 33.00 33.00 0.67 -0.15 0.10
no2 6 5844 17.15 8.42 15.70 16.48 8.30 0.40 39.90 39.50 0.62 -0.30 0.11
nox 7 5844 21.84 11.89 19.30 20.53 10.67 1.10 50.88 49.78 0.88 0.15 0.16
no 8 5844 4.60 4.68 2.90 3.64 2.67 0.00 17.20 17.20 1.63 1.80 0.06
windspeed 9 5844 1.61 0.78 1.50 1.57 0.74 0.20 3.70 3.50 0.40 -0.45 0.01
winddirec 10 5844 157.75 92.62 121.00 150.28 44.48 0.00 360.00 360.00 0.80 -0.28 1.21
aqi 11 5844 46.68 17.47 44.00 45.49 17.79 -1.00 93.00 94.00 0.53 -0.25 0.23
# 12. Standarisasi Data (Z-Score)
df_scaled <- as.data.frame(scale(df_clean))

Uji Kelayakan

# Gunakan df_clean dari tahap penanganan outlier
# Menghitung korelasi awal
cor_mat_initial <- cor(df_clean)

# Uji KMO & MSA Awal
res_kmo <- KMO(df_clean)

# Variabel data_final dibuat di sini untuk iterasi MSA
data_final <- df_clean

# Loop otomatis: Hapus variabel dengan MSA < 0.5 satu per satu
while(any(res_kmo$MSAi < 0.5)){
  
  drop_var <- names(which.min(res_kmo$MSAi))
  
  cat("Menghapus variabel:", drop_var,"\n")
  
  data_final <- data_final[, !(names(data_final) %in% drop_var)]
  
  res_kmo <- KMO(data_final)
}

kable(data.frame(MSA_Final=res_kmo$MSAi),
      digits=3,
      caption="Nilai MSA Final")
Nilai MSA Final
MSA_Final
so2 0.880
co 0.944
o3 0.720
pm10 0.823
pm2.5 0.763
no2 0.622
nox 0.626
no 0.538
windspeed 0.688
winddirec 0.726
aqi 0.855
# Uji Bartlett Final
cat("\n--- Hasil Uji Bartlett ---\n")
## 
## --- Hasil Uji Bartlett ---
cortest.bartlett(cor(data_final), n=nrow(data_final))
## $chisq
## [1] 60658.13
## 
## $p.value
## [1] 0
## 
## $df
## [1] 55

PCA

# 1. Pastikan variabel 'no' dihapus karena MSA < 0.5
data_final <- df_clean[, !(names(df_clean) %in% c("no"))]

# 2. Hitung PCA (Simpan ke object pca_obj agar bisa dipanggil di chunk FA)
pca_obj <- principal(data_final,
                     nfactors=ncol(data_final),
                     rotate="none")

pca_obj
## Principal Components Analysis
## Call: principal(r = data_final, nfactors = ncol(data_final), rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10 h2
## so2        0.39  0.38  0.48  0.21 -0.65  0.05  0.01 -0.01 -0.02  0.00  1
## co         0.87 -0.32  0.17  0.07  0.13 -0.07  0.04  0.23 -0.18 -0.05  1
## o3        -0.12  0.80  0.09  0.27  0.17 -0.48 -0.04  0.03  0.01  0.02  1
## pm10       0.70  0.54 -0.23 -0.11  0.05  0.14 -0.33 -0.08 -0.10 -0.01  1
## pm2.5      0.77  0.46 -0.27 -0.15 -0.04  0.11  0.04  0.20  0.20  0.01  1
## no2        0.85 -0.33  0.27  0.07  0.15 -0.10 -0.04 -0.15  0.14 -0.11  1
## nox        0.83 -0.41  0.28  0.09  0.15  0.00 -0.05 -0.04  0.03  0.16  1
## windspeed -0.32  0.58  0.47  0.25  0.40  0.35  0.07  0.01  0.00 -0.01  1
## winddirec  0.18 -0.18 -0.48  0.83 -0.04  0.09  0.02 -0.02  0.00  0.00  1
## aqi        0.77  0.44 -0.23 -0.16  0.05 -0.01  0.32 -0.16 -0.09  0.01  1
##                u2 com
## so2       6.7e-16 3.5
## co        3.2e-15 1.7
## o3        8.9e-16 2.1
## pm10      7.8e-16 2.9
## pm2.5     6.7e-16 2.5
## no2       1.6e-15 1.8
## nox       1.8e-15 2.0
## windspeed 1.3e-15 4.7
## winddirec 1.1e-15 1.9
## aqi       3.3e-16 2.5
## 
##                        PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10
## SS loadings           4.14 2.24 1.05 0.95 0.68 0.41 0.23 0.15 0.11 0.04
## Proportion Var        0.41 0.22 0.10 0.10 0.07 0.04 0.02 0.01 0.01 0.00
## Cumulative Var        0.41 0.64 0.74 0.84 0.91 0.95 0.97 0.98 1.00 1.00
## Proportion Explained  0.41 0.22 0.10 0.10 0.07 0.04 0.02 0.01 0.01 0.00
## Cumulative Proportion 0.41 0.64 0.74 0.84 0.91 0.95 0.97 0.98 1.00 1.00
## 
## Mean item complexity =  2.6
## Test of the hypothesis that 10 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0 
##  with the empirical chi square  0  with prob <  NA 
## 
## Fit based upon off diagonal values = 1

Variance PCA

eigenvalues <- pca_obj$values

prop_variance <- eigenvalues / sum(eigenvalues)

cum_variance <- cumsum(prop_variance)

data.frame(
  Eigenvalue = eigenvalues,
  Proporsi_Varian = prop_variance,
  Kumulatif = cum_variance
)
##    Eigenvalue Proporsi_Varian Kumulatif
## 1  4.13964392     0.413964392 0.4139644
## 2  2.23884327     0.223884327 0.6378487
## 3  1.04817328     0.104817328 0.7426660
## 4  0.95001813     0.095001813 0.8376679
## 5  0.68480484     0.068480484 0.9061483
## 6  0.40632750     0.040632750 0.9467811
## 7  0.22766755     0.022766755 0.9695478
## 8  0.14977979     0.014977979 0.9845258
## 9  0.11236067     0.011236067 0.9957619
## 10 0.04238105     0.004238105 1.0000000

Scree Plot

eigenvalues <- pca_obj$values

plot(eigenvalues,
     type="b",
     pch=19,
     main="Scree Plot PCA",
     xlab="Komponen Utama",
     ylab="Eigenvalue")

abline(h=1, col="red", lty=2)

LOading PCA

# PCA tanpa rotasi
pca_obj <- principal(data_final, nfactors = ncol(data_final), rotate = "none")

# Tabel Loading PCA
pca_loadings <- as.data.frame(pca_obj$loadings[ , 1:3])  # misal tampilkan 3 komponen pertama

kable(round(pca_loadings,3),
      caption="Tabel Loading PCA") %>%
  kable_styling(full_width = FALSE)
Tabel Loading PCA
PC1 PC2 PC3
so2 0.393 0.377 0.480
co 0.866 -0.325 0.168
o3 -0.121 0.805 0.093
pm10 0.697 0.544 -0.234
pm2.5 0.769 0.463 -0.274
no2 0.851 -0.332 0.266
nox 0.830 -0.411 0.283
windspeed -0.323 0.576 0.469
winddirec 0.178 -0.180 -0.479
aqi 0.772 0.436 -0.225
loadings_matrix <- as.data.frame(unclass(pca_obj$loadings))

kable(round(loadings_matrix[,1:3],3),
      caption="Tabel Component Loading PCA")
Tabel Component Loading PCA
PC1 PC2 PC3
so2 0.393 0.377 0.480
co 0.866 -0.325 0.168
o3 -0.121 0.805 0.093
pm10 0.697 0.544 -0.234
pm2.5 0.769 0.463 -0.274
no2 0.851 -0.332 0.266
nox 0.830 -0.411 0.283
windspeed -0.323 0.576 0.469
winddirec 0.178 -0.180 -0.479
aqi 0.772 0.436 -0.225

Parallel Analysis

fa.parallel(data_final,
            fa="fa",                                                               
            main="Parallel Analysis")

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

Menentukan Jumlah Faktor

n_factors_final <- sum(eigenvalues > 1)

n_factors_final
## [1] 3

Factor Analysis (Rotasi Varimax)

fa_res <- fa(data_final,
             nfactors=n_factors_final,
             rotate="varimax",
             fm="minres")

fa_res
## Factor Analysis using method =  minres
## Call: fa(r = data_final, nfactors = n_factors_final, rotate = "varimax", 
##     fm = "minres")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             MR2   MR1   MR3    h2      u2 com
## so2        0.31  0.24  0.26 0.221 0.77896 2.8
## co         0.31  0.83 -0.26 0.848 0.15235 1.5
## o3         0.28 -0.24  0.65 0.563 0.43737 1.7
## pm10       0.85  0.17  0.09 0.756 0.24375 1.1
## pm2.5      0.94  0.18 -0.06 0.922 0.07813 1.1
## no2        0.25  0.89 -0.18 0.897 0.10254 1.2
## nox        0.17  0.96 -0.21 0.999 0.00091 1.2
## windspeed -0.06 -0.15  0.75 0.589 0.41132 1.1
## winddirec  0.04  0.12 -0.14 0.035 0.96541 2.2
## aqi        0.85  0.25 -0.01 0.782 0.21813 1.2
## 
##                        MR2  MR1  MR3
## SS loadings           2.69 2.69 1.23
## Proportion Var        0.27 0.27 0.12
## Cumulative Var        0.27 0.54 0.66
## Proportion Explained  0.41 0.41 0.19
## Cumulative Proportion 0.41 0.81 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 3 factors are sufficient.
## 
## df null model =  45  with the objective function =  7.78 with Chi Square =  45440.29
## df of  the model are 18  and the objective function was  0.19 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  5844 with the empirical chi square  80.93  with prob <  5.9e-10 
## The total n.obs was  5844  with Likelihood Chi Square =  1111.23  with prob <  1.1e-224 
## 
## Tucker Lewis Index of factoring reliability =  0.94
## RMSEA index =  0.102  and the 90 % confidence intervals are  0.097 0.107
## BIC =  955.12
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR2  MR1  MR3
## Correlation of (regression) scores with factors   0.97 0.99 0.84
## Multiple R square of scores with factors          0.94 0.98 0.70
## Minimum correlation of possible factor scores     0.89 0.96 0.40

Factor Loading

print(fa_res$loadings, cutoff=0.4)
## 
## Loadings:
##           MR2    MR1    MR3   
## so2                           
## co                0.829       
## o3                       0.652
## pm10       0.848              
## pm2.5      0.941              
## no2               0.895       
## nox               0.962       
## windspeed                0.750
## winddirec                     
## aqi        0.849              
## 
##                  MR2   MR1   MR3
## SS loadings    2.692 2.686 1.233
## Proportion Var 0.269 0.269 0.123
## Cumulative Var 0.269 0.538 0.661

Communality

fa_res$communality
##        so2         co         o3       pm10      pm2.5        no2        nox 
## 0.22104499 0.84764896 0.56263402 0.75625341 0.92186889 0.89745833 0.99908979 
##  windspeed  winddirec        aqi 
## 0.58867824 0.03459078 0.78186572

Factor Scores

factor_scores <- factor.scores(data_final, fa_res)

head(factor_scores$scores)
##            MR2        MR1        MR3
## [1,] 0.9439009 -0.3616013 -0.2331109
## [2,] 0.8416427 -0.5577845 -0.1841706
## [3,] 1.5688878 -0.5552198  0.5226293
## [4,] 1.1604722 -0.4364075  0.7930175
## [5,] 1.0996139 -0.1624342  1.0551315
## [6,] 1.4276382 -0.5355026  1.1435491

Diagram Struktur Faktor

fa.diagram(fa_res,
           main="Diagram Struktur Faktor")