- Import dan Eksplorasi Data
# 1. Import Data
data <- read_excel("C:/Analisis Data Multivariat/Zhongshan January Air Quality.xlsx")
# 2. Cek Struktur dan Ringkasan Data
str(data)
## tibble [5,844 × 26] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:5844], format: "2024-08-31" "2024-08-31" ...
## $ Time : POSIXct[1:5844], format: "1899-12-31 23:00:00" "1899-12-31 22:00:00" ...
## $ sitename : chr [1:5844] "Zhongshan" "Zhongshan" "Zhongshan" "Zhongshan" ...
## $ county : chr [1:5844] "Taipei City" "Taipei City" "Taipei City" "Taipei City" ...
## $ aqi : chr [1:5844] "64.0" "65.0" "67.0" "64.0" ...
## $ pollutant: chr [1:5844] "PM2.5" "PM2.5" "PM2.5" "PM2.5" ...
## $ status : chr [1:5844] "Moderate" "Moderate" "Moderate" "Moderate" ...
## $ so2 : chr [1:5844] "1.1" "1.2" "1.2" "1.4" ...
## $ co : chr [1:5844] "0.44" "0.38" "0.37" "0.37" ...
## $ o3 : chr [1:5844] "28.0" "37.7" "40.8" "43.5" ...
## $ o3_8hr : chr [1:5844] "44.2" "46.0" "47.5" "51.5" ...
## $ pm10 : chr [1:5844] "29.0" "31.0" "30.0" "30.0" ...
## $ pm2.5 : chr [1:5844] "20.0" "18.0" "26.0" "22.0" ...
## $ no2 : chr [1:5844] "19.5" "15.7" "15.7" "15.5" ...
## $ nox : chr [1:5844] "20.2" "16.5" "16.6" "16.5" ...
## $ no : chr [1:5844] "0.6" "0.7" "0.8" "0.9" ...
## $ windspeed: chr [1:5844] "1.5" "1" "2.1" "2.1" ...
## $ winddirec: num [1:5844] 260 80 73 84 82 71 78 85 88 85 ...
## $ unit : logi [1:5844] NA NA NA NA NA NA ...
## $ co_8hr : chr [1:5844] "0.3" "0.3" "0.3" "0.3" ...
## $ pm2.5_avg: chr [1:5844] "20.9" "21.0" "21.9" "20.6" ...
## $ pm10_avg : chr [1:5844] "30.0" "31.0" "31.0" "32.0" ...
## $ so2_avg : chr [1:5844] "1.0" "1.0" "1.0" "1.0" ...
## $ longitude: num [1:5844] 1.22e+08 1.22e+08 1.22e+08 1.22e+08 1.22e+08 ...
## $ latitude : num [1:5844] 25062361 25062361 25062361 25062361 25062361 ...
## $ siteid : chr [1:5844] "12.0" "12.0" "12.0" "12.0" ...
summary(data)
## date Time sitename
## Min. :2024-01-01 00:00:00 Min. :1899-12-31 00:00:00 Length:5844
## 1st Qu.:2024-03-02 00:00:00 1st Qu.:1899-12-31 05:00:00 Class :character
## Median :2024-05-01 00:00:00 Median :1899-12-31 12:00:00 Mode :character
## Mean :2024-05-01 12:18:43 Mean :1899-12-31 11:30:24
## 3rd Qu.:2024-07-02 00:00:00 3rd Qu.:1899-12-31 18:00:00
## Max. :2024-08-31 00:00:00 Max. :1899-12-31 23:00:00
##
## county aqi pollutant status
## Length:5844 Length:5844 Length:5844 Length:5844
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## so2 co o3 o3_8hr
## Length:5844 Length:5844 Length:5844 Length:5844
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## pm10 pm2.5 no2 nox
## Length:5844 Length:5844 Length:5844 Length:5844
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## no windspeed winddirec unit
## Length:5844 Length:5844 Min. : 0.0 Mode:logical
## Class :character Class :character 1st Qu.:105.0 NA's:5844
## Mode :character Mode :character Median :121.0
## Mean :157.9
## 3rd Qu.:208.5
## Max. :360.0
## NA's :17
## co_8hr pm2.5_avg pm10_avg so2_avg
## Length:5844 Length:5844 Length:5844 Length:5844
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## longitude latitude siteid
## Min. :121526528 Min. :25062361 Length:5844
## 1st Qu.:121526528 1st Qu.:25062361 Class :character
## Median :121526528 Median :25062361 Mode :character
## Mean :121526528 Mean :25062361
## 3rd Qu.:121526528 3rd Qu.:25062361
## Max. :121526528 Max. :25062361
##
Data Cleaning
# 3. Memilih variabel numerik & hapus kolom tidak relevan
target_vars <- c("so2", "co", "o3", "pm10", "pm2.5", "no2", "nox", "no", "windspeed", "winddirec", "aqi")
df_raw <- as.data.frame(data[, intersect(target_vars, colnames(data))])
# Paksa numerik (Variabel non-numerik otomatis jadi NA dan ditangani di step berikutnya)
df_raw[] <- lapply(df_raw, function(x) as.numeric(as.character(x)))
# 4. Cek Missing Value per Variabel
cat("--- Jumlah Missing Value per Variabel ---\n")
## --- Jumlah Missing Value per Variabel ---
print(colSums(is.na(df_raw)))
## so2 co o3 pm10 pm2.5 no2 nox no
## 106 65 75 58 67 96 96 96
## windspeed winddirec aqi
## 17 17 0
# 5. Cek Skewness (Kemiringan Data)
cat("\n--- Skewness Variabel ---\n")
##
## --- Skewness Variabel ---
print(apply(df_raw, 2, e1071::skewness, na.rm = TRUE))
## so2 co o3 pm10 pm2.5 no2 nox no
## 2.2795157 1.7820152 0.5524136 1.3726201 1.2899806 1.1129996 2.2160163 4.4884860
## windspeed winddirec aqi
## 0.5873836 0.7939939 1.0253725
# 6. Tangani Missing Value dengan Median Imputation
df_imputed <- df_raw
for(i in 1:ncol(df_imputed)){
df_imputed[is.na(df_imputed[,i]), i] <- median(df_imputed[,i], na.rm = TRUE)
}
# 7. Cek ulang missing value
sum(is.na(df_imputed))
## [1] 0
Analisis Skewness
apply(df_raw,2,e1071::skewness,na.rm=TRUE)
## so2 co o3 pm10 pm2.5 no2 nox no
## 2.2795157 1.7820152 0.5524136 1.3726201 1.2899806 1.1129996 2.2160163 4.4884860
## windspeed winddirec aqi
## 0.5873836 0.7939939 1.0253725
Analisis dan Penanganan Outlier
# 8. Visualisasi Outlier awal
boxplot(df_imputed,
main="Boxplot Sebelum Penanganan Outlier",
col="orange",
las=2)

# 9. Handle Outlier (Metode Winsorizing)
handle_outliers <- function(x){
qnt <- quantile(x, probs=c(.25,.75))
caps <- quantile(x, probs=c(.05,.95))
H <- 1.5 * IQR(x)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]
return(x)
}
# 10. Cek Outlier Setelah dihandle
df_clean <- as.data.frame(lapply(df_imputed, handle_outliers))
boxplot(df_clean,
main="Boxplot Setelah Penanganan Outlier",
col="lightgreen",
las=2)

Matriks Korelasi
## Karakteristik Data dan Standarisasi
# 2. Menghitung matriks korelasi dari data yang sudah dibersihkan (df_clean)
# Kita gunakan df_clean karena sudah melalui tahap penanganan outlier
cor_matrix <- cor(df_clean)
round(cor_matrix,2)
## so2 co o3 pm10 pm2.5 no2 nox no windspeed winddirec
## so2 1.00 0.22 0.22 0.32 0.34 0.25 0.23 0.13 0.12 -0.02
## co 0.22 1.00 -0.27 0.36 0.46 0.87 0.90 0.70 -0.34 0.17
## o3 0.22 -0.27 1.00 0.25 0.16 -0.25 -0.35 -0.42 0.51 -0.04
## pm10 0.32 0.36 0.25 1.00 0.83 0.35 0.30 0.14 0.00 0.05
## pm2.5 0.34 0.46 0.16 0.83 1.00 0.40 0.35 0.17 -0.12 0.07
## no2 0.25 0.87 -0.25 0.35 0.40 1.00 0.94 0.60 -0.30 0.13
## nox 0.23 0.90 -0.35 0.30 0.35 0.94 1.00 0.82 -0.30 0.16
## no 0.13 0.70 -0.42 0.14 0.17 0.60 0.82 1.00 -0.20 0.16
## windspeed 0.12 -0.34 0.51 0.00 -0.12 -0.30 -0.30 -0.20 1.00 -0.16
## winddirec -0.02 0.17 -0.04 0.05 0.07 0.13 0.16 0.16 -0.16 1.00
## aqi 0.30 0.48 0.19 0.76 0.84 0.45 0.38 0.16 -0.11 0.04
## aqi
## so2 0.30
## co 0.48
## o3 0.19
## pm10 0.76
## pm2.5 0.84
## no2 0.45
## nox 0.38
## no 0.16
## windspeed -0.11
## winddirec 0.04
## aqi 1.00
corrplot(cor_matrix,
method="color",
type="upper",
addCoef.col="black",
tl.col="black",
tl.srt=45,
diag=FALSE,
title="Matriks Korelasi Kualitas Udara Zhongshan",
mar=c(0,0,1,0))

# 11. Statistik Deskriptif (Karakteristik Data)
cat("Jumlah Observasi:", nrow(df_clean))
## Jumlah Observasi: 5844
cat("Jumlah Variabel:", ncol(df_clean))
## Jumlah Variabel: 11
stats_desc <- describe(df_clean)
kable(stats_desc, digits=2,
caption="Statistik Deskriptif") %>%
kable_styling(full_width=FALSE)
Statistik Deskriptif
|
|
vars
|
n
|
mean
|
sd
|
median
|
trimmed
|
mad
|
min
|
max
|
range
|
skew
|
kurtosis
|
se
|
|
so2
|
1
|
5844
|
1.01
|
0.50
|
0.90
|
0.97
|
0.44
|
0.00
|
2.20
|
2.20
|
0.56
|
-0.25
|
0.01
|
|
co
|
2
|
5844
|
0.44
|
0.18
|
0.41
|
0.42
|
0.16
|
0.07
|
0.88
|
0.81
|
0.64
|
-0.18
|
0.00
|
|
o3
|
3
|
5844
|
24.98
|
16.28
|
23.30
|
24.02
|
18.83
|
0.00
|
75.30
|
75.30
|
0.44
|
-0.62
|
0.21
|
|
pm10
|
4
|
5844
|
25.91
|
12.69
|
24.00
|
25.04
|
11.86
|
0.00
|
57.00
|
57.00
|
0.56
|
-0.27
|
0.17
|
|
pm2.5
|
5
|
5844
|
13.77
|
7.27
|
12.00
|
13.17
|
5.93
|
0.00
|
33.00
|
33.00
|
0.67
|
-0.15
|
0.10
|
|
no2
|
6
|
5844
|
17.15
|
8.42
|
15.70
|
16.48
|
8.30
|
0.40
|
39.90
|
39.50
|
0.62
|
-0.30
|
0.11
|
|
nox
|
7
|
5844
|
21.84
|
11.89
|
19.30
|
20.53
|
10.67
|
1.10
|
50.88
|
49.78
|
0.88
|
0.15
|
0.16
|
|
no
|
8
|
5844
|
4.60
|
4.68
|
2.90
|
3.64
|
2.67
|
0.00
|
17.20
|
17.20
|
1.63
|
1.80
|
0.06
|
|
windspeed
|
9
|
5844
|
1.61
|
0.78
|
1.50
|
1.57
|
0.74
|
0.20
|
3.70
|
3.50
|
0.40
|
-0.45
|
0.01
|
|
winddirec
|
10
|
5844
|
157.75
|
92.62
|
121.00
|
150.28
|
44.48
|
0.00
|
360.00
|
360.00
|
0.80
|
-0.28
|
1.21
|
|
aqi
|
11
|
5844
|
46.68
|
17.47
|
44.00
|
45.49
|
17.79
|
-1.00
|
93.00
|
94.00
|
0.53
|
-0.25
|
0.23
|
# 12. Standarisasi Data (Z-Score)
df_scaled <- as.data.frame(scale(df_clean))
Uji Kelayakan
# Gunakan df_clean dari tahap penanganan outlier
# Menghitung korelasi awal
cor_mat_initial <- cor(df_clean)
# Uji KMO & MSA Awal
res_kmo <- KMO(df_clean)
# Variabel data_final dibuat di sini untuk iterasi MSA
data_final <- df_clean
# Loop otomatis: Hapus variabel dengan MSA < 0.5 satu per satu
while(any(res_kmo$MSAi < 0.5)){
drop_var <- names(which.min(res_kmo$MSAi))
cat("Menghapus variabel:", drop_var,"\n")
data_final <- data_final[, !(names(data_final) %in% drop_var)]
res_kmo <- KMO(data_final)
}
kable(data.frame(MSA_Final=res_kmo$MSAi),
digits=3,
caption="Nilai MSA Final")
Nilai MSA Final
| so2 |
0.880 |
| co |
0.944 |
| o3 |
0.720 |
| pm10 |
0.823 |
| pm2.5 |
0.763 |
| no2 |
0.622 |
| nox |
0.626 |
| no |
0.538 |
| windspeed |
0.688 |
| winddirec |
0.726 |
| aqi |
0.855 |
# Uji Bartlett Final
cat("\n--- Hasil Uji Bartlett ---\n")
##
## --- Hasil Uji Bartlett ---
cortest.bartlett(cor(data_final), n=nrow(data_final))
## $chisq
## [1] 60658.13
##
## $p.value
## [1] 0
##
## $df
## [1] 55
PCA
# 1. Pastikan variabel 'no' dihapus karena MSA < 0.5
data_final <- df_clean[, !(names(df_clean) %in% c("no"))]
# 2. Hitung PCA (Simpan ke object pca_obj agar bisa dipanggil di chunk FA)
pca_obj <- principal(data_final,
nfactors=ncol(data_final),
rotate="none")
pca_obj
## Principal Components Analysis
## Call: principal(r = data_final, nfactors = ncol(data_final), rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 h2
## so2 0.39 0.38 0.48 0.21 -0.65 0.05 0.01 -0.01 -0.02 0.00 1
## co 0.87 -0.32 0.17 0.07 0.13 -0.07 0.04 0.23 -0.18 -0.05 1
## o3 -0.12 0.80 0.09 0.27 0.17 -0.48 -0.04 0.03 0.01 0.02 1
## pm10 0.70 0.54 -0.23 -0.11 0.05 0.14 -0.33 -0.08 -0.10 -0.01 1
## pm2.5 0.77 0.46 -0.27 -0.15 -0.04 0.11 0.04 0.20 0.20 0.01 1
## no2 0.85 -0.33 0.27 0.07 0.15 -0.10 -0.04 -0.15 0.14 -0.11 1
## nox 0.83 -0.41 0.28 0.09 0.15 0.00 -0.05 -0.04 0.03 0.16 1
## windspeed -0.32 0.58 0.47 0.25 0.40 0.35 0.07 0.01 0.00 -0.01 1
## winddirec 0.18 -0.18 -0.48 0.83 -0.04 0.09 0.02 -0.02 0.00 0.00 1
## aqi 0.77 0.44 -0.23 -0.16 0.05 -0.01 0.32 -0.16 -0.09 0.01 1
## u2 com
## so2 6.7e-16 3.5
## co 3.2e-15 1.7
## o3 8.9e-16 2.1
## pm10 7.8e-16 2.9
## pm2.5 6.7e-16 2.5
## no2 1.6e-15 1.8
## nox 1.8e-15 2.0
## windspeed 1.3e-15 4.7
## winddirec 1.1e-15 1.9
## aqi 3.3e-16 2.5
##
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## SS loadings 4.14 2.24 1.05 0.95 0.68 0.41 0.23 0.15 0.11 0.04
## Proportion Var 0.41 0.22 0.10 0.10 0.07 0.04 0.02 0.01 0.01 0.00
## Cumulative Var 0.41 0.64 0.74 0.84 0.91 0.95 0.97 0.98 1.00 1.00
## Proportion Explained 0.41 0.22 0.10 0.10 0.07 0.04 0.02 0.01 0.01 0.00
## Cumulative Proportion 0.41 0.64 0.74 0.84 0.91 0.95 0.97 0.98 1.00 1.00
##
## Mean item complexity = 2.6
## Test of the hypothesis that 10 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0
## with the empirical chi square 0 with prob < NA
##
## Fit based upon off diagonal values = 1
Variance PCA
eigenvalues <- pca_obj$values
prop_variance <- eigenvalues / sum(eigenvalues)
cum_variance <- cumsum(prop_variance)
data.frame(
Eigenvalue = eigenvalues,
Proporsi_Varian = prop_variance,
Kumulatif = cum_variance
)
## Eigenvalue Proporsi_Varian Kumulatif
## 1 4.13964392 0.413964392 0.4139644
## 2 2.23884327 0.223884327 0.6378487
## 3 1.04817328 0.104817328 0.7426660
## 4 0.95001813 0.095001813 0.8376679
## 5 0.68480484 0.068480484 0.9061483
## 6 0.40632750 0.040632750 0.9467811
## 7 0.22766755 0.022766755 0.9695478
## 8 0.14977979 0.014977979 0.9845258
## 9 0.11236067 0.011236067 0.9957619
## 10 0.04238105 0.004238105 1.0000000
Scree Plot
eigenvalues <- pca_obj$values
plot(eigenvalues,
type="b",
pch=19,
main="Scree Plot PCA",
xlab="Komponen Utama",
ylab="Eigenvalue")
abline(h=1, col="red", lty=2)

LOading PCA
# PCA tanpa rotasi
pca_obj <- principal(data_final, nfactors = ncol(data_final), rotate = "none")
# Tabel Loading PCA
pca_loadings <- as.data.frame(pca_obj$loadings[ , 1:3]) # misal tampilkan 3 komponen pertama
kable(round(pca_loadings,3),
caption="Tabel Loading PCA") %>%
kable_styling(full_width = FALSE)
Tabel Loading PCA
|
|
PC1
|
PC2
|
PC3
|
|
so2
|
0.393
|
0.377
|
0.480
|
|
co
|
0.866
|
-0.325
|
0.168
|
|
o3
|
-0.121
|
0.805
|
0.093
|
|
pm10
|
0.697
|
0.544
|
-0.234
|
|
pm2.5
|
0.769
|
0.463
|
-0.274
|
|
no2
|
0.851
|
-0.332
|
0.266
|
|
nox
|
0.830
|
-0.411
|
0.283
|
|
windspeed
|
-0.323
|
0.576
|
0.469
|
|
winddirec
|
0.178
|
-0.180
|
-0.479
|
|
aqi
|
0.772
|
0.436
|
-0.225
|
loadings_matrix <- as.data.frame(unclass(pca_obj$loadings))
kable(round(loadings_matrix[,1:3],3),
caption="Tabel Component Loading PCA")
Tabel Component Loading PCA
| so2 |
0.393 |
0.377 |
0.480 |
| co |
0.866 |
-0.325 |
0.168 |
| o3 |
-0.121 |
0.805 |
0.093 |
| pm10 |
0.697 |
0.544 |
-0.234 |
| pm2.5 |
0.769 |
0.463 |
-0.274 |
| no2 |
0.851 |
-0.332 |
0.266 |
| nox |
0.830 |
-0.411 |
0.283 |
| windspeed |
-0.323 |
0.576 |
0.469 |
| winddirec |
0.178 |
-0.180 |
-0.479 |
| aqi |
0.772 |
0.436 |
-0.225 |
Parallel Analysis
fa.parallel(data_final,
fa="fa",
main="Parallel Analysis")

## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
Menentukan Jumlah Faktor
n_factors_final <- sum(eigenvalues > 1)
n_factors_final
## [1] 3
Factor Analysis (Rotasi Varimax)
fa_res <- fa(data_final,
nfactors=n_factors_final,
rotate="varimax",
fm="minres")
fa_res
## Factor Analysis using method = minres
## Call: fa(r = data_final, nfactors = n_factors_final, rotate = "varimax",
## fm = "minres")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR2 MR1 MR3 h2 u2 com
## so2 0.31 0.24 0.26 0.221 0.77896 2.8
## co 0.31 0.83 -0.26 0.848 0.15235 1.5
## o3 0.28 -0.24 0.65 0.563 0.43737 1.7
## pm10 0.85 0.17 0.09 0.756 0.24375 1.1
## pm2.5 0.94 0.18 -0.06 0.922 0.07813 1.1
## no2 0.25 0.89 -0.18 0.897 0.10254 1.2
## nox 0.17 0.96 -0.21 0.999 0.00091 1.2
## windspeed -0.06 -0.15 0.75 0.589 0.41132 1.1
## winddirec 0.04 0.12 -0.14 0.035 0.96541 2.2
## aqi 0.85 0.25 -0.01 0.782 0.21813 1.2
##
## MR2 MR1 MR3
## SS loadings 2.69 2.69 1.23
## Proportion Var 0.27 0.27 0.12
## Cumulative Var 0.27 0.54 0.66
## Proportion Explained 0.41 0.41 0.19
## Cumulative Proportion 0.41 0.81 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 45 with the objective function = 7.78 with Chi Square = 45440.29
## df of the model are 18 and the objective function was 0.19
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 5844 with the empirical chi square 80.93 with prob < 5.9e-10
## The total n.obs was 5844 with Likelihood Chi Square = 1111.23 with prob < 1.1e-224
##
## Tucker Lewis Index of factoring reliability = 0.94
## RMSEA index = 0.102 and the 90 % confidence intervals are 0.097 0.107
## BIC = 955.12
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## MR2 MR1 MR3
## Correlation of (regression) scores with factors 0.97 0.99 0.84
## Multiple R square of scores with factors 0.94 0.98 0.70
## Minimum correlation of possible factor scores 0.89 0.96 0.40
Factor Loading
print(fa_res$loadings, cutoff=0.4)
##
## Loadings:
## MR2 MR1 MR3
## so2
## co 0.829
## o3 0.652
## pm10 0.848
## pm2.5 0.941
## no2 0.895
## nox 0.962
## windspeed 0.750
## winddirec
## aqi 0.849
##
## MR2 MR1 MR3
## SS loadings 2.692 2.686 1.233
## Proportion Var 0.269 0.269 0.123
## Cumulative Var 0.269 0.538 0.661
Communality
fa_res$communality
## so2 co o3 pm10 pm2.5 no2 nox
## 0.22104499 0.84764896 0.56263402 0.75625341 0.92186889 0.89745833 0.99908979
## windspeed winddirec aqi
## 0.58867824 0.03459078 0.78186572
Factor Scores
factor_scores <- factor.scores(data_final, fa_res)
head(factor_scores$scores)
## MR2 MR1 MR3
## [1,] 0.9439009 -0.3616013 -0.2331109
## [2,] 0.8416427 -0.5577845 -0.1841706
## [3,] 1.5688878 -0.5552198 0.5226293
## [4,] 1.1604722 -0.4364075 0.7930175
## [5,] 1.0996139 -0.1624342 1.0551315
## [6,] 1.4276382 -0.5355026 1.1435491
Diagram Struktur Faktor
fa.diagram(fa_res,
main="Diagram Struktur Faktor")
