Analisis pada modul ini bertujuan untuk mengeksplorasi data, menguji asumsi, serta menerapkan metode Principal Component Analysis (PCA) dan Factor Analysis (FA) untuk melihat hubungan antar variabel dan mereduksi dimensi data. Dataset yang digunakan berisi data kualitas udara dan indikator kesehatan yang kemudian dianalisis melalui tahap karakteristik data, standarisasi variabel, serta pembentukan komponen dan faktor utama.

Dataset dapat diakses melalui link berikut: https://www.kaggle.com/datasets/tfisthis/global-air-quality-and-respiratory-health-outcomes

1. Karakteristik Data

1.1 Import Library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)
## corrplot 0.95 loaded
library(tidyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

1.2 Import Dataset

getwd()
## [1] "C:/Users/lenovo loq/OneDrive/Dokumen/Modul 1 Analisis Multivariat"

1.3 Membaca Dataet

df <- read.csv("air_quality_health_dataset.csv")
head(df)
##          city       date aqi pm2_5 pm10  no2   o3 temperature humidity
## 1 Los Angeles 2020-01-01  65  34.0 52.7  2.2 38.5        33.5       33
## 2     Beijing 2020-01-02 137  33.7 31.5 36.7 27.5        -1.6       32
## 3      London 2020-01-03 266  43.0 59.6 30.4 57.3        36.4       25
## 4 Mexico City 2020-01-04 293  33.7 37.9 12.3 42.7        -1.0       67
## 5       Delhi 2020-01-05 493  50.3 34.8 31.2 35.6        33.5       72
## 6       Cairo 2020-01-06  28  67.2 44.9 41.9 47.8         7.9       89
##   hospital_admissions population_density hospital_capacity
## 1                   5              Rural              1337
## 2                   4              Urban              1545
## 3                  10           Suburban              1539
## 4                  10              Urban               552
## 5                   9           Suburban              1631
## 6                  11              Urban              1291

1.4 Struktur Data

print(paste("Jumlah baris dan kolom:", dim(df)[1], "baris", dim(df)[2], "kolom"))
## [1] "Jumlah baris dan kolom: 88489 baris 12 kolom"
str(df)
## 'data.frame':    88489 obs. of  12 variables:
##  $ city               : chr  "Los Angeles" "Beijing" "London" "Mexico City" ...
##  $ date               : chr  "2020-01-01" "2020-01-02" "2020-01-03" "2020-01-04" ...
##  $ aqi                : int  65 137 266 293 493 28 217 449 342 279 ...
##  $ pm2_5              : num  34 33.7 43 33.7 50.3 67.2 29 60.8 44.9 27.1 ...
##  $ pm10               : num  52.7 31.5 59.6 37.9 34.8 44.9 63.7 56.2 63.4 101 ...
##  $ no2                : num  2.2 36.7 30.4 12.3 31.2 41.9 22.3 40 31 47.8 ...
##  $ o3                 : num  38.5 27.5 57.3 42.7 35.6 47.8 56 18.1 34.9 42.2 ...
##  $ temperature        : num  33.5 -1.6 36.4 -1 33.5 7.9 27.7 26.3 28 -2.1 ...
##  $ humidity           : int  33 32 25 67 72 89 22 46 34 24 ...
##  $ hospital_admissions: int  5 4 10 10 9 11 8 14 7 8 ...
##  $ population_density : chr  "Rural" "Urban" "Suburban" "Urban" ...
##  $ hospital_capacity  : int  1337 1545 1539 552 1631 1291 1852 350 878 179 ...

1.5 Mengecek Missing Value

colSums(is.na(df))
##                city                date                 aqi               pm2_5 
##                   0                   0                   0                   0 
##                pm10                 no2                  o3         temperature 
##                   0                   0                   0                   0 
##            humidity hospital_admissions  population_density   hospital_capacity 
##                   0                   0                   0                   0

1.6 Mengambil Variabel Numerik

data_numerik <- df %>%
  select(where(is.numeric))
head(data_numerik)
##   aqi pm2_5 pm10  no2   o3 temperature humidity hospital_admissions
## 1  65  34.0 52.7  2.2 38.5        33.5       33                   5
## 2 137  33.7 31.5 36.7 27.5        -1.6       32                   4
## 3 266  43.0 59.6 30.4 57.3        36.4       25                  10
## 4 293  33.7 37.9 12.3 42.7        -1.0       67                  10
## 5 493  50.3 34.8 31.2 35.6        33.5       72                   9
## 6  28  67.2 44.9 41.9 47.8         7.9       89                  11
##   hospital_capacity
## 1              1337
## 2              1545
## 3              1539
## 4               552
## 5              1631
## 6              1291

1.7 Statistik Deskriptif

statistik <- data.frame(
  Mean = sapply(data_numerik, mean),
  Median = sapply(data_numerik, median),
  Std_Deviasi = sapply(data_numerik, sd),
  Minimum = sapply(data_numerik, min),
  Maksimum = sapply(data_numerik, max)
)

statistik
##                            Mean Median Std_Deviasi Minimum Maksimum
## aqi                  249.370182  249.0  144.479132       0    499.0
## pm2_5                 35.144951   35.1   14.767994       0    109.9
## pm10                  50.118654   50.0   19.796392       0    143.5
## no2                   30.006211   30.0    9.963139       0     71.4
## o3                    39.978895   40.0   12.007258       0     93.5
## temperature           17.522962   17.5   12.961024      -5     40.0
## humidity              56.950966   57.0   21.629675      20     94.0
## hospital_admissions    8.049385    8.0    3.715458       0     25.0
## hospital_capacity   1024.463165 1026.0  561.978071      50   1999.0

1.8 Histogram Distribusi Data

data_numerik %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 20, fill="skyblue") +
  facet_wrap(~name, scales="free") +
  theme_minimal()

1.9 Boxplot ( Deteksi Outlier)

data_numerik %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(x=name, y=value)) +
  geom_boxplot(fill="sky blue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=45, hjust=1))

1.10 Korelasi Antar Variabel (Heatmap)

cor_matrix <- cor(data_numerik)

cor_long <- melt(cor_matrix)

ggplot(cor_long, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "white") +
  
  geom_text(aes(label = round(value,4)), color = "black", size = 3) +
  
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = mean(cor_long$value)   
  ) +
  
  labs(
    title = "Heatmap Korelasi",
    x = "",
    y = ""
  ) +
  
  theme_minimal() +
  
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    panel.grid = element_blank()
  )

2. Uji Asumsi PCA dan FA

2.1 Import library

library(psych)
library(GPArotation)
## 
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

2.2 Data yang Digunakan untuk Analisis

X <- data_numerik
head(X)
##   aqi pm2_5 pm10  no2   o3 temperature humidity hospital_admissions
## 1  65  34.0 52.7  2.2 38.5        33.5       33                   5
## 2 137  33.7 31.5 36.7 27.5        -1.6       32                   4
## 3 266  43.0 59.6 30.4 57.3        36.4       25                  10
## 4 293  33.7 37.9 12.3 42.7        -1.0       67                  10
## 5 493  50.3 34.8 31.2 35.6        33.5       72                   9
## 6  28  67.2 44.9 41.9 47.8         7.9       89                  11
##   hospital_capacity
## 1              1337
## 2              1545
## 3              1539
## 4               552
## 5              1631
## 6              1291

2.3 Uji KMO

kmo_result <- KMO(X)

print("Nilai KMO tiap variabel:")
## [1] "Nilai KMO tiap variabel:"
kmo_result$MSAi
##                 aqi               pm2_5                pm10                 no2 
##           0.4802833           0.5000364           0.5011362           0.5092699 
##                  o3         temperature            humidity hospital_admissions 
##           0.5061713           0.5413704           0.4961910           0.5000348 
##   hospital_capacity 
##           0.5076825
print("Nilai KMO keseluruhan:")
## [1] "Nilai KMO keseluruhan:"
kmo_result$MSA
## [1] 0.5000559

2.4 Uji Bartlett

bartlett <- cortest.bartlett(cor(X), n = nrow(X))

print("Chi-Square:")
## [1] "Chi-Square:"
bartlett$chisq
## [1] 14826.19
print("P-Value:")
## [1] "P-Value:"
bartlett$p.value
## [1] 0

3. PCA (Principal Component Analysis)

3.1 Standarisasi Data

X_scaled <- scale(X)

head(X_scaled)
##             aqi       pm2_5       pm10        no2         o3 temperature
## [1,] -1.2761025 -0.07752924  0.1303948 -2.7909088 -0.1231667   1.2326987
## [2,] -0.7777606 -0.09784344 -0.9405075  0.6718555 -1.0392793  -1.4754206
## [3,]  0.1151019  0.53189681  0.4789431  0.0395246  1.4425529   1.4564465
## [4,]  0.3019801 -0.09784344 -0.6172162 -1.7771720  0.2266217  -1.4291280
## [5,]  1.6862630  1.02620904 -0.7738104  0.1198206 -0.3646873   1.2326987
## [6,] -1.5321949  2.17057573 -0.2636164  1.1937793  0.6513648  -0.7424538
##        humidity hospital_admissions hospital_capacity
## [1,] -1.1073197          -0.8207291         0.5561371
## [2,] -1.1535525          -1.0898749         0.9262583
## [3,] -1.4771820           0.5249999         0.9155817
## [4,]  0.4645948           0.5249999        -0.8407146
## [5,]  0.6957587           0.2558541         1.0792892
## [6,]  1.4817160           0.7941457         0.4742833

3.2 Menjalankan PCA

pca <- prcomp(X_scaled)

summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
## Standard deviation     1.1801 1.0081 1.0036 1.0016 1.0008 0.9971 0.9965 0.9921
## Proportion of Variance 0.1547 0.1129 0.1119 0.1115 0.1113 0.1105 0.1103 0.1094
## Cumulative Proportion  0.1547 0.2676 0.3796 0.4910 0.6023 0.7128 0.8231 0.9325
##                            PC9
## Standard deviation     0.77951
## Proportion of Variance 0.06752
## Cumulative Proportion  1.00000

3.3 Eigenvalue

eigenvalues <- pca$sdev^2

eigenvalues
## [1] 1.3925464 1.0162024 1.0073050 1.0032155 1.0016148 0.9942710 0.9929363
## [8] 0.9842733 0.6076352

3.4 Cumulative Variance

variance <- eigenvalues / sum(eigenvalues)

cumulative_variance <- cumsum(variance)

tabel_pca <- data.frame(
  Komponen = 1:length(cumulative_variance),
  Cumulative_Variance = cumulative_variance
)

tabel_pca
##   Komponen Cumulative_Variance
## 1        1           0.1547274
## 2        2           0.2676388
## 3        3           0.3795615
## 4        4           0.4910299
## 5        5           0.6023205
## 6        6           0.7127950
## 7        7           0.8231213
## 8        8           0.9324850
## 9        9           1.0000000

3.5 Scree Plot PCA

plot(
  1:length(cumulative_variance),
  cumulative_variance,
  type = "b",
  pch = 19,
  xlab = "Komponen",
  ylab = "Cumulative Variance",
  main = "Scree Plot PCA"
)

3.6 Menentukan Jumlah Komponen

jumlah_komponen <- which(cumulative_variance >= 0.80)[1]

print(paste("Jumlah komponen yang dipilih:", jumlah_komponen))
## [1] "Jumlah komponen yang dipilih: 7"

3.7 PCA Akhir

pca_final <- prcomp(X_scaled)

principal_components <- pca_final$x[,1:jumlah_komponen]

head(principal_components)
##             PC1         PC2        PC3         PC4        PC5          PC6
## [1,] -0.6954021 -1.61665706  1.3370414  0.75521907  0.8261413 -2.291876754
## [2,] -0.8022406 -0.01489562  0.2195584 -0.62384272  1.0909681  1.046552147
## [3,]  0.7113288 -0.31169351  1.7017844 -0.08430627 -1.8870353 -0.714702802
## [4,]  0.3027201 -1.44883858 -1.5020883 -0.40152444  0.9635191 -0.014527909
## [5,]  0.9077235  1.27058292  1.1816741 -1.24535265  0.2320035 -0.001604094
## [6,]  2.1220266  0.91583922 -0.4972262  1.74561880  0.0658516  1.601789856
##             PC7
## [1,] -1.1016165
## [2,]  0.3685704
## [3,] -0.5211250
## [4,] -1.0115142
## [5,] -0.6716781
## [6,] -0.4904387

3.8 Component Loading

loading <- pca_final$rotation[,1:jumlah_komponen]

loading
##                              PC1          PC2          PC3          PC4
## aqi                  0.004865307  0.183172917 -0.126481948 -0.840540645
## pm2_5                0.706959264 -0.005775048  0.005819455  0.003058129
## pm10                 0.001096697  0.317508817 -0.402155085  0.196548748
## no2                  0.011262232  0.485380750  0.100873320  0.030199038
## o3                  -0.007064914 -0.286569269 -0.033428957  0.195277022
## temperature         -0.017886512  0.101601670  0.742660763  0.222581578
## humidity             0.005506994  0.491605409 -0.361722057  0.382004086
## hospital_admissions  0.706821888 -0.013770877  0.011432129  0.008103238
## hospital_capacity    0.007716071  0.543780884  0.358368899 -0.142373528
##                               PC5          PC6          PC7
## aqi                 -0.2178026767 -0.072776063 -0.060630832
## pm2_5               -0.0058801857 -0.001781499 -0.002467229
## pm10                -0.4402926131 -0.661481505  0.190695037
## no2                 -0.3476120079  0.592858656  0.516077636
## o3                  -0.7370966792  0.260088498 -0.509013987
## temperature         -0.1569462203 -0.325557696  0.129484527
## humidity             0.2603002910  0.164440051 -0.300787834
## hospital_admissions -0.0002692731 -0.012298052  0.001143806
## hospital_capacity    0.0462144960 -0.069539132 -0.572084767

4. Factor Analysis (FA)

4.1 Menentukan Jumlah Faktor

jumlah_faktor <- 3

print(paste("Jumlah faktor yang digunakan:", jumlah_faktor))
## [1] "Jumlah faktor yang digunakan: 3"

4.2 Menjalankan Factor Analysis

fa <- factanal(X, factors = jumlah_faktor)

fa
## 
## Call:
## factanal(x = X, factors = jumlah_faktor)
## 
## Uniquenesses:
##                 aqi               pm2_5                pm10                 no2 
##               0.999               0.606               0.998               0.995 
##                  o3         temperature            humidity hospital_admissions 
##               0.999               0.886               0.993               0.608 
##   hospital_capacity 
##               0.993 
## 
## Loadings:
##                     Factor1 Factor2 Factor3
## aqi                                        
## pm2_5                0.627                 
## pm10                                       
## no2                                        
## o3                                         
## temperature                  0.338         
## humidity                                   
## hospital_admissions  0.626                 
## hospital_capacity                          
## 
##                Factor1 Factor2 Factor3
## SS loadings      0.784   0.115   0.022
## Proportion Var   0.087   0.013   0.002
## Cumulative Var   0.087   0.100   0.102
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 7.71 on 12 degrees of freedom.
## The p-value is 0.807

4.3 Eigenvalue Factor Analysis

ev <- eigen(cor(X))$values

ev
## [1] 1.3925464 1.0162024 1.0073050 1.0032155 1.0016148 0.9942710 0.9929363
## [8] 0.9842733 0.6076352

4.5 Scree Plot FA

plot(
  ev,
  type = "b",
  pch = 19,
  xlab = "Faktor",
  ylab = "Eigenvalue",
  main = "Scree Plot Factor Analysis"
)

4.5 Menjalankan Factor Analysis

X <- scale(data_numerik)

jumlah_faktor <- 7

fa_model <- fa(X, nfactors = jumlah_faktor, rotate = "varimax")

4.6 Membuat tabel Factor Loading

loading_faktor <- data.frame(
  fa_model$loadings[1:ncol(X),]
)

rownames(loading_faktor) <- colnames(X)

colnames(loading_faktor) <- paste("Faktor", 1:jumlah_faktor)

loading_faktor <- round(loading_faktor, 6)

loading_faktor
##                      Faktor 1  Faktor 2  Faktor 3  Faktor 4  Faktor 5  Faktor 6
## aqi                  0.000566 -0.018028  0.122903  0.016474  0.013256  0.008368
## pm2_5                0.626753 -0.029285  0.012379  0.003917  0.046726 -0.013920
## pm10                -0.000216 -0.009251  0.005748  0.001795  0.021763  0.099041
## no2                  0.003648  0.012170  0.011320  0.025300  0.066041  0.023008
## o3                  -0.001752  0.003020 -0.005651 -0.014374 -0.003760  0.006458
## temperature         -0.005940  0.113326 -0.024523  0.029701  0.019639 -0.007672
## humidity            -0.001257 -0.074864 -0.056563  0.056688  0.060468  0.047604
## hospital_admissions  0.628372  0.001427 -0.006847  0.005767 -0.022338  0.013852
## hospital_capacity    0.002082  0.029092  0.021075  0.107536  0.038501  0.001412
##                      Faktor 7
## aqi                 -0.009239
## pm2_5                0.008398
## pm10                 0.007737
## no2                 -0.002597
## o3                   0.093781
## temperature          0.000296
## humidity            -0.040142
## hospital_admissions -0.022014
## hospital_capacity   -0.024910

4.7 Variance Tiap Faktor

variance_matrix <- fa_model$Vaccounted

variance_faktor <- data.frame(variance_matrix)

colnames(variance_faktor) <- paste("Faktor", 1:ncol(variance_faktor))

variance_faktor <- round(variance_faktor, 6)

variance_faktor
##                       Faktor 1 Faktor 2 Faktor 3 Faktor 4 Faktor 5 Faktor 6
## SS loadings           0.787729 0.020721 0.019743 0.016830 0.013232 0.013163
## Proportion Var        0.087525 0.002302 0.002194 0.001870 0.001470 0.001463
## Cumulative Var        0.087525 0.089828 0.092022 0.093891 0.095362 0.096824
## Proportion Explained  0.891953 0.023463 0.022355 0.019056 0.014982 0.014904
## Cumulative Proportion 0.891953 0.915415 0.937771 0.956827 0.971809 0.986714
##                       Faktor 7
## SS loadings           0.011734
## Proportion Var        0.001304
## Cumulative Var        0.098128
## Proportion Explained  0.013286
## Cumulative Proportion 1.000000