Analisis pada modul ini bertujuan untuk mengeksplorasi data, menguji asumsi, serta menerapkan metode Principal Component Analysis (PCA) dan Factor Analysis (FA) untuk melihat hubungan antar variabel dan mereduksi dimensi data. Dataset yang digunakan berisi data kualitas udara dan indikator kesehatan yang kemudian dianalisis melalui tahap karakteristik data, standarisasi variabel, serta pembentukan komponen dan faktor utama.
Dataset dapat diakses melalui link berikut: https://www.kaggle.com/datasets/tfisthis/global-air-quality-and-respiratory-health-outcomes
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## corrplot 0.95 loaded
library(tidyr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
getwd()
## [1] "C:/Users/lenovo loq/OneDrive/Dokumen/Modul 1 Analisis Multivariat"
df <- read.csv("air_quality_health_dataset.csv")
head(df)
## city date aqi pm2_5 pm10 no2 o3 temperature humidity
## 1 Los Angeles 2020-01-01 65 34.0 52.7 2.2 38.5 33.5 33
## 2 Beijing 2020-01-02 137 33.7 31.5 36.7 27.5 -1.6 32
## 3 London 2020-01-03 266 43.0 59.6 30.4 57.3 36.4 25
## 4 Mexico City 2020-01-04 293 33.7 37.9 12.3 42.7 -1.0 67
## 5 Delhi 2020-01-05 493 50.3 34.8 31.2 35.6 33.5 72
## 6 Cairo 2020-01-06 28 67.2 44.9 41.9 47.8 7.9 89
## hospital_admissions population_density hospital_capacity
## 1 5 Rural 1337
## 2 4 Urban 1545
## 3 10 Suburban 1539
## 4 10 Urban 552
## 5 9 Suburban 1631
## 6 11 Urban 1291
print(paste("Jumlah baris dan kolom:", dim(df)[1], "baris", dim(df)[2], "kolom"))
## [1] "Jumlah baris dan kolom: 88489 baris 12 kolom"
str(df)
## 'data.frame': 88489 obs. of 12 variables:
## $ city : chr "Los Angeles" "Beijing" "London" "Mexico City" ...
## $ date : chr "2020-01-01" "2020-01-02" "2020-01-03" "2020-01-04" ...
## $ aqi : int 65 137 266 293 493 28 217 449 342 279 ...
## $ pm2_5 : num 34 33.7 43 33.7 50.3 67.2 29 60.8 44.9 27.1 ...
## $ pm10 : num 52.7 31.5 59.6 37.9 34.8 44.9 63.7 56.2 63.4 101 ...
## $ no2 : num 2.2 36.7 30.4 12.3 31.2 41.9 22.3 40 31 47.8 ...
## $ o3 : num 38.5 27.5 57.3 42.7 35.6 47.8 56 18.1 34.9 42.2 ...
## $ temperature : num 33.5 -1.6 36.4 -1 33.5 7.9 27.7 26.3 28 -2.1 ...
## $ humidity : int 33 32 25 67 72 89 22 46 34 24 ...
## $ hospital_admissions: int 5 4 10 10 9 11 8 14 7 8 ...
## $ population_density : chr "Rural" "Urban" "Suburban" "Urban" ...
## $ hospital_capacity : int 1337 1545 1539 552 1631 1291 1852 350 878 179 ...
colSums(is.na(df))
## city date aqi pm2_5
## 0 0 0 0
## pm10 no2 o3 temperature
## 0 0 0 0
## humidity hospital_admissions population_density hospital_capacity
## 0 0 0 0
data_numerik <- df %>%
select(where(is.numeric))
head(data_numerik)
## aqi pm2_5 pm10 no2 o3 temperature humidity hospital_admissions
## 1 65 34.0 52.7 2.2 38.5 33.5 33 5
## 2 137 33.7 31.5 36.7 27.5 -1.6 32 4
## 3 266 43.0 59.6 30.4 57.3 36.4 25 10
## 4 293 33.7 37.9 12.3 42.7 -1.0 67 10
## 5 493 50.3 34.8 31.2 35.6 33.5 72 9
## 6 28 67.2 44.9 41.9 47.8 7.9 89 11
## hospital_capacity
## 1 1337
## 2 1545
## 3 1539
## 4 552
## 5 1631
## 6 1291
statistik <- data.frame(
Mean = sapply(data_numerik, mean),
Median = sapply(data_numerik, median),
Std_Deviasi = sapply(data_numerik, sd),
Minimum = sapply(data_numerik, min),
Maksimum = sapply(data_numerik, max)
)
statistik
## Mean Median Std_Deviasi Minimum Maksimum
## aqi 249.370182 249.0 144.479132 0 499.0
## pm2_5 35.144951 35.1 14.767994 0 109.9
## pm10 50.118654 50.0 19.796392 0 143.5
## no2 30.006211 30.0 9.963139 0 71.4
## o3 39.978895 40.0 12.007258 0 93.5
## temperature 17.522962 17.5 12.961024 -5 40.0
## humidity 56.950966 57.0 21.629675 20 94.0
## hospital_admissions 8.049385 8.0 3.715458 0 25.0
## hospital_capacity 1024.463165 1026.0 561.978071 50 1999.0
data_numerik %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(value)) +
geom_histogram(bins = 20, fill="skyblue") +
facet_wrap(~name, scales="free") +
theme_minimal()
data_numerik %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x=name, y=value)) +
geom_boxplot(fill="sky blue") +
theme_minimal() +
theme(axis.text.x = element_text(angle=45, hjust=1))
cor_matrix <- cor(data_numerik)
cor_long <- melt(cor_matrix)
ggplot(cor_long, aes(x = Var2, y = Var1, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value,4)), color = "black", size = 3) +
scale_fill_gradient2(
low = "blue",
mid = "white",
high = "red",
midpoint = mean(cor_long$value)
) +
labs(
title = "Heatmap Korelasi",
x = "",
y = ""
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
panel.grid = element_blank()
)
library(psych)
library(GPArotation)
##
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
##
## equamax, varimin
X <- data_numerik
head(X)
## aqi pm2_5 pm10 no2 o3 temperature humidity hospital_admissions
## 1 65 34.0 52.7 2.2 38.5 33.5 33 5
## 2 137 33.7 31.5 36.7 27.5 -1.6 32 4
## 3 266 43.0 59.6 30.4 57.3 36.4 25 10
## 4 293 33.7 37.9 12.3 42.7 -1.0 67 10
## 5 493 50.3 34.8 31.2 35.6 33.5 72 9
## 6 28 67.2 44.9 41.9 47.8 7.9 89 11
## hospital_capacity
## 1 1337
## 2 1545
## 3 1539
## 4 552
## 5 1631
## 6 1291
kmo_result <- KMO(X)
print("Nilai KMO tiap variabel:")
## [1] "Nilai KMO tiap variabel:"
kmo_result$MSAi
## aqi pm2_5 pm10 no2
## 0.4802833 0.5000364 0.5011362 0.5092699
## o3 temperature humidity hospital_admissions
## 0.5061713 0.5413704 0.4961910 0.5000348
## hospital_capacity
## 0.5076825
print("Nilai KMO keseluruhan:")
## [1] "Nilai KMO keseluruhan:"
kmo_result$MSA
## [1] 0.5000559
bartlett <- cortest.bartlett(cor(X), n = nrow(X))
print("Chi-Square:")
## [1] "Chi-Square:"
bartlett$chisq
## [1] 14826.19
print("P-Value:")
## [1] "P-Value:"
bartlett$p.value
## [1] 0
X_scaled <- scale(X)
head(X_scaled)
## aqi pm2_5 pm10 no2 o3 temperature
## [1,] -1.2761025 -0.07752924 0.1303948 -2.7909088 -0.1231667 1.2326987
## [2,] -0.7777606 -0.09784344 -0.9405075 0.6718555 -1.0392793 -1.4754206
## [3,] 0.1151019 0.53189681 0.4789431 0.0395246 1.4425529 1.4564465
## [4,] 0.3019801 -0.09784344 -0.6172162 -1.7771720 0.2266217 -1.4291280
## [5,] 1.6862630 1.02620904 -0.7738104 0.1198206 -0.3646873 1.2326987
## [6,] -1.5321949 2.17057573 -0.2636164 1.1937793 0.6513648 -0.7424538
## humidity hospital_admissions hospital_capacity
## [1,] -1.1073197 -0.8207291 0.5561371
## [2,] -1.1535525 -1.0898749 0.9262583
## [3,] -1.4771820 0.5249999 0.9155817
## [4,] 0.4645948 0.5249999 -0.8407146
## [5,] 0.6957587 0.2558541 1.0792892
## [6,] 1.4817160 0.7941457 0.4742833
pca <- prcomp(X_scaled)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 1.1801 1.0081 1.0036 1.0016 1.0008 0.9971 0.9965 0.9921
## Proportion of Variance 0.1547 0.1129 0.1119 0.1115 0.1113 0.1105 0.1103 0.1094
## Cumulative Proportion 0.1547 0.2676 0.3796 0.4910 0.6023 0.7128 0.8231 0.9325
## PC9
## Standard deviation 0.77951
## Proportion of Variance 0.06752
## Cumulative Proportion 1.00000
eigenvalues <- pca$sdev^2
eigenvalues
## [1] 1.3925464 1.0162024 1.0073050 1.0032155 1.0016148 0.9942710 0.9929363
## [8] 0.9842733 0.6076352
variance <- eigenvalues / sum(eigenvalues)
cumulative_variance <- cumsum(variance)
tabel_pca <- data.frame(
Komponen = 1:length(cumulative_variance),
Cumulative_Variance = cumulative_variance
)
tabel_pca
## Komponen Cumulative_Variance
## 1 1 0.1547274
## 2 2 0.2676388
## 3 3 0.3795615
## 4 4 0.4910299
## 5 5 0.6023205
## 6 6 0.7127950
## 7 7 0.8231213
## 8 8 0.9324850
## 9 9 1.0000000
plot(
1:length(cumulative_variance),
cumulative_variance,
type = "b",
pch = 19,
xlab = "Komponen",
ylab = "Cumulative Variance",
main = "Scree Plot PCA"
)
jumlah_komponen <- which(cumulative_variance >= 0.80)[1]
print(paste("Jumlah komponen yang dipilih:", jumlah_komponen))
## [1] "Jumlah komponen yang dipilih: 7"
pca_final <- prcomp(X_scaled)
principal_components <- pca_final$x[,1:jumlah_komponen]
head(principal_components)
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] -0.6954021 -1.61665706 1.3370414 0.75521907 0.8261413 -2.291876754
## [2,] -0.8022406 -0.01489562 0.2195584 -0.62384272 1.0909681 1.046552147
## [3,] 0.7113288 -0.31169351 1.7017844 -0.08430627 -1.8870353 -0.714702802
## [4,] 0.3027201 -1.44883858 -1.5020883 -0.40152444 0.9635191 -0.014527909
## [5,] 0.9077235 1.27058292 1.1816741 -1.24535265 0.2320035 -0.001604094
## [6,] 2.1220266 0.91583922 -0.4972262 1.74561880 0.0658516 1.601789856
## PC7
## [1,] -1.1016165
## [2,] 0.3685704
## [3,] -0.5211250
## [4,] -1.0115142
## [5,] -0.6716781
## [6,] -0.4904387
loading <- pca_final$rotation[,1:jumlah_komponen]
loading
## PC1 PC2 PC3 PC4
## aqi 0.004865307 0.183172917 -0.126481948 -0.840540645
## pm2_5 0.706959264 -0.005775048 0.005819455 0.003058129
## pm10 0.001096697 0.317508817 -0.402155085 0.196548748
## no2 0.011262232 0.485380750 0.100873320 0.030199038
## o3 -0.007064914 -0.286569269 -0.033428957 0.195277022
## temperature -0.017886512 0.101601670 0.742660763 0.222581578
## humidity 0.005506994 0.491605409 -0.361722057 0.382004086
## hospital_admissions 0.706821888 -0.013770877 0.011432129 0.008103238
## hospital_capacity 0.007716071 0.543780884 0.358368899 -0.142373528
## PC5 PC6 PC7
## aqi -0.2178026767 -0.072776063 -0.060630832
## pm2_5 -0.0058801857 -0.001781499 -0.002467229
## pm10 -0.4402926131 -0.661481505 0.190695037
## no2 -0.3476120079 0.592858656 0.516077636
## o3 -0.7370966792 0.260088498 -0.509013987
## temperature -0.1569462203 -0.325557696 0.129484527
## humidity 0.2603002910 0.164440051 -0.300787834
## hospital_admissions -0.0002692731 -0.012298052 0.001143806
## hospital_capacity 0.0462144960 -0.069539132 -0.572084767
jumlah_faktor <- 3
print(paste("Jumlah faktor yang digunakan:", jumlah_faktor))
## [1] "Jumlah faktor yang digunakan: 3"
fa <- factanal(X, factors = jumlah_faktor)
fa
##
## Call:
## factanal(x = X, factors = jumlah_faktor)
##
## Uniquenesses:
## aqi pm2_5 pm10 no2
## 0.999 0.606 0.998 0.995
## o3 temperature humidity hospital_admissions
## 0.999 0.886 0.993 0.608
## hospital_capacity
## 0.993
##
## Loadings:
## Factor1 Factor2 Factor3
## aqi
## pm2_5 0.627
## pm10
## no2
## o3
## temperature 0.338
## humidity
## hospital_admissions 0.626
## hospital_capacity
##
## Factor1 Factor2 Factor3
## SS loadings 0.784 0.115 0.022
## Proportion Var 0.087 0.013 0.002
## Cumulative Var 0.087 0.100 0.102
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 7.71 on 12 degrees of freedom.
## The p-value is 0.807
ev <- eigen(cor(X))$values
ev
## [1] 1.3925464 1.0162024 1.0073050 1.0032155 1.0016148 0.9942710 0.9929363
## [8] 0.9842733 0.6076352
plot(
ev,
type = "b",
pch = 19,
xlab = "Faktor",
ylab = "Eigenvalue",
main = "Scree Plot Factor Analysis"
)
X <- scale(data_numerik)
jumlah_faktor <- 7
fa_model <- fa(X, nfactors = jumlah_faktor, rotate = "varimax")
loading_faktor <- data.frame(
fa_model$loadings[1:ncol(X),]
)
rownames(loading_faktor) <- colnames(X)
colnames(loading_faktor) <- paste("Faktor", 1:jumlah_faktor)
loading_faktor <- round(loading_faktor, 6)
loading_faktor
## Faktor 1 Faktor 2 Faktor 3 Faktor 4 Faktor 5 Faktor 6
## aqi 0.000566 -0.018028 0.122903 0.016474 0.013256 0.008368
## pm2_5 0.626753 -0.029285 0.012379 0.003917 0.046726 -0.013920
## pm10 -0.000216 -0.009251 0.005748 0.001795 0.021763 0.099041
## no2 0.003648 0.012170 0.011320 0.025300 0.066041 0.023008
## o3 -0.001752 0.003020 -0.005651 -0.014374 -0.003760 0.006458
## temperature -0.005940 0.113326 -0.024523 0.029701 0.019639 -0.007672
## humidity -0.001257 -0.074864 -0.056563 0.056688 0.060468 0.047604
## hospital_admissions 0.628372 0.001427 -0.006847 0.005767 -0.022338 0.013852
## hospital_capacity 0.002082 0.029092 0.021075 0.107536 0.038501 0.001412
## Faktor 7
## aqi -0.009239
## pm2_5 0.008398
## pm10 0.007737
## no2 -0.002597
## o3 0.093781
## temperature 0.000296
## humidity -0.040142
## hospital_admissions -0.022014
## hospital_capacity -0.024910
variance_matrix <- fa_model$Vaccounted
variance_faktor <- data.frame(variance_matrix)
colnames(variance_faktor) <- paste("Faktor", 1:ncol(variance_faktor))
variance_faktor <- round(variance_faktor, 6)
variance_faktor
## Faktor 1 Faktor 2 Faktor 3 Faktor 4 Faktor 5 Faktor 6
## SS loadings 0.787729 0.020721 0.019743 0.016830 0.013232 0.013163
## Proportion Var 0.087525 0.002302 0.002194 0.001870 0.001470 0.001463
## Cumulative Var 0.087525 0.089828 0.092022 0.093891 0.095362 0.096824
## Proportion Explained 0.891953 0.023463 0.022355 0.019056 0.014982 0.014904
## Cumulative Proportion 0.891953 0.915415 0.937771 0.956827 0.971809 0.986714
## Faktor 7
## SS loadings 0.011734
## Proportion Var 0.001304
## Cumulative Var 0.098128
## Proportion Explained 0.013286
## Cumulative Proportion 1.000000