df <- read_csv("Global Economy Indicators.csv")
| CountryID | Country | Year | AMA exchange rate | IMF based exchange rate | Population | Currency | Per capita GNI | Agriculture, hunting, forestry, fishing (ISIC A-B) | Changes in inventories | Construction (ISIC F) | Exports of goods and services | Final consumption expenditure | General government final consumption expenditure | Gross capital formation | Gross fixed capital formation (including Acquisitions less disposals of valuables) | Household consumption expenditure (including Non-profit institutions serving households) | Imports of goods and services | Manufacturing (ISIC D) | Mining, Manufacturing, Utilities (ISIC C-E) | Other Activities (ISIC J-P) | Total Value Added | Transport, storage and communication (ISIC I) | Wholesale, retail trade, restaurants and hotels (ISIC G-H) | Gross National Income(GNI) in USD | Gross Domestic Product (GDP) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | Afghanistan | 1970 | 0.0449984 | 0.0449984 | 10752971 | Afghani | 164 | 869917407 | NA | 46793902 | 165618722 | 1663221269 | 112126986 | 94611818 | 94611818 | 1551094283 | 195277226 | 370146827 | 376690811 | 127747843 | 1731454254 | 83917200 | 226387091 | 1766527525 | 1731435587 |
| 4 | Afghanistan | 1971 | 0.0449984 | 0.0449984 | 11015857 | Afghani | 168 | 910828104 | NA | 48994113 | 193580300 | 1796541240 | 121114833 | 99012350 | 99012350 | 1675426407 | 276296480 | 387549502 | 394401164 | 133754097 | 1812857077 | 87860382 | 237019196 | 1850121913 | 1812837521 |
| 4 | Afghanistan | 1972 | 0.0449984 | 0.0449984 | 11286753 | Afghani | 149 | 827945340 | NA | 44535223 | 227654380 | 1607159399 | 108347543 | 103456794 | 103456794 | 1498811856 | 290370350 | 352284669 | 358512865 | 121582672 | 1647917912 | 79864525 | 215477287 | 1683947905 | 1647900178 |
| 4 | Afghanistan | 1973 | 0.0449984 | 0.0449984 | 11575305 | Afghani | 150 | 855486925 | NA | 46018542 | 226913554 | 1617037233 | 109013455 | 121728433 | 121728433 | 1508023800 | 262962880 | 364010279 | 370445793 | 125630236 | 1702734673 | 82528885 | 222624293 | 1739998153 | 1702716294 |
| 4 | Afghanistan | 1974 | 0.0449984 | 0.0449984 | 11869879 | Afghani | 177 | 1035913365 | NA | 55721659 | 284938449 | 1907408182 | 128588961 | 175061875 | 175061875 | 1778819221 | 305679151 | 440760406 | 448552790 | 152119162 | 2061751510 | 99918604 | 269525910 | 2106420227 | 2061729287 |
| 4 | Afghanistan | 1975 | 0.0449984 | 0.0449984 | 12157386 | Afghani | 195 | 1165441381 | NA | 62686658 | 300493815 | 2131358499 | 143686711 | 221728484 | 221728484 | 1987671788 | 333827202 | 495891889 | 504659018 | 171142804 | 2319778374 | 112416374 | 303432162 | 2369877014 | 2319753506 |
# Ukuran data
dim(df)
## [1] 10512 26
# Struktur dan tipe data
str(df)
## spc_tbl_ [10,512 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ CountryID : num [1:10512] 4 4 4 4 4 4 4 4 4 4 ...
## $ Country : chr [1:10512] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : num [1:10512] 1970 1971 1972 1973 1974 ...
## $ AMA exchange rate : num [1:10512] 0.045 0.045 0.045 0.045 0.045 ...
## $ IMF based exchange rate : num [1:10512] 0.045 0.045 0.045 0.045 0.045 ...
## $ Population : num [1:10512] 10752971 11015857 11286753 11575305 11869879 ...
## $ Currency : chr [1:10512] "Afghani" "Afghani" "Afghani" "Afghani" ...
## $ Per capita GNI : num [1:10512] 164 168 149 150 177 195 207 231 254 285 ...
## $ Agriculture, hunting, forestry, fishing (ISIC A-B) : num [1:10512] 8.70e+08 9.11e+08 8.28e+08 8.55e+08 1.04e+09 ...
## $ Changes in inventories : num [1:10512] NA NA NA NA NA NA NA NA NA NA ...
## $ Construction (ISIC F) : num [1:10512] 46793902 48994113 44535223 46018542 55721659 ...
## $ Exports of goods and services : num [1:10512] 1.66e+08 1.94e+08 2.28e+08 2.27e+08 2.85e+08 ...
## $ Final consumption expenditure : num [1:10512] 1.66e+09 1.80e+09 1.61e+09 1.62e+09 1.91e+09 ...
## $ General government final consumption expenditure : num [1:10512] 1.12e+08 1.21e+08 1.08e+08 1.09e+08 1.29e+08 ...
## $ Gross capital formation : num [1:10512] 9.46e+07 9.90e+07 1.03e+08 1.22e+08 1.75e+08 ...
## $ Gross fixed capital formation (including Acquisitions less disposals of valuables) : num [1:10512] 9.46e+07 9.90e+07 1.03e+08 1.22e+08 1.75e+08 ...
## $ Household consumption expenditure (including Non-profit institutions serving households): num [1:10512] 1.55e+09 1.68e+09 1.50e+09 1.51e+09 1.78e+09 ...
## $ Imports of goods and services : num [1:10512] 1.95e+08 2.76e+08 2.90e+08 2.63e+08 3.06e+08 ...
## $ Manufacturing (ISIC D) : num [1:10512] 3.70e+08 3.88e+08 3.52e+08 3.64e+08 4.41e+08 ...
## $ Mining, Manufacturing, Utilities (ISIC C-E) : num [1:10512] 3.77e+08 3.94e+08 3.59e+08 3.70e+08 4.49e+08 ...
## $ Other Activities (ISIC J-P) : num [1:10512] 1.28e+08 1.34e+08 1.22e+08 1.26e+08 1.52e+08 ...
## $ Total Value Added : num [1:10512] 1.73e+09 1.81e+09 1.65e+09 1.70e+09 2.06e+09 ...
## $ Transport, storage and communication (ISIC I) : num [1:10512] 83917200 87860382 79864525 82528885 99918604 ...
## $ Wholesale, retail trade, restaurants and hotels (ISIC G-H) : num [1:10512] 2.26e+08 2.37e+08 2.15e+08 2.23e+08 2.70e+08 ...
## $ Gross National Income(GNI) in USD : num [1:10512] 1.77e+09 1.85e+09 1.68e+09 1.74e+09 2.11e+09 ...
## $ Gross Domestic Product (GDP) : num [1:10512] 1.73e+09 1.81e+09 1.65e+09 1.70e+09 2.06e+09 ...
## - attr(*, "spec")=
## .. cols(
## .. CountryID = col_double(),
## .. Country = col_character(),
## .. Year = col_double(),
## .. `AMA exchange rate` = col_double(),
## .. `IMF based exchange rate` = col_double(),
## .. Population = col_double(),
## .. Currency = col_character(),
## .. `Per capita GNI` = col_double(),
## .. `Agriculture, hunting, forestry, fishing (ISIC A-B)` = col_double(),
## .. `Changes in inventories` = col_double(),
## .. `Construction (ISIC F)` = col_double(),
## .. `Exports of goods and services` = col_double(),
## .. `Final consumption expenditure` = col_double(),
## .. `General government final consumption expenditure` = col_double(),
## .. `Gross capital formation` = col_double(),
## .. `Gross fixed capital formation (including Acquisitions less disposals of valuables)` = col_double(),
## .. `Household consumption expenditure (including Non-profit institutions serving households)` = col_double(),
## .. `Imports of goods and services` = col_double(),
## .. `Manufacturing (ISIC D)` = col_double(),
## .. `Mining, Manufacturing, Utilities (ISIC C-E)` = col_double(),
## .. `Other Activities (ISIC J-P)` = col_double(),
## .. `Total Value Added` = col_double(),
## .. `Transport, storage and communication (ISIC I)` = col_double(),
## .. `Wholesale, retail trade, restaurants and hotels (ISIC G-H)` = col_double(),
## .. `Gross National Income(GNI) in USD` = col_double(),
## .. `Gross Domestic Product (GDP)` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Statistik deskriptif
summary(df)
## CountryID Country Year AMA exchange rate
## Min. : 4.0 Length:10512 Min. :1970 Min. :0.000e+00
## 1st Qu.:212.0 Class :character 1st Qu.:1984 1st Qu.:1.000e+00
## Median :430.0 Mode :character Median :1997 Median :2.813e+00
## Mean :431.1 Mean :1996 Mean :3.574e+02
## 3rd Qu.:643.0 3rd Qu.:2009 3rd Qu.:5.134e+01
## Max. :894.0 Max. :2021 Max. :1.116e+05
##
## IMF based exchange rate Population Currency
## Min. : 0.000 Min. :4.359e+03 Length:10512
## 1st Qu.: 1.000 1st Qu.:6.331e+05 Class :character
## Median : 2.761 Median :5.052e+06 Mode :character
## Mean : 341.985 Mean :2.852e+07
## 3rd Qu.: 48.067 3rd Qu.:1.679e+07
## Max. :42000.000 Max. :1.426e+09
##
## Per capita GNI Agriculture, hunting, forestry, fishing (ISIC A-B)
## Min. : 34 Min. :2.814e+04
## 1st Qu.: 730 1st Qu.:1.305e+08
## Median : 2316 Median :9.314e+08
## Mean : 8966 Mean :7.793e+09
## 3rd Qu.: 8966 3rd Qu.:4.023e+09
## Max. :234317 Max. :1.350e+12
## NA's :121
## Changes in inventories Construction (ISIC F) Exports of goods and services
## Min. :-1.510e+11 Min. :1.577e+05 Min. :-1.670e+09
## 1st Qu.: 4.674e+05 1st Qu.:6.816e+07 1st Qu.: 4.027e+08
## Median : 6.830e+07 Median :3.718e+08 Median : 2.408e+09
## Mean : 1.627e+09 Mean :1.002e+10 Mean : 4.671e+10
## 3rd Qu.: 6.911e+08 3rd Qu.:2.816e+09 3rd Qu.: 1.542e+10
## Max. : 2.110e+11 Max. :1.240e+12 Max. : 3.530e+12
## NA's :1841 NA's :21
## Final consumption expenditure General government final consumption expenditure
## Min. :2.044e+06 Min. :5.001e+05
## 1st Qu.:1.210e+09 1st Qu.:2.132e+08
## Median :6.524e+09 Median :1.122e+09
## Mean :1.358e+11 Mean :3.085e+10
## 3rd Qu.:3.784e+10 3rd Qu.:7.423e+09
## Max. :1.930e+13 Max. :3.350e+12
## NA's :52
## Gross capital formation
## Min. :-4.397e+10
## 1st Qu.: 2.808e+08
## Median : 1.766e+09
## Mean : 4.654e+10
## 3rd Qu.: 1.241e+10
## Max. : 7.600e+12
## NA's :52
## Gross fixed capital formation (including Acquisitions less disposals of valuables)
## Min. :2.920e+05
## 1st Qu.:2.735e+08
## Median :1.632e+09
## Mean :4.519e+10
## 3rd Qu.:1.166e+10
## Max. :7.430e+12
## NA's :52
## Household consumption expenditure (including Non-profit institutions serving households)
## Min. :7.960e+05
## 1st Qu.:9.390e+08
## Median :5.154e+09
## Mean :1.055e+11
## 3rd Qu.:3.009e+10
## Max. :1.590e+13
## NA's :52
## Imports of goods and services Manufacturing (ISIC D)
## Min. :1.982e+06 Min. :-2.485e+05
## 1st Qu.:5.891e+08 1st Qu.: 1.109e+08
## Median :2.892e+09 Median : 9.114e+08
## Mean :4.592e+10 Mean : 3.093e+10
## 3rd Qu.:1.602e+10 3rd Qu.: 7.405e+09
## Max. :3.400e+12 Max. : 4.870e+12
## NA's :42 NA's :43
## Mining, Manufacturing, Utilities (ISIC C-E) Other Activities (ISIC J-P)
## Min. :-2.581e+06 Min. :1.044e+06
## 1st Qu.: 1.857e+08 1st Qu.:3.556e+08
## Median : 1.597e+09 Median :2.107e+09
## Mean : 4.153e+10 Mean :7.474e+10
## 3rd Qu.: 1.251e+10 3rd Qu.:1.442e+10
## Max. : 5.780e+12 Max. :1.290e+13
##
## Total Value Added Transport, storage and communication (ISIC I)
## Min. :2.411e+06 Min. :-6.479e+06
## 1st Qu.:1.356e+09 1st Qu.: 8.999e+07
## Median :7.494e+09 Median : 5.442e+08
## Mean :1.745e+11 Mean : 1.557e+10
## 3rd Qu.:4.819e+10 3rd Qu.: 3.976e+09
## Max. :2.330e+13 Max. : 2.490e+12
## NA's :49
## Wholesale, retail trade, restaurants and hotels (ISIC G-H)
## Min. :2.302e+05
## 1st Qu.:2.079e+08
## Median :1.056e+09
## Mean :2.510e+10
## 3rd Qu.:6.707e+09
## Max. :3.520e+12
## NA's :49
## Gross National Income(GNI) in USD Gross Domestic Product (GDP)
## Min. :3.565e+06 Min. :2.585e+06
## 1st Qu.:1.411e+09 1st Qu.:1.439e+09
## Median :7.887e+09 Median :8.071e+09
## Mean :1.825e+11 Mean :1.829e+11
## 3rd Qu.:5.006e+10 3rd Qu.:5.173e+10
## Max. :2.360e+13 Max. :2.330e+13
##
# Range tahun
range(df$Year, na.rm = TRUE)
## [1] 1970 2021
# Jumlah missing value
colSums(is.na(df))
## CountryID
## 0
## Country
## 0
## Year
## 0
## AMA exchange rate
## 0
## IMF based exchange rate
## 0
## Population
## 0
## Currency
## 0
## Per capita GNI
## 0
## Agriculture, hunting, forestry, fishing (ISIC A-B)
## 121
## Changes in inventories
## 1841
## Construction (ISIC F)
## 0
## Exports of goods and services
## 21
## Final consumption expenditure
## 0
## General government final consumption expenditure
## 52
## Gross capital formation
## 52
## Gross fixed capital formation (including Acquisitions less disposals of valuables)
## 52
## Household consumption expenditure (including Non-profit institutions serving households)
## 52
## Imports of goods and services
## 42
## Manufacturing (ISIC D)
## 43
## Mining, Manufacturing, Utilities (ISIC C-E)
## 0
## Other Activities (ISIC J-P)
## 0
## Total Value Added
## 0
## Transport, storage and communication (ISIC I)
## 49
## Wholesale, retail trade, restaurants and hotels (ISIC G-H)
## 49
## Gross National Income(GNI) in USD
## 0
## Gross Domestic Product (GDP)
## 0
# Kelengkapan data per tahun
df %>% group_by(Year) %>% summarise(country_total = n(), missing_percent = mean(is.na(unlist(across()))) * 100) %>%
kable() %>% kable_styling(full_width = FALSE) %>% scroll_box(width = "100%", height = "400px")
| Year | country_total | missing_percent |
|---|---|---|
| 1970 | 187 | 1.1976048 |
| 1971 | 187 | 1.1976048 |
| 1972 | 187 | 1.1548332 |
| 1973 | 187 | 1.1120616 |
| 1974 | 187 | 1.1548332 |
| 1975 | 187 | 1.1120616 |
| 1976 | 187 | 1.1334474 |
| 1977 | 187 | 1.1334474 |
| 1978 | 187 | 1.1334474 |
| 1979 | 187 | 1.0692900 |
| 1980 | 187 | 1.0692900 |
| 1981 | 187 | 1.0479042 |
| 1982 | 187 | 1.0265184 |
| 1983 | 187 | 1.0265184 |
| 1984 | 187 | 1.0051326 |
| 1985 | 187 | 0.9837468 |
| 1986 | 187 | 1.0051326 |
| 1987 | 187 | 0.9409752 |
| 1988 | 187 | 0.9409752 |
| 1989 | 188 | 0.9572431 |
| 1990 | 216 | 0.8887243 |
| 1991 | 211 | 0.8150114 |
| 1992 | 211 | 0.9097801 |
| 1993 | 211 | 0.8529189 |
| 1994 | 210 | 0.8379356 |
| 1995 | 210 | 0.8569796 |
| 1996 | 210 | 0.8379356 |
| 1997 | 210 | 0.8188916 |
| 1998 | 210 | 0.8950676 |
| 1999 | 210 | 0.8950676 |
| 2000 | 210 | 0.8760236 |
| 2001 | 210 | 0.8379356 |
| 2002 | 210 | 0.7617597 |
| 2003 | 210 | 0.7808037 |
| 2004 | 210 | 0.6855837 |
| 2005 | 212 | 0.7168459 |
| 2006 | 212 | 0.6979815 |
| 2007 | 212 | 0.7168459 |
| 2008 | 214 | 0.7101476 |
| 2009 | 214 | 0.7475238 |
| 2010 | 214 | 0.7475238 |
| 2011 | 213 | 0.7322569 |
| 2012 | 213 | 0.7510327 |
| 2013 | 212 | 0.7357102 |
| 2014 | 212 | 0.7545746 |
| 2015 | 212 | 0.7734390 |
| 2016 | 212 | 0.7923033 |
| 2017 | 212 | 0.8111677 |
| 2018 | 212 | 0.8488964 |
| 2019 | 212 | 0.8866252 |
| 2020 | 212 | 0.9432183 |
| 2021 | 212 | 1.0375401 |
# cek duplikasi
sum(duplicated(df))
## [1] 0
Penelitian difokuskan pada tahun 2021 agar analisis merepresentasikan kondisi ekonomi global pada periode terbaru dalam dataset. Oleh karena itu, data difilter hanya untuk observasi tahun 2021.
df_2021 <- df %>% filter(Year == 2021)
dim(df_2021)
## [1] 212 26
Outlier univariat dideteksi menggunakan metode boxplot berbasis IQR. Transformasi log (log1p) digunakan untuk mengurangi skewness pada data ekonomi yang biasanya memiliki rentang sangat besar.
df_2021_numeric <- df_2021 %>%
select(where(is.numeric)) %>%
select(-Year)
colnames(df_2021_numeric) <- paste0("X", 1:ncol(df_2021_numeric))
# Boxplot univariat (metode IQR)
df_2021_numeric %>%
mutate(across(everything(), log1p)) %>%
pivot_longer(everything()) %>%
ggplot(aes(x = name, y = value)) +
geom_boxplot(fill = "steelblue",
outlier.color = "red",
outlier.size = 2) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 0)) +
labs(title = "Deteksi Outlier Univariat (Boxplot - IQR)",
x = "Indikator (X1–X22)",
y = "Nilai (log scale)")
Visualisasi di atas membantu mengidentifikasi nilai ekstrem pada masing-masing indikator.
Tiga metode imputasi dibandingkan:
# Median
df_median <- df_2021 %>%
mutate(across(where(is.numeric),
~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
Mengganti nilai hilang dengan median variabel.
# Mean
df_mean <- df_2021 %>%
mutate(across(where(is.numeric),
~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
Mengganti nilai hilang dengan rata-rata variabel.
# KNN
df_knn <- df_2021 %>%
select(where(is.numeric)) %>%
kNN(k = 5) %>%
select(-ends_with("_imp"))
Menggunakan metode K-Nearest Neighbors (k = 5) untuk memperkirakan nilai hilang berdasarkan kedekatan observasi.
Perbandingan ini dilakukan untuk menentukan metode imputasi terbaik berdasarkan uji kelayakan data.
Variabel non-numerik dan identitas negara dihapus karena tidak relevan dalam analisis PCA dan FA.
df_med_clean <- df_median %>% select(-Country, -Currency, -CountryID, -Year)
df_mean_clean <- df_mean %>% select(-Country, -Currency, -CountryID, -Year)
df_knn_clean <- df_knn %>% select(-Year)
Variabel dengan varians nol dihapus karena tidak memberikan informasi dalam analisis multivariat.
clean_var <- function(x){
x %>% select(where(~ sd(., na.rm = TRUE) > 0))
}
df_med_clean <- clean_var(df_med_clean)
df_mean_clean <- clean_var(df_mean_clean)
df_knn_clean <- clean_var(df_knn_clean)
Data distandarisasi menggunakan fungsi scale() sehingga memiliki: Mean = 0 Standar deviasi = 1 Standardisasi diperlukan karena PCA dan FA sensitif terhadap perbedaan skala variabel.
df_med_scaled <- as.data.frame(scale(df_med_clean))
df_mean_scaled <- as.data.frame(scale(df_mean_clean))
df_knn_scaled <- as.data.frame(scale(df_knn_clean))
Data penelitian terdiri dari 22 indikator ekonomi yang digunakan dalam analisis Principal Component Analysis (PCA) dan Factor Analysis (FA). Berikut adalah daftar variabel yang digunakan:
| Simbol | Nama.Variabel |
|---|---|
| X1 | AMA exchange rate |
| X2 | IMF based exchange rate |
| X3 | Population |
| X4 | Per capita GNI |
| X5 | Agriculture, hunting, forestry, fishing (ISIC A-B) |
| X6 | Changes in inventories |
| X7 | Construction (ISIC F) |
| X8 | Exports of goods and services |
| X9 | Final consumption expenditure |
| X10 | General government final consumption expenditure |
| X11 | Gross capital formation |
| X12 | Gross fixed capital formation (including Acquisitions less disposals of valuables) |
| X13 | Household consumption expenditure (including Non-profit institutions serving households) |
| X14 | Imports of goods and services |
| X15 | Manufacturing (ISIC D) |
| X16 | Mining, Manufacturing, Utilities (ISIC C-E) |
| X17 | Other Activities (ISIC J-P) |
| X18 | Total Value Added |
| X19 | Transport, storage and communication (ISIC I) |
| X20 | Wholesale, retail trade, restaurants and hotels (ISIC G-H) |
| X21 | Gross National Income (GNI) in USD |
| X22 | Gross Domestic Product (GDP) |
Outlier multivariat dideteksi menggunakan Mahalanobis Distance. Cut-off ditentukan berdasarkan distribusi Chi-Square dengan derajat bebas sebesar jumlah variabel.
Observasi dengan nilai Mahalanobis melebihi cut-off dianggap sebagai outlier multivariat.
# Cek outlier (Mahalanobis Distance)
center <- colMeans(df_med_scaled)
cov_mat <- cov(df_med_scaled)
mahal_dist <- mahalanobis(df_med_scaled, center, cov_mat)
cutoff <- qchisq(0.975, df = ncol(df_med_scaled))
outlier_index <- which(mahal_dist > cutoff)
length(outlier_index)
## [1] 33
outlier_index
## [1] 8 9 22 33 40 63 70 81 84 85 86 88 90 93 98 108 120 121 132
## [20] 142 152 159 167 172 174 176 179 185 188 194 202 205 209
plot(mahal_dist,
type = "h",
main = "Mahalanobis Distance Plot",
xlab = "Observasi",
ylab = "Mahalanobis Distance")
abline(h = cutoff, col = "red", lwd = 2)
Berdasarkan deteksi outlier secara univariat dan multivariat, ditemukan beberapa observasi yang teridentifikasi sebagai outlier. Namun karena jumlahnya relatif besar dan masih merepresentasikan variasi alami antar negara, observasi tersebut tidak dihapus agar tidak menghilangkan informasi penting dalam analisis.
KMO mengukur kecukupan sampel untuk analisis faktor.
Kriteria interpretasi: 0.90 → Sangat baik 0.80–0.89 → Baik 0.70–0.79 → Cukup < 0.60 → Tidak layak
kmo_med <- KMO(df_med_scaled)
kmo_mean <- KMO(df_mean_scaled)
kmo_knn <- KMO(df_knn_scaled)
kmo_compare <- data.frame(
Metode = c("Median","Mean","KNN"),
KMO = c(kmo_med$MSA, kmo_mean$MSA, kmo_knn$MSA)
)
kable(kmo_compare) %>% kable_styling(full_width = FALSE)
| Metode | KMO |
|---|---|
| Median | 0.7436081 |
| Mean | 0.7529374 |
| KNN | 0.7392948 |
Meskipun nilai KMO metode mean sedikit lebih tinggi, imputasi median dipilih karena data ekonomi cenderung mengandung outlier. Mean sensitif terhadap nilai ekstrem, sedangkan median lebih robust dan lebih stabil dalam merepresentasikan kecenderungan sentral data. Oleh karena itu, median dinilai lebih sesuai untuk menjaga struktur asli data sebelum dilakukan PCA dan FA.
Bartlett menguji apakah matriks korelasi berbeda signifikan dari matriks identitas. Jika p-value < 0.05 → data layak untuk FA.
bart_med <- cortest.bartlett(df_med_scaled)
bart_mean <- cortest.bartlett(df_mean_scaled)
bart_knn <- cortest.bartlett(df_knn_scaled)
bart_compare <- data.frame(
Metode = c("Median","Mean","KNN"),
p_value = c(bart_med$p.value,
bart_mean$p.value,
bart_knn$p.value)
)
kable(bart_compare) %>% kable_styling(full_width = FALSE)
| Metode | p_value |
|---|---|
| Median | 0 |
| Mean | 0 |
| KNN | 0 |
Visualisasi korelasi dilakukan untuk melihat pola hubungan antar indikator ekonomi.
Adanya korelasi tinggi menunjukkan kemungkinan adanya struktur faktor laten.
kor <- cor(df_med_scaled)
corrplot(kor,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.7,
tl.cex = 0.8)
PCA dilakukan menggunakan fungsi prcomp() pada data yang telah distandarisasi.
pca_final <- prcomp(df_med_scaled, scale. = FALSE)
summary(pca_final)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 4.0268 1.5081 1.28843 0.99357 0.62321 0.4965 0.37645
## Proportion of Variance 0.7371 0.1034 0.07546 0.04487 0.01765 0.0112 0.00644
## Cumulative Proportion 0.7371 0.8404 0.91590 0.96077 0.97843 0.9896 0.99607
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.22122 0.1244 0.09507 0.07229 0.04948 0.04821 0.03651
## Proportion of Variance 0.00222 0.0007 0.00041 0.00024 0.00011 0.00011 0.00006
## Cumulative Proportion 0.99830 0.9990 0.99941 0.99965 0.99976 0.99987 0.99993
## PC15 PC16 PC17 PC18 PC19 PC20
## Standard deviation 0.02956 0.02388 0.01222 0.005283 0.001145 0.0007479
## Proportion of Variance 0.00004 0.00003 0.00001 0.000000 0.000000 0.0000000
## Cumulative Proportion 0.99997 0.99999 1.00000 1.000000 1.000000 1.0000000
## PC21 PC22
## Standard deviation 0.000403 0.0002022
## Proportion of Variance 0.000000 0.0000000
## Cumulative Proportion 1.000000 1.0000000
Eigenvalue dihitung dari kuadrat standar deviasi komponen utama. Eigenvalue menunjukkan jumlah variasi yang dijelaskan oleh setiap komponen.
eigen_values <- pca_final$sdev^2
Komponen dipilih jika memiliki eigenvalue > 1.
sum(eigen_values > 1)
## [1] 3
Menghitung proporsi dan kumulatif variasi yang dijelaskan oleh komponen utama.
prop_var <- eigen_values / sum(eigen_values)
cum_var <- cumsum(prop_var)
cum_var
## [1] 0.7370582 0.8404448 0.9159012 0.9607731 0.9784273 0.9896310 0.9960727
## [8] 0.9982971 0.9990003 0.9994111 0.9996487 0.9997600 0.9998656 0.9999262
## [15] 0.9999659 0.9999919 0.9999986 0.9999999 1.0000000 1.0000000 1.0000000
## [22] 1.0000000
Scree plot digunakan untuk mengidentifikasi titik elbow sebagai alternatif penentuan jumlah komponen.
fviz_eig(pca_final, addlabels = TRUE)
Menunjukkan variabel mana yang paling berkontribusi pada PC1 dan PC2.
# Ke PC1
fviz_contrib(
pca_final,
choice = "var",
axes = 1,
top = 15,
fill = "steelblue"
) +
geom_hline(yintercept = 100/22, linetype = 2)
# Ke PC2
fviz_contrib(
pca_final,
choice = "var",
axes = 2,
top = 15,
fill = "steelblue"
) +
geom_hline(yintercept = 100/22, linetype = 2)
Loading menunjukkan korelasi antara variabel asli dan komponen utama.
fviz_pca_var(pca_final,
col.var = "contrib",
gradient.cols = c("blue","orange","red"),
repel = TRUE)
cov_mat <- cov(df_med_scaled)
eig_manual <- eigen(cov_mat)
eigen_values_manual <- eig_manual$values
eigen_vectors_manual <- eig_manual$vectors
prop_var_manual <- eigen_values_manual / sum(eigen_values_manual)
cum_var_manual <- cumsum(prop_var_manual)
pca_manual <- data.frame(
Komponen = paste0("PC", 1:length(eigen_values_manual)),
Eigenvalue = eigen_values_manual,
Proporsi = prop_var_manual,
Kumulatif = cum_var_manual
)
kable(pca_manual) %>% kable_styling(full_width = FALSE)
| Komponen | Eigenvalue | Proporsi | Kumulatif |
|---|---|---|---|
| PC1 | 16.2152794 | 0.7370582 | 0.7370582 |
| PC2 | 2.2745056 | 0.1033866 | 0.8404448 |
| PC3 | 1.6600415 | 0.0754564 | 0.9159012 |
| PC4 | 0.9871808 | 0.0448719 | 0.9607731 |
| PC5 | 0.3883928 | 0.0176542 | 0.9784273 |
| PC6 | 0.2464823 | 0.0112037 | 0.9896310 |
| PC7 | 0.1417162 | 0.0064416 | 0.9960727 |
| PC8 | 0.0489369 | 0.0022244 | 0.9982971 |
| PC9 | 0.0154703 | 0.0007032 | 0.9990003 |
| PC10 | 0.0090389 | 0.0004109 | 0.9994111 |
| PC11 | 0.0052266 | 0.0002376 | 0.9996487 |
| PC12 | 0.0024478 | 0.0001113 | 0.9997600 |
| PC13 | 0.0023246 | 0.0001057 | 0.9998656 |
| PC14 | 0.0013328 | 0.0000606 | 0.9999262 |
| PC15 | 0.0008739 | 0.0000397 | 0.9999659 |
| PC16 | 0.0005704 | 0.0000259 | 0.9999919 |
| PC17 | 0.0001493 | 0.0000068 | 0.9999986 |
| PC18 | 0.0000279 | 0.0000013 | 0.9999999 |
| PC19 | 0.0000013 | 0.0000001 | 1.0000000 |
| PC20 | 0.0000006 | 0.0000000 | 1.0000000 |
| PC21 | 0.0000002 | 0.0000000 | 1.0000000 |
| PC22 | 0.0000000 | 0.0000000 | 1.0000000 |
scores_pca_manual <- as.matrix(df_med_scaled) %*% eigen_vectors_manual
head(scores_pca_manual)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.9004235 0.05877780 0.27018743 -0.6244589 0.0396896761 -0.0036307050
## [2,] 0.9314044 0.16200982 0.16020171 -0.4156180 -0.0922514766 -0.1307730500
## [3,] 0.5246835 -0.08441405 0.35949071 -0.4586007 -0.0639643027 0.0001875561
## [4,] 0.9266989 0.43298766 0.04928632 0.7332024 0.2180295981 -0.3256663295
## [5,] 0.7942831 -0.04994436 0.17697253 -0.5428798 0.0003539282 -0.0157782489
## [6,] 0.9614402 0.24339753 0.14050150 -0.1440502 -0.0350638940 -0.1870070519
## [,7] [,8] [,9] [,10] [,11]
## [1,] -0.08364019 0.034466137 -0.0177028130 -0.043133450 -0.039212126
## [2,] -0.05205450 0.027330484 0.0080991108 0.026044955 -0.001634054
## [3,] -0.18120661 -0.085107606 -0.0175093865 0.041663003 0.038968272
## [4,] -0.09050471 -0.027470648 0.0021297420 -0.007725320 -0.013454682
## [5,] -0.03174048 0.003393824 -0.0006248975 -0.033945374 0.019844532
## [6,] -0.07421066 0.010253036 0.0042237660 0.007338755 -0.010164589
## [,12] [,13] [,14] [,15] [,16]
## [1,] 0.024369993 -0.014146473 -0.001306507 0.024413201 -0.002352774
## [2,] 0.001386426 0.002547264 0.001514197 -0.011078662 -0.005025632
## [3,] 0.045179399 -0.003628634 -0.033445316 0.023108627 0.023788100
## [4,] 0.001958845 0.005176775 0.007850534 -0.004750845 -0.006181017
## [5,] -0.001966991 -0.036140211 -0.005267282 0.039935478 -0.012960562
## [6,] 0.005168492 -0.001429394 0.002656469 -0.003531774 -0.006977464
## [,17] [,18] [,19] [,20] [,21]
## [1,] 0.0009698838 -0.0027126927 1.720017e-04 0.0002565293 2.278286e-04
## [2,] -0.0016964745 0.0021087536 2.037386e-04 0.0002072516 -8.101792e-05
## [3,] 0.0034269602 0.0004365102 -1.103219e-03 -0.0001453891 -8.365906e-05
## [4,] 0.0002028357 0.0011149791 2.219595e-04 -0.0001484701 -3.060593e-05
## [5,] 0.0039967225 0.0028070864 -5.339321e-05 -0.0002025450 -7.509037e-05
## [6,] -0.0002649622 0.0012519337 1.830709e-04 0.0002145693 -6.217044e-05
## [,22]
## [1,] 3.466698e-05
## [2,] -3.285652e-05
## [3,] 1.615282e-04
## [4,] -9.593451e-05
## [5,] 1.881789e-05
## [6,] -5.504854e-05
Jumlah faktor ditentukan menggunakan Parallel Analysis. Metode ini membandingkan eigenvalue aktual dengan eigenvalue acak.
fa_par <- fa.parallel(df_med_scaled, fa = "fa", plot = TRUE)
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
k <- fa_par$nfact
FA dilakukan menggunakan metode Principal Axis Factoring (PAF) dengan rotasi varimax. Rotasi varimax digunakan untuk mempermudah interpretasi faktor.
fa_result <- fa(df_med_scaled,
nfactors = k,
rotate = "varimax",
fm = "pa")
print(fa_result$loadings, cutoff = 0.4)
##
## Loadings:
## PA1 PA3 PA2
## X1 0.979
## X2 0.922
## X3 0.783
## X4
## X5 0.956
## X6 0.837
## X7 0.733 0.677
## X8 0.678 0.609
## X9 0.949
## X10 0.865 0.485
## X11 0.682 0.722
## X12 0.691 0.712
## X13 0.958
## X14 0.795 0.507
## X15 0.615 0.769
## X16 0.644 0.752
## X17 0.962
## X18 0.885 0.463
## X19 0.939
## X20 0.935
## X21 0.889 0.458
## X22 0.884 0.466
##
## PA1 PA3 PA2
## SS loadings 11.152 6.845 1.896
## Proportion Var 0.507 0.311 0.086
## Cumulative Var 0.507 0.818 0.904
fa_par <- fa.parallel(df_med_scaled, fa = "fa", plot = FALSE)
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
k <- fa_par$nfact
# Hitung matriks korelasi
R <- cor(df_med_scaled)
# Hitung communalities awal
invR <- solve(R)
smc <- 1 - 1/diag(invR)
# Bentuk reduced correlation matrix
R_reduced <- R
diag(R_reduced) <- smc
# Eigen decomposition
eig_fa <- eigen(R_reduced)
lambda_fa <- eig_fa$values[1:k]
V_fa <- eig_fa$vectors[,1:k]
# Hitung factor loading
loadings_fa_manual <- sweep(V_fa, 2, sqrt(lambda_fa), "*")
# Hitung unique variance
psi <- 1 - rowSums(loadings_fa_manual^2)