1. Load Library

library(readr)
library(dplyr)
library(ggplot2)
library(corrplot)
library(psych)
library(FactoMineR)
library(factoextra)

2. Load Dataset

# Read the raw dataset from a local absolute path.
# NOTE(review): this path is specific to one machine; a relative or
# configurable path would be needed to run the report elsewhere.
finance_data <- read_csv("C:/Users/diyan/Downloads/5k.csv")
## Rows: 5000 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): Occupation, Risk Tolerance, Investment Goals, Income Level, Addres...
## dbl  (2): Age, Loan Term (Months)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardize column names into syntactically valid R names (spaces and
# symbols become "."). unique = TRUE guarantees the converted names cannot
# collide when two different headers map to the same valid name.
names(finance_data) <- make.names(names(finance_data), unique = TRUE)

# Preview the first rows to confirm the renamed columns.
head(finance_data)
## # A tibble: 6 × 19
##     Age Occupation Risk.Tolerance Investment.Goals    Income.Level Address      
##   <dbl> <chr>      <chr>          <chr>               <chr>        <chr>        
## 1    40 Lawyer     High           Wealth Preservation $46044.94    "7168 Moody …
## 2    30 Teacher    Low            Wealth Preservation $57169.50    "50001 Hecto…
## 3    37 Teacher    Low            Speculation         $71760.86    "997 James I…
## 4    27 Student    Medium         Speculation         $-25488.15   "1607 Joshua…
## 5    36 Engineer   Low            Income Generation   $106777.95   "96690 Campb…
## 6    77 Doctor     Low            Income Generation   $59157.22    "23020 Jacks…
## # ℹ 13 more variables: Account.Balance <chr>, Deposits <chr>,
## #   Withdrawals <chr>, Transfers <chr>, International.Transfers <chr>,
## #   Investments <chr>, Loan.Amount <chr>, Loan.Purpose <chr>,
## #   Employment.Status <chr>, Loan.Term..Months. <dbl>, Interest.Rate <chr>,
## #   Loan.Status <chr>, Transaction.Description <chr>
# Inspect the column types of the freshly loaded data.
str(finance_data)
## spc_tbl_ [5,000 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                    : num [1:5000] 40 30 37 27 36 77 70 45 56 61 ...
##  $ Occupation             : chr [1:5000] "Lawyer" "Teacher" "Teacher" "Student" ...
##  $ Risk.Tolerance         : chr [1:5000] "High" "Low" "Low" "Medium" ...
##  $ Investment.Goals       : chr [1:5000] "Wealth Preservation" "Wealth Preservation" "Speculation" "Speculation" ...
##  $ Income.Level           : chr [1:5000] "$46044.94" "$57169.50" "$71760.86" "$-25488.15" ...
##  $ Address                : chr [1:5000] "7168 Moody Meadow\nHernandezshire, PW 06016" "50001 Hector Square\nWest Luisfurt, MA 51935" "997 James Isle\nNorth Rebeccafurt, RI 13366" "1607 Joshua Camp Apt. 634\nConleymouth, CT 66479" ...
##  $ Account.Balance        : chr [1:5000] "$44653.26" "$29175.47" "$86141.59" "$1000.00" ...
##  $ Deposits               : chr [1:5000] "$9156.01" "$5933.22" "$22583.11" "$299.47" ...
##  $ Withdrawals            : chr [1:5000] "$9327.70" "$8671.60" "$16468.58" "$289.09" ...
##  $ Transfers              : chr [1:5000] "$3647.92" "$6729.86" "$6032.53" "$109.83" ...
##  $ International.Transfers: chr [1:5000] "$82.80" "$819.69" "$1526.33" "$3.51" ...
##  $ Investments            : chr [1:5000] "$8729.30" "$4545.18" "$8251.45" "$195.73" ...
##  $ Loan.Amount            : chr [1:5000] "$27010.93" "$31266.97" "$41260.58" "$5000.00" ...
##  $ Loan.Purpose           : chr [1:5000] "Medical Expenses" "Auto Purchase" "Auto Purchase" "Small Business" ...
##  $ Employment.Status      : chr [1:5000] "Retired" "Retired" "Employed" "Retired" ...
##  $ Loan.Term..Months.     : num [1:5000] 36 36 12 60 24 24 24 48 48 12 ...
##  $ Interest.Rate          : chr [1:5000] "11.94%" "8.08%" "13.07%" "6.73%" ...
##  $ Loan.Status            : chr [1:5000] "pending" "approved" "pending" "approved" ...
##  $ Transaction.Description: chr [1:5000] "Electronics transaction of $706.18 at Sanders, Roberts and Hughes" "Transaction at Evans-Smith for $2250.03" "Purchase at Taylor-Gutierrez for $615.85 on 2024-02-21" "Travel transaction of $4852.39 at Jones-Russell" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Occupation = col_character(),
##   ..   `Risk Tolerance` = col_character(),
##   ..   `Investment Goals` = col_character(),
##   ..   `Income Level` = col_character(),
##   ..   Address = col_character(),
##   ..   `Account Balance` = col_character(),
##   ..   Deposits = col_character(),
##   ..   Withdrawals = col_character(),
##   ..   Transfers = col_character(),
##   ..   `International Transfers` = col_character(),
##   ..   Investments = col_character(),
##   ..   `Loan Amount` = col_character(),
##   ..   `Loan Purpose` = col_character(),
##   ..   `Employment Status` = col_character(),
##   ..   `Loan Term (Months)` = col_double(),
##   ..   `Interest Rate` = col_character(),
##   ..   `Loan Status` = col_character(),
##   ..   `Transaction Description` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3. Data Preprocessing

3.1 Pembersihan Format Angka

# Columns that hold currency text such as "$46044.94" and must become numeric.
currency_columns <- c(
  "Income.Level", "Account.Balance", "Deposits", "Withdrawals",
  "Transfers", "International.Transfers", "Investments", "Loan.Amount"
)

# Strip "$" and "," in a single pass, then convert the remaining text.
for (currency_col in currency_columns) {
  cleaned_text <- gsub("[$,]", "", finance_data[[currency_col]])
  finance_data[[currency_col]] <- as.numeric(cleaned_text)
}

# Interest rate: drop the "%" sign and express the value as a fraction.
finance_data[["Interest.Rate"]] <-
  as.numeric(gsub("%", "", finance_data[["Interest.Rate"]])) / 100

# Confirm the converted columns are now numeric.
str(finance_data)
## spc_tbl_ [5,000 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                    : num [1:5000] 40 30 37 27 36 77 70 45 56 61 ...
##  $ Occupation             : chr [1:5000] "Lawyer" "Teacher" "Teacher" "Student" ...
##  $ Risk.Tolerance         : chr [1:5000] "High" "Low" "Low" "Medium" ...
##  $ Investment.Goals       : chr [1:5000] "Wealth Preservation" "Wealth Preservation" "Speculation" "Speculation" ...
##  $ Income.Level           : num [1:5000] 46045 57170 71761 -25488 106778 ...
##  $ Address                : chr [1:5000] "7168 Moody Meadow\nHernandezshire, PW 06016" "50001 Hector Square\nWest Luisfurt, MA 51935" "997 James Isle\nNorth Rebeccafurt, RI 13366" "1607 Joshua Camp Apt. 634\nConleymouth, CT 66479" ...
##  $ Account.Balance        : num [1:5000] 44653 29175 86142 1000 77919 ...
##  $ Deposits               : num [1:5000] 9156 5933 22583 299 13853 ...
##  $ Withdrawals            : num [1:5000] 9328 8672 16469 289 22699 ...
##  $ Transfers              : num [1:5000] 3648 6730 6033 110 12189 ...
##  $ International.Transfers: num [1:5000] 82.8 819.69 1526.33 3.51 2152.51 ...
##  $ Investments            : num [1:5000] 8729 4545 8251 196 9725 ...
##  $ Loan.Amount            : num [1:5000] 27011 31267 41261 5000 50000 ...
##  $ Loan.Purpose           : chr [1:5000] "Medical Expenses" "Auto Purchase" "Auto Purchase" "Small Business" ...
##  $ Employment.Status      : chr [1:5000] "Retired" "Retired" "Employed" "Retired" ...
##  $ Loan.Term..Months.     : num [1:5000] 36 36 12 60 24 24 24 48 48 12 ...
##  $ Interest.Rate          : num [1:5000] 0.1194 0.0808 0.1307 0.0673 0.106 ...
##  $ Loan.Status            : chr [1:5000] "pending" "approved" "pending" "approved" ...
##  $ Transaction.Description: chr [1:5000] "Electronics transaction of $706.18 at Sanders, Roberts and Hughes" "Transaction at Evans-Smith for $2250.03" "Purchase at Taylor-Gutierrez for $615.85 on 2024-02-21" "Travel transaction of $4852.39 at Jones-Russell" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Occupation = col_character(),
##   ..   `Risk Tolerance` = col_character(),
##   ..   `Investment Goals` = col_character(),
##   ..   `Income Level` = col_character(),
##   ..   Address = col_character(),
##   ..   `Account Balance` = col_character(),
##   ..   Deposits = col_character(),
##   ..   Withdrawals = col_character(),
##   ..   Transfers = col_character(),
##   ..   `International Transfers` = col_character(),
##   ..   Investments = col_character(),
##   ..   `Loan Amount` = col_character(),
##   ..   `Loan Purpose` = col_character(),
##   ..   `Employment Status` = col_character(),
##   ..   `Loan Term (Months)` = col_double(),
##   ..   `Interest Rate` = col_character(),
##   ..   `Loan Status` = col_character(),
##   ..   `Transaction Description` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3.2 Menghapus Kolom Tidak Digunakan

# Discard the columns that are not used in the rest of the analysis.
finance_data <- select(
  finance_data,
  -c(Occupation, Investment.Goals, Address, Transaction.Description)
)

head(finance_data)
## # A tibble: 6 × 15
##     Age Risk.Tolerance Income.Level Account.Balance Deposits Withdrawals
##   <dbl> <chr>                 <dbl>           <dbl>    <dbl>       <dbl>
## 1    40 High                 46045.          44653.    9156.       9328.
## 2    30 Low                  57170.          29175.    5933.       8672.
## 3    37 Low                  71761.          86142.   22583.      16469.
## 4    27 Medium              -25488.           1000      299.        289.
## 5    36 Low                 106778.          77919.   13853.      22699.
## 6    77 Low                  59157.          61521.   22053.      16970.
## # ℹ 9 more variables: Transfers <dbl>, International.Transfers <dbl>,
## #   Investments <dbl>, Loan.Amount <dbl>, Loan.Purpose <chr>,
## #   Employment.Status <chr>, Loan.Term..Months. <dbl>, Interest.Rate <dbl>,
## #   Loan.Status <chr>

4. Missing Value dan Duplikasi

# Count missing values per column.
colSums(is.na(finance_data))
##                     Age          Risk.Tolerance            Income.Level 
##                       0                       0                       0 
##         Account.Balance                Deposits             Withdrawals 
##                       0                       0                       0 
##               Transfers International.Transfers             Investments 
##                       0                       0                       0 
##             Loan.Amount            Loan.Purpose       Employment.Status 
##                       0                       0                       0 
##      Loan.Term..Months.           Interest.Rate             Loan.Status 
##                       0                       0                       0
# Count rows that are exact duplicates of an earlier row.
sum(duplicated(finance_data))
## [1] 0

5. Mengatasi Outlier dengan Metode IQR

# Numeric columns that are screened for outliers and plotted below.
numerical_columns <- c(
  "Age",
  "Income.Level",
  "Account.Balance",
  "Deposits",
  "Withdrawals",
  "Transfers",
  "International.Transfers",
  "Investments",
  "Loan.Amount",
  "Loan.Term..Months.",
  "Interest.Rate"
)

# Remove rows whose value in `column` lies outside the IQR fences.
#
# The fences are Q1 - multiplier * IQR and Q3 + multiplier * IQR, computed
# from the non-missing values of `column`. Rows with a missing value in
# `column` are dropped as well, because their comparison result is NA.
#
# Args:
#   data:       A data frame (or tibble).
#   column:     Name of a numeric column in `data`, as a single string.
#   multiplier: Width of the fences in IQR units (default 1.5).
#
# Returns:
#   `data` restricted to the rows inside the fences; columns are unchanged.
remove_outliers_iqr <- function(data, column, multiplier = 1.5) {
  stopifnot(
    is.data.frame(data),
    is.character(column), length(column) == 1L,
    column %in% names(data),
    is.numeric(data[[column]]),
    is.numeric(multiplier), length(multiplier) == 1L, multiplier >= 0
  )

  values <- data[[column]]
  # unname() strips the "25%"/"75%" labels so the bounds are plain scalars.
  quartiles <- unname(
    stats::quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
  )
  # Named iqr_width rather than IQR to avoid masking stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]

  lower_bound <- quartiles[1] - multiplier * iqr_width
  upper_bound <- quartiles[2] + multiplier * iqr_width

  # NA comparisons must count as "not kept"; a bare NA index would insert
  # all-NA rows instead of dropping them.
  keep <- !is.na(values) & values >= lower_bound & values <= upper_bound
  data[keep, , drop = FALSE]
}

# Apply the IQR filter column by column. Each pass works on the rows that
# survived the previous one, so the fences for later columns are computed
# on already-filtered data.
finance_data <- Reduce(remove_outliers_iqr, numerical_columns, finance_data)

# Rows and columns remaining after outlier removal.
dim(finance_data)
## [1] 4543   15

Visualisasi Boxplot

# Draw one boxplot per numeric column of the filtered data.
for (col in numerical_columns) {
  boxplot_chart <- ggplot(finance_data, aes(y = .data[[col]])) +
    geom_boxplot() +
    labs(title = paste("Boxplot of", col)) +
    theme_minimal()
  print(boxplot_chart)
}

6. Statistik Deskriptif

# Keep only the numeric columns for the descriptive and multivariate steps.
data_numeric <- select(finance_data, where(is.numeric))

# Summary statistics for each numeric variable.
describe(data_numeric)
##                         vars    n     mean       sd   median  trimmed      mad
## Age                        1 4543    51.75    19.56    52.00    51.82    25.20
## Income.Level               2 4543 69032.56 29050.63 68605.30 68926.58 29949.56
## Account.Balance            3 4543 58729.19 32853.32 57378.22 59886.26 47846.11
## Deposits                   4 4543 17551.08 12488.10 14623.09 16310.36 12424.93
## Withdrawals                5 4543 10361.58  7569.72  8511.19  9590.10  7601.88
## Transfers                  6 4543  8462.06  5873.72  7233.53  7912.89  6126.04
## International.Transfers    7 4543  1153.67  1051.45   817.95  1002.50   897.17
## Investments                8 4543  8317.43  5819.26  6984.33  7762.11  5775.76
## Loan.Amount                9 4543 37561.36 15201.27 47406.86 39682.80  3844.59
## Loan.Term..Months.        10 4543    36.24    17.06    36.00    36.29    17.79
## Interest.Rate             11 4543     0.11     0.02     0.11     0.11     0.02
##                               min       max     range  skew kurtosis     se
## Age                         18.00     85.00     67.00 -0.02    -1.20   0.29
## Income.Level            -10338.99 151827.63 162166.62  0.04    -0.33 431.01
## Account.Balance           1000.00 100000.00  99000.00 -0.07    -1.41 487.43
## Deposits                   109.34  49992.09  49882.75  0.74    -0.35 185.28
## Withdrawals                 50.50  29996.72  29946.22  0.75    -0.37 112.31
## Transfers                   51.99  24991.81  24939.82  0.71    -0.30  87.14
## International.Transfers      0.01   4378.21   4378.20  1.09     0.39  15.60
## Investments                 51.22  26656.02  26604.80  0.78    -0.06  86.34
## Loan.Amount               5000.00  50000.00  45000.00 -0.81    -0.85 225.53
## Loan.Term..Months.          12.00     60.00     48.00 -0.01    -1.31   0.25
## Interest.Rate                0.05      0.15      0.10 -0.30    -0.46   0.00

7. Visualisasi Distribusi

# Draw a 10-bin histogram for every numeric variable.
for (col in names(data_numeric)) {
  histogram_chart <- ggplot(data_numeric, aes(x = .data[[col]])) +
    geom_histogram(bins = 10) +
    labs(title = paste("Distribusi", col)) +
    theme_minimal()
  print(histogram_chart)
}

8. Analisis Korelasi

# Pairwise correlations between the numeric variables, using only rows
# without missing values.
corr_matrix <- cor(x = data_numeric, use = "complete.obs")

# Coloured correlation heatmap with the coefficient printed in each cell.
corrplot(
  corr_matrix,
  method = "color",
  addCoef.col = "black",
  tl.cex = 0.7
)

9. Uji Asumsi PCA dan FA

Bartlett Test

# Bartlett's test on the correlation matrix, using the number of rows in
# data_numeric as the sample size.
cortest.bartlett(corr_matrix, n = nrow(data_numeric))
## $chisq
## [1] 23414.17
## 
## $p.value
## [1] 0
## 
## $df
## [1] 55

KMO Test

# Kaiser-Meyer-Olkin factor adequacy, reported overall and per variable.
KMO(data_numeric)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_numeric)
## Overall MSA =  0.84
## MSA for each item = 
##                     Age            Income.Level         Account.Balance 
##                    0.44                    0.89                    0.79 
##                Deposits             Withdrawals               Transfers 
##                    0.90                    0.91                    0.87 
## International.Transfers             Investments             Loan.Amount 
##                    0.88                    0.93                    0.66 
##      Loan.Term..Months.           Interest.Rate 
##                    0.78                    0.63

10. Standardisasi Data

# Standardize every numeric column using scale() with its default settings.
Z <- scale(data_numeric)

11. Principal Component Analysis (PCA)

# Fit the PCA on the standardized matrix.
# NOTE(review): Z already comes from scale(), so center/scale. here look
# redundant; confirm before removing them.
pca_result <- prcomp(x = Z, center = TRUE, scale. = TRUE)

# Component variances (squared standard deviations) and their share of the
# total variance.
eigenvalues <- pca_result[["sdev"]]^2
variance_ratio <- prop.table(eigenvalues)

# One row per component: eigenvalue, proportion and cumulative proportion.
hasil_pca <- data.frame(
  Eigenvalue        = eigenvalues,
  Proporsi_Varians  = variance_ratio,
  Kumulatif_Varians = cumsum(variance_ratio)
)

hasil_pca
##    Eigenvalue Proporsi_Varians Kumulatif_Varians
## 1   4.4975315       0.40886650         0.4088665
## 2   1.5982227       0.14529297         0.5541595
## 3   1.0049825       0.09136204         0.6455215
## 4   0.9951936       0.09047215         0.7359937
## 5   0.7498466       0.06816787         0.8041615
## 6   0.5896182       0.05360165         0.8577632
## 7   0.4681944       0.04256313         0.9003263
## 8   0.3961348       0.03601225         0.9363386
## 9   0.3026249       0.02751136         0.9638499
## 10  0.2845687       0.02586988         0.9897198
## 11  0.1130822       0.01028020         1.0000000

Scree Plot

# Scree plot of the eigenvalues by component index.
plot(
  eigenvalues,
  main = "Scree Plot PCA",
  xlab = "Komponen",
  ylab = "Eigenvalue",
  type = "b"
)
# Dashed red reference line at eigenvalue = 1.
abline(h = 1, lty = 2, col = "red")

Kaiser Rule

# Kaiser rule: count the components whose eigenvalue exceeds 1.
sum(eigenvalues > 1)
## [1] 3

PCA dengan 3 Komponen

# Keep the scores of the components retained by the Kaiser rule
# (eigenvalue > 1) rather than a hard-coded count, so the selection always
# follows the eigenvalues computed above.
# drop = FALSE keeps a matrix even if only one component is retained.
Z_pca <- pca_result$x[, seq_len(sum(eigenvalues > 1)), drop = FALSE]
dim(Z_pca)
## [1] 4543    3

Loading Matrix

# Loadings (rotation matrix columns) of the components retained by the
# Kaiser rule (eigenvalue > 1) rather than a hard-coded count.
# drop = FALSE keeps a matrix even if only one component is retained.
loadings <- pca_result$rotation[, seq_len(sum(eigenvalues > 1)), drop = FALSE]
loadings
##                                  PC1         PC2           PC3
## Age                      0.001523359 -0.03987294 -0.7858987379
## Income.Level            -0.303060776  0.26729106  0.0151343930
## Account.Balance         -0.438201416 -0.14114665  0.0067029728
## Deposits                -0.375946623 -0.14600323  0.0186386574
## Withdrawals             -0.371991516 -0.13127608  0.0153284640
## Transfers               -0.386633675 -0.17339343 -0.0495076780
## International.Transfers -0.300469471 -0.14294658 -0.0720290764
## Investments             -0.346721700 -0.13990328  0.0458795034
## Loan.Amount             -0.220181636  0.62191137 -0.0139323728
## Loan.Term..Months.       0.010785766  0.05457335 -0.6095641574
## Interest.Rate           -0.166847497  0.63927222 -0.0003644493

12. Factor Analysis (FA)

# Parallel analysis to suggest how many factors to extract; fa = "fa"
# requests the factor solution only.
fa.parallel(data_numeric, fa = "fa")

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA
# Extract three factors with varimax rotation. No fm argument is given, so
# fa() uses its default extraction method.
fa_result <- fa(
  data_numeric,
  nfactors = 3,
  rotate = "varimax"
)

# Rotated factor loadings.
fa_result[["loadings"]]
## 
## Loadings:
##                         MR1    MR2    MR3   
## Age                                         
## Income.Level             0.471  0.412       
## Account.Balance          0.985  0.139       
## Deposits                 0.782  0.105       
## Withdrawals              0.763  0.118       
## Transfers                0.740  0.156  0.524
## International.Transfers  0.498  0.146  0.458
## Investments              0.691  0.103       
## Loan.Amount              0.159  0.981       
## Loan.Term..Months.                          
## Interest.Rate                   0.675       
## 
##                  MR1   MR2   MR3
## SS loadings    3.694 1.689 0.503
## Proportion Var 0.336 0.154 0.046
## Cumulative Var 0.336 0.489 0.535
# Communality of each variable under the fitted factor solution.
fa_result$communality
##                     Age            Income.Level         Account.Balance 
##             0.001785818             0.392762859             0.991571285 
##                Deposits             Withdrawals               Transfers 
##             0.621911424             0.595606608             0.847107268 
## International.Transfers             Investments             Loan.Amount 
##             0.478586258             0.489002430             0.997496863 
##      Loan.Term..Months.           Interest.Rate 
##             0.001897632             0.467923452

13. Perbandingan PCA dan FA

# Report the number of PCA components actually kept in Z_pca instead of a
# hard-coded literal, so the comparison always reflects the analysis.
cat("Jumlah komponen PCA:", ncol(Z_pca), "\n")
## Jumlah komponen PCA: 3
# Report the number of factors in the fitted loading matrix instead of a
# hard-coded literal, so the comparison always reflects the analysis.
cat("Jumlah faktor FA:", ncol(fa_result$loadings))
## Jumlah faktor FA: 3

14. Kesimpulan

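Hasil analisis menunjukkan bahwa, berdasarkan Kaiser Rule (eigenvalue > 1), tiga komponen utama dipertahankan; ketiganya menjelaskan sekitar 64,55% dari total varians data. Analisis faktor dengan rotasi varimax memperjelas struktur laten variabel keuangan dalam dataset: tiga faktor yang diekstraksi menjelaskan sekitar 53,5% varians, sedangkan Age dan Loan.Term..Months. memiliki komunalitas mendekati nol sehingga hampir tidak terwakili oleh faktor-faktor tersebut.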