library(readr)
library(dplyr)
library(ggplot2)
library(corrplot)
library(psych)
library(FactoMineR)
library(factoextra)
# Load the raw CSV into a tibble; readr guesses the column types.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that exact file exists. Consider a project-relative path.
finance_data <- read_csv("C:/Users/diyan/Downloads/5k.csv")
## Rows: 5000 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): Occupation, Risk Tolerance, Investment Goals, Income Level, Addres...
## dbl (2): Age, Loan Term (Months)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardise the column names into syntactically valid R names (spaces and
# symbols are replaced by dots). unique = TRUE guards against two different
# headers collapsing into the same name after the replacement.
names(finance_data) <- make.names(names(finance_data), unique = TRUE)
# Preview the first rows to confirm the renamed columns.
head(finance_data)
## # A tibble: 6 × 19
## Age Occupation Risk.Tolerance Investment.Goals Income.Level Address
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 40 Lawyer High Wealth Preservation $46044.94 "7168 Moody …
## 2 30 Teacher Low Wealth Preservation $57169.50 "50001 Hecto…
## 3 37 Teacher Low Speculation $71760.86 "997 James I…
## 4 27 Student Medium Speculation $-25488.15 "1607 Joshua…
## 5 36 Engineer Low Income Generation $106777.95 "96690 Campb…
## 6 77 Doctor Low Income Generation $59157.22 "23020 Jacks…
## # ℹ 13 more variables: Account.Balance <chr>, Deposits <chr>,
## # Withdrawals <chr>, Transfers <chr>, International.Transfers <chr>,
## # Investments <chr>, Loan.Amount <chr>, Loan.Purpose <chr>,
## # Employment.Status <chr>, Loan.Term..Months. <dbl>, Interest.Rate <chr>,
## # Loan.Status <chr>, Transaction.Description <chr>
# Inspect the column types before any cleaning is applied.
str(finance_data)
## spc_tbl_ [5,000 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:5000] 40 30 37 27 36 77 70 45 56 61 ...
## $ Occupation : chr [1:5000] "Lawyer" "Teacher" "Teacher" "Student" ...
## $ Risk.Tolerance : chr [1:5000] "High" "Low" "Low" "Medium" ...
## $ Investment.Goals : chr [1:5000] "Wealth Preservation" "Wealth Preservation" "Speculation" "Speculation" ...
## $ Income.Level : chr [1:5000] "$46044.94" "$57169.50" "$71760.86" "$-25488.15" ...
## $ Address : chr [1:5000] "7168 Moody Meadow\nHernandezshire, PW 06016" "50001 Hector Square\nWest Luisfurt, MA 51935" "997 James Isle\nNorth Rebeccafurt, RI 13366" "1607 Joshua Camp Apt. 634\nConleymouth, CT 66479" ...
## $ Account.Balance : chr [1:5000] "$44653.26" "$29175.47" "$86141.59" "$1000.00" ...
## $ Deposits : chr [1:5000] "$9156.01" "$5933.22" "$22583.11" "$299.47" ...
## $ Withdrawals : chr [1:5000] "$9327.70" "$8671.60" "$16468.58" "$289.09" ...
## $ Transfers : chr [1:5000] "$3647.92" "$6729.86" "$6032.53" "$109.83" ...
## $ International.Transfers: chr [1:5000] "$82.80" "$819.69" "$1526.33" "$3.51" ...
## $ Investments : chr [1:5000] "$8729.30" "$4545.18" "$8251.45" "$195.73" ...
## $ Loan.Amount : chr [1:5000] "$27010.93" "$31266.97" "$41260.58" "$5000.00" ...
## $ Loan.Purpose : chr [1:5000] "Medical Expenses" "Auto Purchase" "Auto Purchase" "Small Business" ...
## $ Employment.Status : chr [1:5000] "Retired" "Retired" "Employed" "Retired" ...
## $ Loan.Term..Months. : num [1:5000] 36 36 12 60 24 24 24 48 48 12 ...
## $ Interest.Rate : chr [1:5000] "11.94%" "8.08%" "13.07%" "6.73%" ...
## $ Loan.Status : chr [1:5000] "pending" "approved" "pending" "approved" ...
## $ Transaction.Description: chr [1:5000] "Electronics transaction of $706.18 at Sanders, Roberts and Hughes" "Transaction at Evans-Smith for $2250.03" "Purchase at Taylor-Gutierrez for $615.85 on 2024-02-21" "Travel transaction of $4852.39 at Jones-Russell" ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Occupation = col_character(),
## .. `Risk Tolerance` = col_character(),
## .. `Investment Goals` = col_character(),
## .. `Income Level` = col_character(),
## .. Address = col_character(),
## .. `Account Balance` = col_character(),
## .. Deposits = col_character(),
## .. Withdrawals = col_character(),
## .. Transfers = col_character(),
## .. `International Transfers` = col_character(),
## .. Investments = col_character(),
## .. `Loan Amount` = col_character(),
## .. `Loan Purpose` = col_character(),
## .. `Employment Status` = col_character(),
## .. `Loan Term (Months)` = col_double(),
## .. `Interest Rate` = col_character(),
## .. `Loan Status` = col_character(),
## .. `Transaction Description` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Columns stored as text such as "$46044.94" that must become numeric.
currency_columns <- c(
  "Income.Level", "Account.Balance", "Deposits", "Withdrawals",
  "Transfers", "International.Transfers", "Investments", "Loan.Amount"
)

# Remove every dollar sign and thousands separator in a single pass, then
# convert the remaining text to numbers.
for (col in currency_columns) {
  finance_data[[col]] <- as.numeric(gsub("[$,]", "", finance_data[[col]]))
}

# Interest rates arrive as text such as "11.94%"; drop the percent sign and
# express the rate as a proportion (0.1194).
finance_data$Interest.Rate <- as.numeric(
  gsub("%", "", finance_data$Interest.Rate, fixed = TRUE)
) / 100

# Confirm the converted column types.
str(finance_data)
## spc_tbl_ [5,000 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:5000] 40 30 37 27 36 77 70 45 56 61 ...
## $ Occupation : chr [1:5000] "Lawyer" "Teacher" "Teacher" "Student" ...
## $ Risk.Tolerance : chr [1:5000] "High" "Low" "Low" "Medium" ...
## $ Investment.Goals : chr [1:5000] "Wealth Preservation" "Wealth Preservation" "Speculation" "Speculation" ...
## $ Income.Level : num [1:5000] 46045 57170 71761 -25488 106778 ...
## $ Address : chr [1:5000] "7168 Moody Meadow\nHernandezshire, PW 06016" "50001 Hector Square\nWest Luisfurt, MA 51935" "997 James Isle\nNorth Rebeccafurt, RI 13366" "1607 Joshua Camp Apt. 634\nConleymouth, CT 66479" ...
## $ Account.Balance : num [1:5000] 44653 29175 86142 1000 77919 ...
## $ Deposits : num [1:5000] 9156 5933 22583 299 13853 ...
## $ Withdrawals : num [1:5000] 9328 8672 16469 289 22699 ...
## $ Transfers : num [1:5000] 3648 6730 6033 110 12189 ...
## $ International.Transfers: num [1:5000] 82.8 819.69 1526.33 3.51 2152.51 ...
## $ Investments : num [1:5000] 8729 4545 8251 196 9725 ...
## $ Loan.Amount : num [1:5000] 27011 31267 41261 5000 50000 ...
## $ Loan.Purpose : chr [1:5000] "Medical Expenses" "Auto Purchase" "Auto Purchase" "Small Business" ...
## $ Employment.Status : chr [1:5000] "Retired" "Retired" "Employed" "Retired" ...
## $ Loan.Term..Months. : num [1:5000] 36 36 12 60 24 24 24 48 48 12 ...
## $ Interest.Rate : num [1:5000] 0.1194 0.0808 0.1307 0.0673 0.106 ...
## $ Loan.Status : chr [1:5000] "pending" "approved" "pending" "approved" ...
## $ Transaction.Description: chr [1:5000] "Electronics transaction of $706.18 at Sanders, Roberts and Hughes" "Transaction at Evans-Smith for $2250.03" "Purchase at Taylor-Gutierrez for $615.85 on 2024-02-21" "Travel transaction of $4852.39 at Jones-Russell" ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Occupation = col_character(),
## .. `Risk Tolerance` = col_character(),
## .. `Investment Goals` = col_character(),
## .. `Income Level` = col_character(),
## .. Address = col_character(),
## .. `Account Balance` = col_character(),
## .. Deposits = col_character(),
## .. Withdrawals = col_character(),
## .. Transfers = col_character(),
## .. `International Transfers` = col_character(),
## .. Investments = col_character(),
## .. `Loan Amount` = col_character(),
## .. `Loan Purpose` = col_character(),
## .. `Employment Status` = col_character(),
## .. `Loan Term (Months)` = col_double(),
## .. `Interest Rate` = col_character(),
## .. `Loan Status` = col_character(),
## .. `Transaction Description` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Drop four descriptive character columns (occupation, investment goal,
# address and transaction text) from the working data.
finance_data <- select(
  finance_data,
  -c(Occupation, Investment.Goals, Address, Transaction.Description)
)
# Preview the reduced table.
head(finance_data)
## # A tibble: 6 × 15
## Age Risk.Tolerance Income.Level Account.Balance Deposits Withdrawals
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 40 High 46045. 44653. 9156. 9328.
## 2 30 Low 57170. 29175. 5933. 8672.
## 3 37 Low 71761. 86142. 22583. 16469.
## 4 27 Medium -25488. 1000 299. 289.
## 5 36 Low 106778. 77919. 13853. 22699.
## 6 77 Low 59157. 61521. 22053. 16970.
## # ℹ 9 more variables: Transfers <dbl>, International.Transfers <dbl>,
## # Investments <dbl>, Loan.Amount <dbl>, Loan.Purpose <chr>,
## # Employment.Status <chr>, Loan.Term..Months. <dbl>, Interest.Rate <dbl>,
## # Loan.Status <chr>
# Count the missing values in each column.
colSums(is.na(finance_data))
## Age Risk.Tolerance Income.Level
## 0 0 0
## Account.Balance Deposits Withdrawals
## 0 0 0
## Transfers International.Transfers Investments
## 0 0 0
## Loan.Amount Loan.Purpose Employment.Status
## 0 0 0
## Loan.Term..Months. Interest.Rate Loan.Status
## 0 0 0
# Count the rows that are exact duplicates of an earlier row.
sum(duplicated(finance_data))
## [1] 0
# Names of the numeric columns that are screened for outliers and drawn as
# boxplots further down in this script.
numerical_columns <- c("Age", "Income.Level", "Account.Balance", "Deposits",
"Withdrawals", "Transfers", "International.Transfers",
"Investments", "Loan.Amount",
"Loan.Term..Months.", "Interest.Rate")
# Drop the rows whose value in `column` lies outside the Tukey fences
# [Q1 - k * IQR, Q3 + k * IQR].
#
# Args:
#   data:   data frame or tibble to filter.
#   column: name of a numeric column of `data`, given as a single string.
#   k:      non-negative fence multiplier; the default 1.5 reproduces the
#           usual boxplot rule.
#
# Returns:
#   `data` restricted to the rows inside the fences (both bounds inclusive).
#   Rows where `column` is NA are dropped as well, because filter() keeps
#   only rows for which the condition is TRUE.
remove_outliers_iqr <- function(data, column, k = 1.5) {
  # Fail early with a clear message instead of a cryptic downstream error.
  stopifnot(
    is.character(column), length(column) == 1L,
    column %in% names(data),
    is.numeric(data[[column]]),
    is.numeric(k), length(k) == 1L, k >= 0
  )

  quartiles <- quantile(
    data[[column]],
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  # Called `spread` rather than `IQR` so stats::IQR() is not masked locally.
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * spread
  upper_bound <- quartiles[2] + k * spread

  data %>%
    filter(.data[[column]] >= lower_bound & .data[[column]] <= upper_bound)
}
# Apply the IQR filter to one column after another. Each step receives the
# table already trimmed by the previous columns, so the fences of later
# columns are computed on the reduced data.
finance_data <- Reduce(remove_outliers_iqr, numerical_columns, finance_data)
# Rows and columns remaining after outlier removal.
dim(finance_data)
## [1] 4543 15
# Draw one boxplot per numeric column of the trimmed data.
for (col in numerical_columns) {
  box_plot <- ggplot(finance_data, aes(y = .data[[col]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", col)) +
    theme_minimal()
  print(box_plot)
}

# Keep only the numeric columns for the multivariate analysis.
data_numeric <- select(finance_data, where(is.numeric))

# Descriptive statistics for every numeric variable.
describe(data_numeric)
## vars n mean sd median trimmed mad
## Age 1 4543 51.75 19.56 52.00 51.82 25.20
## Income.Level 2 4543 69032.56 29050.63 68605.30 68926.58 29949.56
## Account.Balance 3 4543 58729.19 32853.32 57378.22 59886.26 47846.11
## Deposits 4 4543 17551.08 12488.10 14623.09 16310.36 12424.93
## Withdrawals 5 4543 10361.58 7569.72 8511.19 9590.10 7601.88
## Transfers 6 4543 8462.06 5873.72 7233.53 7912.89 6126.04
## International.Transfers 7 4543 1153.67 1051.45 817.95 1002.50 897.17
## Investments 8 4543 8317.43 5819.26 6984.33 7762.11 5775.76
## Loan.Amount 9 4543 37561.36 15201.27 47406.86 39682.80 3844.59
## Loan.Term..Months. 10 4543 36.24 17.06 36.00 36.29 17.79
## Interest.Rate 11 4543 0.11 0.02 0.11 0.11 0.02
## min max range skew kurtosis se
## Age 18.00 85.00 67.00 -0.02 -1.20 0.29
## Income.Level -10338.99 151827.63 162166.62 0.04 -0.33 431.01
## Account.Balance 1000.00 100000.00 99000.00 -0.07 -1.41 487.43
## Deposits 109.34 49992.09 49882.75 0.74 -0.35 185.28
## Withdrawals 50.50 29996.72 29946.22 0.75 -0.37 112.31
## Transfers 51.99 24991.81 24939.82 0.71 -0.30 87.14
## International.Transfers 0.01 4378.21 4378.20 1.09 0.39 15.60
## Investments 51.22 26656.02 26604.80 0.78 -0.06 86.34
## Loan.Amount 5000.00 50000.00 45000.00 -0.81 -0.85 225.53
## Loan.Term..Months. 12.00 60.00 48.00 -0.01 -1.31 0.25
## Interest.Rate 0.05 0.15 0.10 -0.30 -0.46 0.00
# Draw a 10-bin histogram for every numeric variable.
for (col in names(data_numeric)) {
  histogram <- ggplot(data_numeric, aes(x = .data[[col]])) +
    geom_histogram(bins = 10) +
    ggtitle(paste("Distribusi", col)) +
    theme_minimal()
  print(histogram)
}

# Correlation matrix of the numeric variables, shown as a coloured grid with
# the coefficients printed in each cell.
corr_matrix <- cor(data_numeric, use = "complete.obs")
corrplot(
  corr_matrix,
  method = "color",
  addCoef.col = "black",
  tl.cex = 0.7
)

# Bartlett's test on the correlation matrix, using the number of rows as the
# sample size.
cortest.bartlett(corr_matrix, n = nrow(data_numeric))
## $chisq
## [1] 23414.17
##
## $p.value
## [1] 0
##
## $df
## [1] 55
# Kaiser-Meyer-Olkin factor adequacy: overall MSA and MSA for each variable.
KMO(data_numeric)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_numeric)
## Overall MSA = 0.84
## MSA for each item =
## Age Income.Level Account.Balance
## 0.44 0.89 0.79
## Deposits Withdrawals Transfers
## 0.90 0.91 0.87
## International.Transfers Investments Loan.Amount
## 0.88 0.93 0.66
## Loan.Term..Months. Interest.Rate
## 0.78 0.63
# Standardise every numeric variable (centre and divide by its standard
# deviation) so that all variables contribute on the same scale.
Z <- scale(data_numeric)

# Z is already standardised, so prcomp() is told not to centre or scale it a
# second time. The original means and standard deviations stay available in
# the "scaled:center" and "scaled:scale" attributes of Z.
pca_result <- prcomp(Z, center = FALSE, scale. = FALSE)

# Eigenvalues are the variances of the principal components.
eigenvalues <- pca_result$sdev^2
variance_ratio <- eigenvalues / sum(eigenvalues)

# Eigenvalue, proportion of variance and cumulative variance per component.
hasil_pca <- data.frame(
  Eigenvalue = eigenvalues,
  Proporsi_Varians = variance_ratio,
  Kumulatif_Varians = cumsum(variance_ratio)
)
hasil_pca
## Eigenvalue Proporsi_Varians Kumulatif_Varians
## 1 4.4975315 0.40886650 0.4088665
## 2 1.5982227 0.14529297 0.5541595
## 3 1.0049825 0.09136204 0.6455215
## 4 0.9951936 0.09047215 0.7359937
## 5 0.7498466 0.06816787 0.8041615
## 6 0.5896182 0.05360165 0.8577632
## 7 0.4681944 0.04256313 0.9003263
## 8 0.3961348 0.03601225 0.9363386
## 9 0.3026249 0.02751136 0.9638499
## 10 0.2845687 0.02586988 0.9897198
## 11 0.1130822 0.01028020 1.0000000
# Scree plot; the dashed red line marks the Kaiser threshold (eigenvalue = 1).
plot(eigenvalues, type = "b",
     xlab = "Komponen",
     ylab = "Eigenvalue",
     main = "Scree Plot PCA")
abline(h = 1, col = "red", lty = 2)

# Kaiser rule: retain the components whose eigenvalue exceeds 1. The count is
# stored and reused below so the retained scores and loadings always agree
# with it instead of relying on a hand-typed number.
n_components <- sum(eigenvalues > 1)
n_components
## [1] 3
retained <- seq_len(n_components)

# Component scores of the retained components; drop = FALSE keeps a matrix
# even when only one component is retained.
Z_pca <- pca_result$x[, retained, drop = FALSE]
dim(Z_pca)
## [1] 4543 3

# Loadings (rotation matrix columns) of the retained components.
loadings <- pca_result$rotation[, retained, drop = FALSE]
loadings
## PC1 PC2 PC3
## Age 0.001523359 -0.03987294 -0.7858987379
## Income.Level -0.303060776 0.26729106 0.0151343930
## Account.Balance -0.438201416 -0.14114665 0.0067029728
## Deposits -0.375946623 -0.14600323 0.0186386574
## Withdrawals -0.371991516 -0.13127608 0.0153284640
## Transfers -0.386633675 -0.17339343 -0.0495076780
## International.Transfers -0.300469471 -0.14294658 -0.0720290764
## Investments -0.346721700 -0.13990328 0.0458795034
## Loan.Amount -0.220181636 0.62191137 -0.0139323728
## Loan.Term..Months. 0.010785766 0.05457335 -0.6095641574
## Interest.Rate -0.166847497 0.63927222 -0.0003644493
# Parallel analysis to suggest how many factors to extract; fa = "fa"
# requests the factor solution only.
fa.parallel(data_numeric, fa = "fa")
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
# Fit a three-factor model with varimax rotation, then show its loadings.
fa_result <- fa(data_numeric, nfactors = 3, rotate = "varimax")
fa_result$loadings
##
## Loadings:
## MR1 MR2 MR3
## Age
## Income.Level 0.471 0.412
## Account.Balance 0.985 0.139
## Deposits 0.782 0.105
## Withdrawals 0.763 0.118
## Transfers 0.740 0.156 0.524
## International.Transfers 0.498 0.146 0.458
## Investments 0.691 0.103
## Loan.Amount 0.159 0.981
## Loan.Term..Months.
## Interest.Rate 0.675
##
## MR1 MR2 MR3
## SS loadings 3.694 1.689 0.503
## Proportion Var 0.336 0.154 0.046
## Cumulative Var 0.336 0.489 0.535
# Communality of each variable under the fitted factor model.
fa_result$communality
## Age Income.Level Account.Balance
## 0.001785818 0.392762859 0.991571285
## Deposits Withdrawals Transfers
## 0.621911424 0.595606608 0.847107268
## International.Transfers Investments Loan.Amount
## 0.478586258 0.489002430 0.997496863
## Loan.Term..Months. Interest.Rate
## 0.001897632 0.467923452
# Report the number of retained PCA components (Kaiser rule: eigenvalue > 1),
# computed from the eigenvalues rather than typed in by hand.
cat("Jumlah komponen PCA:", sum(eigenvalues > 1), "\n")
## Jumlah komponen PCA: 3
# Report the number of extracted factors, read from the fitted model's
# loading matrix (one column per factor).
cat("Jumlah faktor FA:", ncol(fa_result$loadings))
## Jumlah faktor FA: 3
# Hasil analisis menunjukkan bahwa, berdasarkan Kaiser Rule (eigenvalue > 1),
# tiga komponen utama dipertahankan dan bersama-sama menjelaskan sekitar 64,6%
# variasi data. Analisis faktor dengan rotasi varimax memperjelas struktur
# laten variabel keuangan dalam dataset.