library(knitr)
library(rmarkdown)
library(prettydoc)
library(equatiomatic)
library(stats)
Membaca file csv ke dalam R
data <- read.csv("Level risiko investasi.csv", na.strings = "")
Menampilkan beberapa baris awal untuk memahami struktur data
head(data)
## Country X1 X2 X3 X4 X5 X6 X7 X8
## 1 SE 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168 0.3526 185.64097
## 2 SG 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600 0.2912 94.00211
## 3 SI 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290 1.9299 72.30708
## 4 SK 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976 1.2325 111.78982
## 5 SM 11.9000 49356.262 60.15464 0.89594 0.5865 1.75420 -1.1342 88.60514
## 6 SV NA 3989.191 65.55750 0.39400 0.5042 2.44734 -0.1248 88.88685
## X9 X10 X11 X12 X13 X14
## 1 64.14972 537.609866 0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210 1.3095 26.76784 47.25374 3.0
## 3 16.23838 52.761781 3.0176 19.90742 25.76882 5.0
## 4 33.35258 102.567122 2.5300 22.83084 20.95780 7.0
## 5 -145.43800 1.490827 63.5000 17.79208 23.21144 7.3
## 6 27.33332 24.638720 1.5706 16.78238 14.52982 9.0
Memeriksa jumlah nilai yang hilang
sum(is.na(data))
## Error in render(x, visible = TRUE, envir = parent.frame()): unused arguments (visible = TRUE, envir = parent.frame())
Menghapus baris dengan nilai yang hilang
data_clean <- na.omit(data)
Memastikan setiap kolom memiliki tipe data yang sesuai
str(data)
## 'data.frame': 17 obs. of 15 variables:
## $ Country: chr "SE" "SG" "SI" "SK" ...
## $ X1 : num 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num 60338 62433 28684 21043 49356 ...
## $ X3 : num 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num 28 47.3 25.8 21 23.2 ...
## $ X14 : num 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
Mengubah tipe data
data$Country <- as.factor(data$Country)
Meghapus kolom yang tidak diperlukan (menghapus kolom ’Country)
data_clean <- data_clean[, -c(1)]
menyimpan data yang sudah bersih dari missing data
write.csv(data_clean, "Level Risiko Investasi Clean.csv", row.names = FALSE)
Menampilkan data yang sudah bersih dari missing data
show(data_clean)
## X1 X2 X3 X4 X5 X6 X7 X8
## 1 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168 0.3526 185.64097
## 2 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600 0.2912 94.00211
## 3 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290 1.9299 72.30708
## 4 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976 1.2325 111.78982
## 5 11.9000 49356.262 60.15464 0.89594 0.5865 1.75420 -1.1342 88.60514
## 7 19.8000 7450.552 33.22256 0.34500 0.3153 3.44058 1.2787 100.19298
## 8 12.9000 3616.865 85.26668 5.55600 1.1173 1.60820 -1.5047 134.47988
## 11 22.0000 3955.070 103.90710 19.17300 -0.3906 0.34000 1.8906 72.25639
## 16 12.0977 3886.516 34.52492 2.79600 0.8506 6.94570 5.2762 86.56201
## X9 X10 X11 X12 X13 X14
## 1 64.14972 537.609866 0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210 1.3095 26.76784 47.25374 3.0
## 3 16.23838 52.761781 3.0176 19.90742 25.76882 5.0
## 4 33.35258 102.567122 2.5300 22.83084 20.95780 7.0
## 5 -145.43800 1.490827 63.5000 17.79208 23.21144 7.3
## 7 -42.56340 501.644054 3.2000 23.05990 32.47950 2.0
## 8 64.46288 39.218118 13.6000 18.80654 8.88180 17.0
## 11 -5.46582 155.581868 49.0000 17.79388 16.04966 9.5
## 16 7.39622 351.683014 1.6900 23.54764 25.80812 2.5
##Menghapus Outlier
#membaca file csv ke dalam R
data <- read.csv("Level Risiko Investasi Clean.csv", na.strings = "")
#menampilkan baris awal data untuk memahami struktur awal data
head(data_clean)
## X1 X2 X3 X4 X5 X6 X7 X8
## 1 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168 0.3526 185.64097
## 2 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600 0.2912 94.00211
## 3 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290 1.9299 72.30708
## 4 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976 1.2325 111.78982
## 5 11.9000 49356.262 60.15464 0.89594 0.5865 1.75420 -1.1342 88.60514
## 7 19.8000 7450.552 33.22256 0.34500 0.3153 3.44058 1.2787 100.19298
## X9 X10 X11 X12 X13 X14
## 1 64.14972 537.609866 0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210 1.3095 26.76784 47.25374 3.0
## 3 16.23838 52.761781 3.0176 19.90742 25.76882 5.0
## 4 33.35258 102.567122 2.5300 22.83084 20.95780 7.0
## 5 -145.43800 1.490827 63.5000 17.79208 23.21144 7.3
## 7 -42.56340 501.644054 3.2000 23.05990 32.47950 2.0
#mengidentifikasi outlier menggunakan metode interquartile range (IQR).
#menghapus outlier berdasarkan IQR:
remove_outliers <- function(x) {
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
x[x < lower_bound | x > upper_bound] <- NA
return(x)
}
data_cleaned <- data
for (col in names(data_clean)[-1]) {
data_clean[[col]] <- remove_outliers(data_clean[[col]])
}
#pada funsi di atas, kolom 'Country' telah diabaikan
#setelah mengganti outlier langkah selanjutnya adalah menghapus baris yang memiliki nilai 'NA'
data_cleaned <- na.omit(data_cleaned)
#menyimpan kembali data setela membersihkan outlier, file disimpan dalam bentuk csv.
write.csv(data_cleaned, "Level Risiko InvestasiCleaned.csv", row.names = FALSE)
#menampilkan data yang telah bersih dari oulier dan data missing
show(data_cleaned)
## X1 X2 X3 X4 X5 X6 X7 X8
## 1 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168 0.3526 185.64097
## 2 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600 0.2912 94.00211
## 3 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290 1.9299 72.30708
## 4 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976 1.2325 111.78982
## 5 11.9000 49356.262 60.15464 0.89594 0.5865 1.75420 -1.1342 88.60514
## 6 19.8000 7450.552 33.22256 0.34500 0.3153 3.44058 1.2787 100.19298
## 7 12.9000 3616.865 85.26668 5.55600 1.1173 1.60820 -1.5047 134.47988
## 8 22.0000 3955.070 103.90710 19.17300 -0.3906 0.34000 1.8906 72.25639
## 9 12.0977 3886.516 34.52492 2.79600 0.8506 6.94570 5.2762 86.56201
## X9 X10 X11 X12 X13 X14
## 1 64.14972 537.609866 0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210 1.3095 26.76784 47.25374 3.0
## 3 16.23838 52.761781 3.0176 19.90742 25.76882 5.0
## 4 33.35258 102.567122 2.5300 22.83084 20.95780 7.0
## 5 -145.43800 1.490827 63.5000 17.79208 23.21144 7.3
## 6 -42.56340 501.644054 3.2000 23.05990 32.47950 2.0
## 7 64.46288 39.218118 13.6000 18.80654 8.88180 17.0
## 8 -5.46582 155.581868 49.0000 17.79388 16.04966 9.5
## 9 7.39622 351.683014 1.6900 23.54764 25.80812 2.5
#memeriksa jumlah data missing
sum(is.na(data_cleaned))
## Error in render(x, visible = TRUE, envir = parent.frame()): unused arguments (visible = TRUE, envir = parent.frame())
Y <- data$X11
A <- factor(data$X1)
B <- factor(data$X2)
C <- factor(data$X3)
model <- aov(Y ~ A * B * C)
summary(model)
## Error in render(x, visible = TRUE, envir = parent.frame()): unused arguments (visible = TRUE, envir = parent.frame())
data3 <- read.csv("Level Risiko Investasi.csv")
str(data3)
## 'data.frame': 17 obs. of 15 variables:
## $ Country: chr "SE" "SG" "SI" "SK" ...
## $ X1 : num 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num 60338 62433 28684 21043 49356 ...
## $ X3 : num 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num 28 47.3 25.8 21 23.2 ...
## $ X14 : num 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
Membersihkan data dengan mengganti nilai yang hilang menggunakan mean, median, atau modus
data_clean <- data3
for (i in 2:ncol(data_clean)) {
if (is.numeric(data_clean[[i]])) {
mean_value <- mean(data_clean[[i]], na.rm = TRUE)
data_clean[[i]][is.na(data_clean[[i]])] <- mean_value
}
}
for (i in 2:ncol(data_clean)) {
if (is.numeric(data_clean[[i]])) {
median_value <- median(data_clean[[i]], na.rm = TRUE)
data_clean[[i]][is.na(data_clean[[i]])] <- median_value
}
}
# Fungsi untuk menghitung modus
get_mode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
for (i in 2:ncol(data_clean)) {
if (is.numeric(data_clean[[i]])) {
mode_value <- get_mode(data_clean[[i]])
data_clean[[i]][is.na(data_clean[[i]])] <- mode_value
}
}
Mempersiapkan data dan menghapus kolom negara
library(e1071)
target <- data_clean$X1
features <- data_clean[, -1]
Membagi data menjadi set pelatihan dan pengujian lalu melatih model SVM
set.seed(123)
train_indices <- sample(1:nrow(data_clean), 0.8 * nrow(data_clean))
train_data <- data_clean[train_indices, ]
test_data <- data_clean[-train_indices, ]
svm_model <- svm(X1 ~ ., data = train_data)
print(train_data)
## Country X1 X2 X3 X4 X5 X6 X7
## 15 UZ 18.40000 1872.6699 30.04996 12.29840 1.6065 5.84000 3.0735
## 3 SI 18.28570 28684.1682 103.06040 0.84352 0.0746 3.55290 1.9299
## 14 UY 17.04000 15968.2306 73.01010 8.00348 0.3592 0.82090 -0.7169
## 16 VN 12.09770 3886.5162 34.52492 2.79600 0.8506 6.94570 5.2762
## 10 TW 14.14000 31854.2815 48.51016 0.72360 0.1015 2.53870 2.7686
## 2 SG 16.80560 62432.9952 409.69700 0.10510 0.9068 2.77600 0.2912
## 6 SV 17.41934 3989.1913 65.55750 0.39400 0.5042 2.44734 -0.1248
## 5 SM 11.90000 49356.2618 60.15464 0.89594 0.5865 1.75420 -1.1342
## 4 SK 19.67150 21042.7221 102.73060 1.17400 0.0734 3.21976 1.2325
## 11 UA 22.00000 3955.0704 103.90710 19.17300 -0.3906 0.34000 1.8906
## 8 TN 12.90000 3616.8650 85.26668 5.55600 1.1173 1.60820 -1.5047
## 1 SE 23.20000 60338.0204 175.42230 1.62000 0.6755 2.47168 0.3526
## 12 UG 21.56890 786.8776 42.26784 4.29470 3.6551 5.73874 0.4207
## X8 X9 X10 X11 X12 X13 X14
## 15 96.53965 -45.16010 57.707193 2.10000 31.60162 29.24286 6.000000
## 3 72.30708 16.23838 52.761781 3.01760 19.90742 25.76882 5.000000
## 14 49.05568 -16.23150 53.628838 11.25795 16.44666 17.57796 10.300000
## 16 86.56201 7.39622 351.683014 1.69000 23.54764 25.80812 2.500000
## 10 71.08310 -189.14000 668.122597 11.25795 22.02936 33.80118 3.700000
## 2 94.00211 -200.98100 339.988210 1.30950 26.76784 47.25374 3.000000
## 6 88.88685 27.33332 24.638720 1.57060 16.78238 14.52982 9.000000
## 5 88.60514 -145.43800 1.490827 63.50000 17.79208 23.21144 7.300000
## 4 111.78982 33.35258 102.567122 2.53000 22.83084 20.95780 7.000000
## 11 72.25639 -5.46582 155.581868 49.00000 17.79388 16.04966 9.500000
## 8 134.47988 64.46288 39.218118 13.60000 18.80654 8.88180 17.000000
## 1 185.64097 64.14972 537.609866 0.50000 25.11320 27.95256 8.600000
## 12 69.45587 21.88058 33.538172 3.33570 24.85996 19.47826 8.963413
Menampilkan model SVM
show(svm_model)
##
## Call:
## svm(formula = X1 ~ ., data = train_data)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.03846154
## epsilon: 0.1
##
##
## Number of Support Vectors: 11
set.seed(123)
train_indices <- sample(1:nrow(data_cleaned), 0.8 * nrow(data_cleaned))
train_data <- data_clean[train_indices, ]
test_data <- data_clean[-train_indices,]
train_data <- train_data[, -which(names(train_data) == "Country")]
test_data <- test_data[, -which(names(test_data) == "Country")]
print(train_data)
## X1 X2 X3 X4 X5 X6 X7 X8
## 3 18.28570 28684.168 103.06040 0.84352 0.0746 3.55290 1.9299 72.30708
## 6 17.41934 3989.191 65.55750 0.39400 0.5042 2.44734 -0.1248 88.88685
## 9 18.00000 8652.997 51.65878 11.65444 1.4844 4.15702 1.8070 116.52826
## 2 16.80560 62432.995 409.69700 0.10510 0.9068 2.77600 0.2912 94.00211
## 8 12.90000 3616.865 85.26668 5.55600 1.1173 1.60820 -1.5047 134.47988
## 5 11.90000 49356.262 60.15464 0.89594 0.5865 1.75420 -1.1342 88.60514
## 7 19.80000 7450.552 33.22256 0.34500 0.3153 3.44058 1.2787 100.19298
## X9 X10 X11 X12 X13 X14
## 3 16.23838 52.761781 3.01760 19.90742 25.76882 5.0
## 6 27.33332 24.638720 1.57060 16.78238 14.52982 9.0
## 9 28.56998 720.244499 11.25795 28.55834 26.32778 13.2
## 2 -200.98100 339.988210 1.30950 26.76784 47.25374 3.0
## 8 64.46288 39.218118 13.60000 18.80654 8.88180 17.0
## 5 -145.43800 1.490827 63.50000 17.79208 23.21144 7.3
## 7 -42.56340 501.644054 3.20000 23.05990 32.47950 2.0
na_count <- colSums(is.na(train_data))
print(na_count)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0
na_count1 <- colSums(is.na(test_data))
print(na_count)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0