# Read the raw CSV file into `data` and preview its first rows
data <- read.csv("Level Risiko Investasi.csv")
head(data)
##   Country      X1        X2        X3      X4     X5      X6      X7        X8
## 1      SE 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168  0.3526 185.64097
## 2      SG 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600  0.2912  94.00211
## 3      SI 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290  1.9299  72.30708
## 4      SK 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976  1.2325 111.78982
## 5      SM 11.9000 49356.262  60.15464 0.89594 0.5865 1.75420 -1.1342  88.60514
## 6      SV      NA  3989.191  65.55750 0.39400 0.5042 2.44734 -0.1248  88.88685
##           X9        X10     X11      X12      X13 X14
## 1   64.14972 537.609866  0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210  1.3095 26.76784 47.25374 3.0
## 3   16.23838  52.761781  3.0176 19.90742 25.76882 5.0
## 4   33.35258 102.567122  2.5300 22.83084 20.95780 7.0
## 5 -145.43800   1.490827 63.5000 17.79208 23.21144 7.3
## 6   27.33332  24.638720  1.5706 16.78238 14.52982 9.0
## DATA CLEANING
# Check for missing values

# Count the NA cells over the whole data frame and print the total
total_missing_values_all_columns <- sum(is.na(data))
cat("Total missing values across all columns:\n", total_missing_values_all_columns, "\n\n")
## Total missing values across all columns:
##  8
# Drop every row that contains at least one missing value
data_clean <- na.omit(data)
# Inspect the structure of `data` to confirm each column has a suitable type
str(data)
## 'data.frame':    17 obs. of  15 variables:
##  $ Country: chr  "SE" "SG" "SI" "SK" ...
##  $ X1     : num  23.2 16.8 18.3 19.7 11.9 ...
##  $ X2     : num  60338 62433 28684 21043 49356 ...
##  $ X3     : num  175.4 409.7 103.1 102.7 60.2 ...
##  $ X4     : num  1.62 0.105 0.844 1.174 0.896 ...
##  $ X5     : num  0.6755 0.9068 0.0746 0.0734 0.5865 ...
##  $ X6     : num  2.47 2.78 3.55 3.22 1.75 ...
##  $ X7     : num  0.353 0.291 1.93 1.232 -1.134 ...
##  $ X8     : num  185.6 94 72.3 111.8 88.6 ...
##  $ X9     : num  64.1 -201 16.2 33.4 -145.4 ...
##  $ X10    : num  537.61 339.99 52.76 102.57 1.49 ...
##  $ X11    : num  0.5 1.31 3.02 2.53 63.5 ...
##  $ X12    : num  25.1 26.8 19.9 22.8 17.8 ...
##  $ X13    : num  28 47.3 25.8 21 23.2 ...
##  $ X14    : num  8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
# Convert the country code column to a factor.
# Note: this changes `data` only; `data_clean` was created earlier by na.omit()
# and is not affected by this conversion.
data$Country <- as.factor(data$Country)
# Remove the 'Country' identifier column. It is dropped by name rather than by
# position so that a change in the input column order cannot silently remove
# one of the numeric variables instead.
data_clean <- data_clean[, setdiff(names(data_clean), "Country"), drop = FALSE]
# Save the data set with the incomplete rows removed
write.csv(data_clean, "Level Risiko Investasi unmissing.csv", row.names = FALSE)
show(data_clean)
##         X1        X2        X3       X4      X5      X6      X7        X8
## 1  23.2000 60338.020 175.42230  1.62000  0.6755 2.47168  0.3526 185.64097
## 2  16.8056 62432.995 409.69700  0.10510  0.9068 2.77600  0.2912  94.00211
## 3  18.2857 28684.168 103.06040  0.84352  0.0746 3.55290  1.9299  72.30708
## 4  19.6715 21042.722 102.73060  1.17400  0.0734 3.21976  1.2325 111.78982
## 5  11.9000 49356.262  60.15464  0.89594  0.5865 1.75420 -1.1342  88.60514
## 7  19.8000  7450.552  33.22256  0.34500  0.3153 3.44058  1.2787 100.19298
## 8  12.9000  3616.865  85.26668  5.55600  1.1173 1.60820 -1.5047 134.47988
## 11 22.0000  3955.070 103.90710 19.17300 -0.3906 0.34000  1.8906  72.25639
## 16 12.0977  3886.516  34.52492  2.79600  0.8506 6.94570  5.2762  86.56201
##            X9        X10     X11      X12      X13  X14
## 1    64.14972 537.609866  0.5000 25.11320 27.95256  8.6
## 2  -200.98100 339.988210  1.3095 26.76784 47.25374  3.0
## 3    16.23838  52.761781  3.0176 19.90742 25.76882  5.0
## 4    33.35258 102.567122  2.5300 22.83084 20.95780  7.0
## 5  -145.43800   1.490827 63.5000 17.79208 23.21144  7.3
## 7   -42.56340 501.644054  3.2000 23.05990 32.47950  2.0
## 8    64.46288  39.218118 13.6000 18.80654  8.88180 17.0
## 11   -5.46582 155.581868 49.0000 17.79388 16.04966  9.5
## 16    7.39622 351.683014  1.6900 23.54764 25.80812  2.5
## DATA CLEANING
# Check for missing values again, this time on the cleaned data frame

total_missing_values_all_columns <- sum(is.na(data_clean))
cat("Total missing values across all columns:\n", total_missing_values_all_columns, "\n\n")
## Total missing values across all columns:
##  0
# Outlier cleaning

# Re-read the file written after the missing-value step; this overwrites `data`.
# na.strings = "" makes empty strings the only text treated as NA.
data <- read.csv("Level Risiko Investasi unmissing.csv", na.strings = "")
head(data)
##        X1        X2        X3      X4     X5      X6      X7        X8
## 1 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168  0.3526 185.64097
## 2 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600  0.2912  94.00211
## 3 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290  1.9299  72.30708
## 4 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976  1.2325 111.78982
## 5 11.9000 49356.262  60.15464 0.89594 0.5865 1.75420 -1.1342  88.60514
## 6 19.8000  7450.552  33.22256 0.34500 0.3153 3.44058  1.2787 100.19298
##           X9        X10     X11      X12      X13 X14
## 1   64.14972 537.609866  0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210  1.3095 26.76784 47.25374 3.0
## 3   16.23838  52.761781  3.0176 19.90742 25.76882 5.0
## 4   33.35258 102.567122  2.5300 22.83084 20.95780 7.0
## 5 -145.43800   1.490827 63.5000 17.79208 23.21144 7.3
## 6  -42.56340 501.644054  3.2000 23.05990 32.47950 2.0
# Identify outliers with the interquartile range (IQR) rule and blank them out.
#
# x: a numeric vector.
# Returns `x` with every value lying more than 1.5 * IQR below the first
# quartile or above the third quartile replaced by NA. Values that are already
# NA are ignored when the quartiles are computed and stay NA.

remove_outliers <- function(x) {
    quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    fence_width <- 1.5 * (quartiles[2] - quartiles[1])
    below_fence <- x < quartiles[1] - fence_width
    above_fence <- x > quartiles[2] + fence_width
    replace(x, below_fence | above_fence, NA)
}
# Replace IQR outliers with NA in every numeric column of a working copy.
# The result is written to `data_cleaned` (the frame that is saved and analysed
# below). No column is skipped: the 'Country' column was already removed before
# "Level Risiko Investasi unmissing.csv" was written, so the first column is X1.
data_cleaned <- data
numeric_cols <- names(data_cleaned)[vapply(data_cleaned, is.numeric, logical(1))]
data_cleaned[numeric_cols] <- lapply(data_cleaned[numeric_cols], remove_outliers)

# After flagging the outliers as NA, drop every row that contains an NA.
# Both names must refer to the cleaned rows: `data_cleaned` is saved, printed
# and used for the correlation matrix, while `data` is used by the regression
# and the box plots.
data_cleaned <- na.omit(data_cleaned)
data <- data_cleaned
# Save the data again after outlier removal, as a CSV file.
# NOTE: the recorded console output shown in this file was captured before the
# outliers were actually removed, so it will differ once the script is re-run.

write.csv(data_cleaned, "Level Risiko Investasi Cleaned Finallll.csv", row.names = FALSE)
# Show the data after removing missing values and outliers
show(data_cleaned)
##        X1        X2        X3       X4      X5      X6      X7        X8
## 1 23.2000 60338.020 175.42230  1.62000  0.6755 2.47168  0.3526 185.64097
## 2 16.8056 62432.995 409.69700  0.10510  0.9068 2.77600  0.2912  94.00211
## 3 18.2857 28684.168 103.06040  0.84352  0.0746 3.55290  1.9299  72.30708
## 4 19.6715 21042.722 102.73060  1.17400  0.0734 3.21976  1.2325 111.78982
## 5 11.9000 49356.262  60.15464  0.89594  0.5865 1.75420 -1.1342  88.60514
## 6 19.8000  7450.552  33.22256  0.34500  0.3153 3.44058  1.2787 100.19298
## 7 12.9000  3616.865  85.26668  5.55600  1.1173 1.60820 -1.5047 134.47988
## 8 22.0000  3955.070 103.90710 19.17300 -0.3906 0.34000  1.8906  72.25639
## 9 12.0977  3886.516  34.52492  2.79600  0.8506 6.94570  5.2762  86.56201
##           X9        X10     X11      X12      X13  X14
## 1   64.14972 537.609866  0.5000 25.11320 27.95256  8.6
## 2 -200.98100 339.988210  1.3095 26.76784 47.25374  3.0
## 3   16.23838  52.761781  3.0176 19.90742 25.76882  5.0
## 4   33.35258 102.567122  2.5300 22.83084 20.95780  7.0
## 5 -145.43800   1.490827 63.5000 17.79208 23.21144  7.3
## 6  -42.56340 501.644054  3.2000 23.05990 32.47950  2.0
## 7   64.46288  39.218118 13.6000 18.80654  8.88180 17.0
## 8   -5.46582 155.581868 49.0000 17.79388 16.04966  9.5
## 9    7.39622 351.683014  1.6900 23.54764 25.80812  2.5
# Show the structure of the cleaned data frame, including column types
str(data_cleaned)
## 'data.frame':    9 obs. of  14 variables:
##  $ X1 : num  23.2 16.8 18.3 19.7 11.9 ...
##  $ X2 : num  60338 62433 28684 21043 49356 ...
##  $ X3 : num  175.4 409.7 103.1 102.7 60.2 ...
##  $ X4 : num  1.62 0.105 0.844 1.174 0.896 ...
##  $ X5 : num  0.6755 0.9068 0.0746 0.0734 0.5865 ...
##  $ X6 : num  2.47 2.78 3.55 3.22 1.75 ...
##  $ X7 : num  0.353 0.291 1.93 1.232 -1.134 ...
##  $ X8 : num  185.6 94 72.3 111.8 88.6 ...
##  $ X9 : num  64.1 -201 16.2 33.4 -145.4 ...
##  $ X10: num  537.61 339.99 52.76 102.57 1.49 ...
##  $ X11: num  0.5 1.31 3.02 2.53 63.5 ...
##  $ X12: num  25.1 26.8 19.9 22.8 17.8 ...
##  $ X13: num  28 47.3 25.8 21 23.2 ...
##  $ X14: num  8.6 3 5 7 7.3 2 17 9.5 2.5
# Fit a simple linear regression with X14 as the response and X1 as the predictor

linear_model <- lm(X14 ~ X1, data = data)
print(linear_model)
## 
## Call:
## lm(formula = X14 ~ X1, data = data)
## 
## Coefficients:
## (Intercept)           X1  
##     8.48071     -0.09209
# Draw the standard diagnostic plots for the fitted linear model
plot(linear_model)

# Pearson correlation matrix of all columns in the cleaned data
correlation_matrix <- cor(data_cleaned, method = "pearson")
print(correlation_matrix)
##              X1           X2         X3         X4         X5          X6
## X1   1.00000000  0.132873061  0.1900699  0.2744029 -0.6166061 -0.33430178
## X2   0.13287306  1.000000000  0.6850229 -0.4686016  0.2656109 -0.16512981
## X3   0.19006994  0.685022908  1.0000000 -0.1492817  0.2594287 -0.18488443
## X4   0.27440289 -0.468601587 -0.1492817  1.0000000 -0.5145210 -0.51211884
## X5  -0.61660610  0.265610879  0.2594287 -0.5145210  1.0000000  0.30326915
## X6  -0.33430178 -0.165129811 -0.1848844 -0.5121188  0.3032692  1.00000000
## X7   0.04581372 -0.406296146 -0.2447768  0.1139032 -0.2520888  0.74858977
## X8   0.28889728  0.340810650  0.1260536 -0.2439023  0.4376938 -0.12339747
## X9   0.29402195 -0.501954252 -0.5286786  0.2273147 -0.1555573  0.09089035
## X10  0.43582164  0.186048129  0.2186606 -0.2099671  0.1881472  0.34390742
## X11 -0.21788380 -0.008869658 -0.2541867  0.5064816 -0.2924356 -0.58733939
## X12  0.24836320  0.438680457  0.5886052 -0.5285666  0.3690163  0.48898111
## X13  0.13737308  0.608548454  0.6793410 -0.5132112  0.1882858  0.30687493
## X14 -0.08419649 -0.189553013 -0.1433857  0.4148337  0.1537423 -0.61455701
##              X7          X8           X9          X10          X11        X12
## X1   0.04581372  0.28889728  0.294021955  0.435821637 -0.217883803  0.2483632
## X2  -0.40629615  0.34081065 -0.501954252  0.186048129 -0.008869658  0.4386805
## X3  -0.24477685  0.12605361 -0.528678586  0.218660631 -0.254186655  0.5886052
## X4   0.11390319 -0.24390225  0.227314705 -0.209967085  0.506481578 -0.5285666
## X5  -0.25208882  0.43769379 -0.155557318  0.188147245 -0.292435631  0.3690163
## X6   0.74858977 -0.12339747  0.090890352  0.343907417 -0.587339395  0.4889811
## X7   1.00000000 -0.38687877  0.204944063  0.312500195 -0.329991062  0.2517648
## X8  -0.38687877  1.00000000  0.438431821  0.433076606 -0.345532165  0.3736531
## X9   0.20494406  0.43843182  1.000000000  0.009695843 -0.320913067 -0.1752875
## X10  0.31250020  0.43307661  0.009695843  1.000000000 -0.523225109  0.7635011
## X11 -0.32999106 -0.34553216 -0.320913067 -0.523225109  1.000000000 -0.7546298
## X12  0.25176479  0.37365306 -0.175287482  0.763501083 -0.754629805  1.0000000
## X13  0.13817175 -0.08547333 -0.677952919  0.568612383 -0.380118801  0.7719893
## X14 -0.59275118  0.41321980  0.468778031 -0.477860761  0.293524282 -0.5409865
##             X13         X14
## X1   0.13737308 -0.08419649
## X2   0.60854845 -0.18955301
## X3   0.67934096 -0.14338572
## X4  -0.51321116  0.41483373
## X5   0.18828577  0.15374234
## X6   0.30687493 -0.61455701
## X7   0.13817175 -0.59275118
## X8  -0.08547333  0.41321980
## X9  -0.67795292  0.46877803
## X10  0.56861238 -0.47786076
## X11 -0.38011880  0.29352428
## X12  0.77198932 -0.54098654
## X13  1.00000000 -0.77642300
## X14 -0.77642300  1.00000000
## Boxplot
# Box plots of X1 and X14, drawn side by side, to inspect the distribution of
# each variable and spot outlying values. (A box plot of a single variable does
# not show the relationship between X1 and X14.)

# Two plotting panels in one row
par(mfrow = c(1, 2))

# boxplot.stats()$out holds the outlying VALUES (not row numbers), so the
# subtitle is labelled accordingly.
boxplot(data$X1, main = "X1 (Variable 1)",
        sub = paste("Outlier values:", paste(boxplot.stats(data$X1)$out, collapse = ", ")))
boxplot(data$X14, main = "X14 (Variable 14)",
        sub = paste("Outlier values:", paste(boxplot.stats(data$X14)$out, collapse = ", ")))