Memuat library yang diperlukan

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(tseries)
library(dplyr)
library(tidyr)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

Membaca data

# Read the workbook "data invess.xlsx" (path relative to the working
# directory) into `data`.
data <- read_excel("data invess.xlsx")

Fungsi untuk membersihkan dan mengkonversi data

# Convert a vector of number-like text to numeric.
# Decimal commas become decimal points, entries that are empty or contain
# only whitespace become NA, and the result is passed to as.numeric()
# (which yields NA, with a warning, for anything it cannot parse).
clean_and_convert <- function(x) {
  with_points <- gsub(",", ".", x, fixed = TRUE)  # decimal comma -> point
  is_blank <- grepl("^\\s*$", with_points)        # empty or whitespace-only
  with_points[is_blank] <- NA
  as.numeric(with_points)
}

Membersihkan data

# Convert every non-numeric column to numeric with clean_and_convert(),
# except `Country`: its values are not parseable as numbers, so coercing it
# would replace the whole column with NA and lose the row identifiers.
data <- data %>%
  mutate(
    across(where(~ !is.numeric(.x)) & !any_of("Country"), clean_and_convert)
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(~!is.numeric(.)), clean_and_convert)`.
## Caused by warning:
## ! NAs introduced by coercion

Memeriksa struktur data setelah konversi

# Print each column's type and leading values to check the conversion.
str(data)

## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Country: num [1:17] NA NA NA NA NA NA NA NA NA NA ...
##  $ X1     : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
##  $ X2     : num [1:17] 60338 62433 28684 21043 49356 ...
##  $ X3     : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
##  $ X4     : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
##  $ X5     : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
##  $ X6     : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
##  $ X7     : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
##  $ X8     : num [1:17] 185.6 94 72.3 111.8 88.6 ...
##  $ X9     : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
##  $ X10    : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
##  $ X11    : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
##  $ X12    : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
##  $ X13    : num [1:17] 28 47.3 25.8 21 23.2 ...
##  $ X14    : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...

Memeriksa jumlah NA dalam setiap kolom

# Count the missing values in each column.
colSums(is.na(data))

## Country      X1      X2      X3      X4      X5      X6      X7      X8      X9 
##      17       1       0       0       0       0       0       0       2       0 
##     X10     X11     X12     X13     X14 
##       0       4       0       0       1

Mengganti nilai yang hilang dengan median

# Impute missing values in every numeric column with that column's median.
# `numeric_cols` is a named logical vector with one entry per column;
# vapply() guarantees that type even for a data frame with no columns,
# unlike sapply().
numeric_cols <- vapply(data, is.numeric, logical(1))
data[numeric_cols] <- lapply(data[numeric_cols], function(x) {
  # A column that is entirely NA has an NA median and is left unchanged.
  replace(x, is.na(x), median(x, na.rm = TRUE))
})

Memeriksa nilai yang hilang

# Count the missing values that remain across the whole table.
sum(is.na(data))

## [1] 17

Menyimpan data yang telah dibersihkan

# Write the cleaned table to "data_bersih.csv" without a row-name column.
write.csv(data, "data_bersih.csv", row.names = FALSE)

Deteksi outlier menggunakan metode IQR

# Flag outliers in a numeric vector with the IQR rule.
#
# x: numeric vector; NA values are ignored when computing the quartiles.
# k: multiplier applied to the IQR to set the fences (default 1.5, the
#    value that used to be hard-coded).
#
# Returns a logical vector the same length as `x`: TRUE where the value is
# below Q1 - k * IQR or above Q3 + k * IQR, and NA where `x` is NA.
detect_outliers <- function(x, k = 1.5) {
  # names = FALSE keeps the "25%"/"75%" labels from leaking into the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  lower <- quartiles[1] - k * iqr
  upper <- quartiles[2] + k * iqr
  x < lower | x > upper
}

Terapkan fungsi pada kolom numerik

# Apply detect_outliers() to each numeric column; the result is a named list
# holding one logical vector per column.
outliers <- map(select(data, where(is.numeric)), detect_outliers)

Visualisasi outlier menggunakan boxplot

# Draw one boxplot per numeric column, titled and labelled with the column
# name. print() is required because plots are not auto-printed inside a loop.
for (col in names(data)[numeric_cols]) {
  p <- ggplot(data, aes(y = .data[[col]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot untuk", col)) +
    ylab(col)
  print(p)
}

## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Regresi linear

# Fit a simple linear regression of X2 on X1 and print the fit summary.
model_linear <- lm(X2 ~ X1, data = data)
summary(model_linear)

## 
## Call:
## lm(formula = X2 ~ X1, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20732 -18301 -13431   8113  46511 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  29815.3    32718.8   0.911    0.377
## X1            -429.5     1846.3  -0.233    0.819
## 
## Residual standard error: 24550 on 15 degrees of freedom
## Multiple R-squared:  0.003595,   Adjusted R-squared:  -0.06283 
## F-statistic: 0.05413 on 1 and 15 DF,  p-value: 0.8192

Visualisasi regresi linear

# Scatter plot of X2 against X1 with the fitted least-squares line drawn
# without a confidence band.
data %>%
  ggplot(mapping = aes(x = X1, y = X2)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Regresi Linear: X2 vs X1") +
  xlab("X1") +
  ylab("X2")

## `geom_smooth()` using formula = 'y ~ x'

Regresi linear berganda

# Fit a multiple linear regression of X2 on X1, X3 and X4 and print the
# fit summary.
model_multiple <- lm(X2 ~ X1 + X3 + X4, data = data)
summary(model_multiple)

## 
## Call:
## lm(formula = X2 ~ X1 + X3 + X4, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20642 -13068   -551   3520  40533 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 19887.62   24875.99   0.799   0.4384  
## X1           -241.32    1483.17  -0.163   0.8733  
## X3            146.03      54.01   2.704   0.0181 *
## X4          -1529.12     928.41  -1.647   0.1235  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18510 on 13 degrees of freedom
## Multiple R-squared:  0.5093, Adjusted R-squared:  0.3961 
## F-statistic: 4.498 on 3 and 13 DF,  p-value: 0.02249

Visualisasi regresi linear berganda (partial regression plots)

# Draw the partial regression (added-variable) plots for the multiple
# regression model.
avPlots(model_multiple)

Menghitung akurasi model

# Recompute the in-sample R-squared of the multiple regression by hand:
# one minus the residual sum of squares over the total sum of squares.
predictions <- predict(model_multiple, newdata = data)
SST <- sum((data$X2 - mean(data$X2))^2)
SSE <- sum((predictions - data$X2)^2)
R_squared <- 1 - SSE / SST
cat("Akurasi model (R-squared):", R_squared, "\n")

## Akurasi model (R-squared): 0.5093496

Membaca data (pastikan data sudah dibersihkan sebelumnya)

# Reload the cleaned table from "data_bersih.csv"; this overwrites `data`
# with the data frame returned by read.csv().
data <- read.csv("data_bersih.csv")

Asumsikan urutan baris mewakili urutan waktu dan X2 adalah nilai yang ingin kita analisis (kode di bawah hanya memakai X2; X1 tidak digunakan sebagai indeks waktu)

# Treat X2, in row order, as a time series with frequency 1.
ts_data <- ts(data$X2, frequency = 1)  # Adjust frequency if there is a seasonal pattern

Plot data time series

# Line plot of the X2 series with a title and axis labels.
autoplot(ts_data) +
  labs(title = "Plot Time Series X2", x = "Waktu", y = "Nilai X2")

Uji stasioneritas

# Run the Augmented Dickey-Fuller test on the series and print the result.
adf_test <- adf.test(ts_data)
print(adf_test)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data
## Dickey-Fuller = -2.052, Lag order = 2, p-value = 0.5526
## alternative hypothesis: stationary

Jika data tidak stasioner, lakukan differencing

# When the ADF p-value is at most 0.05 the series is kept as is; otherwise
# it is first-differenced and the differenced series is plotted.
if (adf_test$p.value <= 0.05) {
  ts_data_diff <- ts_data
} else {
  ts_data_diff <- diff(ts_data)
  autoplot(ts_data_diff) +
    labs(
      title = "Plot Time Series X2 (Setelah Differencing)",
      x = "Waktu",
      y = "Nilai X2 (Differenced)"
    )
}

Identifikasi model ARIMA

# Let auto.arima() choose an ARIMA specification and print the fit summary.
# The model is fitted to the original series `ts_data`, not `ts_data_diff`.
model_auto <- auto.arima(ts_data)
summary(model_auto)

## Series: ts_data 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##            mean
##       22330.401
## s.e.   5603.415
## 
## sigma^2 = 567128123:  log likelihood = -194.93
## AIC=393.87   AICc=394.72   BIC=395.53
## 
## Training set error measures:
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -6.419545e-12 23103.41 19765.65 -358.6588 394.1669 0.9752158
##                   ACF1
## Training set 0.1794194

Forecast

# Forecast from the selected model and store the result in `forecast_result`.
forecast_result <- forecast(model_auto, h = 10)  # Forecast 10 periods

Pentingnya Pembersihan Data: Proses pembersihan data sangat penting untuk memastikan analisis yang akurat. Mengganti nilai yang hilang dan mendeteksi outlier membantu meningkatkan kualitas data.

Visualisasi Membantu Pemahaman: Visualisasi data, seperti boxplot dan scatter plot, membantu dalam memahami distribusi data dan hubungan antar variabel, serta mendeteksi anomali.

Pemodelan dan Peramalan: Penggunaan model regresi dan ARIMA menunjukkan bagaimana data historis dapat digunakan untuk membuat prediksi yang bermanfaat. Akurasi model memberikan indikasi seberapa baik model tersebut dapat diandalkan.

Kesiapan untuk Analisis Lanjutan: Dengan data yang telah dibersihkan dan dianalisis, langkah selanjutnya dapat mencakup analisis lebih mendalam, seperti pengujian hipotesis atau penerapan algoritma pembelajaran mesin untuk prediksi yang lebih kompleks.

Data Tingkat Risiko Investasi

Berlian Mumtajmia L

2024-10-17