library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
library(dplyr)
library(tidyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Load the raw data set from the Excel workbook.
data <- readxl::read_excel(path = "data invess.xlsx")
# Parse a vector of number-like text into numeric values.
#
# x: vector whose entries are numbers written as text, possibly with a
#    decimal comma.
# Returns a numeric vector of the same length; blank or whitespace-only
# entries become NA, and as.numeric() warns about (and returns NA for) any
# entry that still cannot be parsed.
clean_and_convert <- function(x) {
  # Replace commas with dots so as.numeric() sees a decimal point.
  dotted <- gsub(",", ".", x, fixed = TRUE)
  # Replace empty or whitespace-only strings with NA.
  dotted[grepl("^\\s*$", dotted)] <- NA
  as.numeric(dotted)
}
# Convert every non-numeric measurement column to numeric.
# The identifier column `Country` is deliberately left out: in the recorded
# run, coercing it with as.numeric() turned all 17 of its values into NA,
# which destroyed the row labels and left NAs that median imputation
# cannot fill.
data <- data %>%
  mutate(
    across(
      where(~ !is.numeric(.x)) & !any_of("Country"),
      clean_and_convert
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(~!is.numeric(.)), clean_and_convert)`.
## Caused by warning:
## ! NAs introduced by coercion
# Show the column types and the first values of each column.
data %>% str()
## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
## $ Country: num [1:17] NA NA NA NA NA NA NA NA NA NA ...
## $ X1 : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num [1:17] 60338 62433 28684 21043 49356 ...
## $ X3 : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num [1:17] 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num [1:17] 28 47.3 25.8 21 23.2 ...
## $ X14 : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
# Count the missing values in each column.
data %>%
  is.na() %>%
  colSums()
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9
## 17 1 0 0 0 0 0 0 2 0
## X10 X11 X12 X13 X14
## 0 4 0 0 1
# Logical mask of the numeric columns. vapply() always returns a logical
# vector, whereas sapply() can change its return type with the input.
numeric_cols <- vapply(data, is.numeric, logical(1))

# Fill the missing values of each numeric column with that column's median.
# replace() keeps the column's attributes, which ifelse() would drop.
# A column with no observed values stays all-NA, because its median is NA.
data[numeric_cols] <- lapply(data[numeric_cols], function(x) {
  replace(x, is.na(x), median(x, na.rm = TRUE))
})

# Total number of missing values remaining after imputation.
sum(is.na(data))
## [1] 17
# Save the cleaned table to disk without a row-name column.
write.csv(x = data, file = "data_bersih.csv", row.names = FALSE)
# Flag values outside the 1.5 * IQR fences (Tukey's boxplot rule).
#
# x: numeric vector; NAs are ignored when the quartiles are computed.
# Returns a logical vector the same length as x, TRUE where the value is
# below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR (NA where x is NA).
detect_outliers <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  spread <- quartiles[2] - quartiles[1]
  lower_fence <- quartiles[1] - 1.5 * spread
  upper_fence <- quartiles[2] + 1.5 * spread
  x < lower_fence | x > upper_fence
}
# Named list with one logical outlier mask per numeric column.
outliers <- map(select(data, where(is.numeric)), detect_outliers)
# Draw one boxplot per numeric column. Plots built inside a loop are not
# auto-printed, hence the explicit print().
for (col in names(data)[numeric_cols]) {
  p <- ggplot(data = data, mapping = aes(y = .data[[col]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot untuk", col)) +
    ylab(col)
  print(p)
}
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Simple linear regression of X2 on X1, followed by the fit summary.
model_linear <- lm(formula = X2 ~ X1, data = data)
summary(object = model_linear)
##
## Call:
## lm(formula = X2 ~ X1, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20732 -18301 -13431 8113 46511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29815.3 32718.8 0.911 0.377
## X1 -429.5 1846.3 -0.233 0.819
##
## Residual standard error: 24550 on 15 degrees of freedom
## Multiple R-squared: 0.003595, Adjusted R-squared: -0.06283
## F-statistic: 0.05413 on 1 and 15 DF, p-value: 0.8192
# Scatter plot of X2 against X1 with the fitted least-squares line
# (no confidence band).
ggplot(data = data, mapping = aes(x = X1, y = X2)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Regresi Linear: X2 vs X1") +
  xlab("X1") +
  ylab("X2")
## `geom_smooth()` using formula = 'y ~ x'
# Multiple linear regression of X2 on X1, X3 and X4, followed by the
# fit summary.
model_multiple <- lm(formula = X2 ~ X1 + X3 + X4, data = data)
summary(object = model_multiple)
##
## Call:
## lm(formula = X2 ~ X1 + X3 + X4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20642 -13068 -551 3520 40533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19887.62 24875.99 0.799 0.4384
## X1 -241.32 1483.17 -0.163 0.8733
## X3 146.03 54.01 2.704 0.0181 *
## X4 -1529.12 928.41 -1.647 0.1235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18510 on 13 degrees of freedom
## Multiple R-squared: 0.5093, Adjusted R-squared: 0.3961
## F-statistic: 4.498 on 3 and 13 DF, p-value: 0.02249
# Added-variable plots for the predictors of the multiple regression.
avPlots(model_multiple)

# Recompute R-squared by hand from the in-sample predictions:
# 1 - (residual sum of squares / total sum of squares).
predictions <- predict(model_multiple, newdata = data)
SSE <- sum((data[["X2"]] - predictions)^2)
SST <- sum((data[["X2"]] - mean(data[["X2"]]))^2)
R_squared <- 1 - SSE / SST
cat("Akurasi model (R-squared):", R_squared, "\n")
## Akurasi model (R-squared): 0.5093496
# Reload the cleaned data set that was written to disk earlier.
data <- read.csv(file = "data_bersih.csv")

# Treat X2 as a series; adjust `frequency` if there is a seasonal pattern.
ts_data <- ts(data[["X2"]], frequency = 1)

autoplot(ts_data) +
  labs(title = "Plot Time Series X2", x = "Waktu", y = "Nilai X2")

# Augmented Dickey-Fuller test; the alternative hypothesis is stationarity.
adf_test <- adf.test(ts_data)
print(adf_test)
##
## Augmented Dickey-Fuller Test
##
## data: ts_data
## Dickey-Fuller = -2.052, Lag order = 2, p-value = 0.5526
## alternative hypothesis: stationary
# Keep the series as is when the ADF test rejects a unit root at the 5%
# level; otherwise take first differences and plot the differenced series.
if (adf_test$p.value <= 0.05) {
  ts_data_diff <- ts_data
} else {
  ts_data_diff <- diff(ts_data)
  autoplot(ts_data_diff) +
    labs(
      title = "Plot Time Series X2 (Setelah Differencing)",
      x = "Waktu",
      y = "Nilai X2 (Differenced)"
    )
}
# Let auto.arima() choose the ARIMA orders for the undifferenced series,
# then print the fitted model and its training-set error measures.
model_auto <- forecast::auto.arima(y = ts_data)
summary(object = model_auto)
## Series: ts_data
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 22330.401
## s.e. 5603.415
##
## sigma^2 = 567128123: log likelihood = -194.93
## AIC=393.87 AICc=394.72 BIC=395.53
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -6.419545e-12 23103.41 19765.65 -358.6588 394.1669 0.9752158
## ACF1
## Training set 0.1794194
# Forecast 10 periods ahead from the fitted ARIMA model.
forecast_result <- forecast(object = model_auto, h = 10)
Pentingnya Pembersihan Data: Proses pembersihan data sangat penting untuk memastikan analisis yang akurat. Mengganti nilai yang hilang dan mendeteksi outlier membantu meningkatkan kualitas data.
Visualisasi Membantu Pemahaman: Visualisasi data, seperti boxplot dan scatter plot, membantu dalam memahami distribusi data dan hubungan antar variabel, serta mendeteksi anomali.
Pemodelan dan Peramalan: Penggunaan model regresi dan ARIMA menunjukkan bagaimana data historis dapat digunakan untuk membuat prediksi yang bermanfaat. Akurasi model memberikan indikasi seberapa baik model tersebut dapat diandalkan.
Kesiapan untuk Analisis Lanjutan: Dengan data yang telah dibersihkan dan dianalisis, langkah selanjutnya dapat mencakup analisis lebih mendalam, seperti pengujian hipotesis atau penerapan algoritma pembelajaran mesin untuk prediksi yang lebih kompleks.