#install.packages("imputeTS")
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
MEMBACA DATA
# Load the complete series from CSV; it serves as the reference when the
# imputation results are evaluated.
comp_data <- read_csv("C:/Users/Mrs. Ira/Downloads/complete_data.csv")
## Rows: 3683 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the timestamp range and the distribution of rate in the complete series.
summary(comp_data)
## ts rate
## Min. :2020-02-01 00:00:00 Min. : 49.0
## 1st Qu.:2020-02-07 09:25:00 1st Qu.:103.0
## Median :2020-02-13 18:50:00 Median :150.0
## Mean :2020-02-13 18:50:00 Mean :143.2
## 3rd Qu.:2020-02-20 04:15:00 3rd Qu.:183.0
## Max. :2020-02-26 13:40:00 Max. :219.0
# Load the series with missing observations from CSV; its rate column holds
# the NA values that are imputed further down.
miss_data <- read_csv("C:/Users/Mrs. Ira/Downloads/missing_data.csv")
## Rows: 3683 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the series with gaps; summary() also reports the number of NA values per column.
summary(miss_data)
## ts rate
## Min. :2020-02-01 00:00:00 Min. : 49.0
## 1st Qu.:2020-02-07 09:25:00 1st Qu.:103.0
## Median :2020-02-13 18:50:00 Median :150.0
## Mean :2020-02-13 18:50:00 Mean :143.1
## 3rd Qu.:2020-02-20 04:15:00 3rd Qu.:183.0
## Max. :2020-02-26 13:40:00 Max. :219.0
## NA's :64
Calculate Missing Values: Determine the number and percentage of missing values in the dataset.
#' Report the count and percentage of missing values in one column.
#'
#' @param data A data frame (or tibble) containing the column to inspect.
#' @param kolom Column to inspect, normally given as a single column name.
#' @param nama_data Label for the dataset; used only inside the messages.
#' @return A list with two strings: `jumlah_na` (number of NA values) and
#'   `persentase` (share of NA values in percent, rounded to two decimals).
persentase_na <- function(data, kolom, nama_data) {
  # `[[` returns NULL for an unknown column name, which would silently be
  # reported as "0 missing" and "NaN%"; fail loudly instead.
  if (is.character(kolom) && length(kolom) == 1L && !(kolom %in% names(data))) {
    stop(
      "Kolom '", kolom, "' tidak ditemukan di '", nama_data, "'.",
      call. = FALSE
    )
  }

  kolomdata <- data[[kolom]]
  x <- sum(is.na(kolomdata))
  y <- length(kolomdata)

  # An empty column has no missing values; avoid 0 / 0 = NaN.
  persen <- if (y == 0) 0 else (x / y) * 100

  # The message text is intentionally left unchanged so that previously
  # recorded output of this function remains valid.
  jum_na <- paste0(
    "Jumlah Missing Value pada kolom '", kolom, "' di '", nama_data,
    "'adalah ", x, "."
  )
  has_pers <- paste0(
    "Persentase Missing Value pada kolom '", kolom, "' di '", nama_data,
    "'adalah ", round(persen, 2), "%."
  )

  list(jumlah_na = jum_na, persentase = has_pers)
}
# Baseline: count and share of missing values in the complete series.
persentase_na(comp_data, "rate", "comp_data")
## $jumlah_na
## [1] "Jumlah Missing Value pada kolom 'rate' di 'comp_data'adalah 0."
##
## $persentase
## [1] "Persentase Missing Value pada kolom 'rate' di 'comp_data'adalah 0%."
# Count and share of missing values in the series with gaps.
persentase_na(miss_data, "rate", "miss_data")
## $jumlah_na
## [1] "Jumlah Missing Value pada kolom 'rate' di 'miss_data'adalah 64."
##
## $persentase
## [1] "Persentase Missing Value pada kolom 'rate' di 'miss_data'adalah 1.74%."
Visualize Missing Value Distribution: Create visualizations to show the distribution and pattern of missing values across the time series.
Data Complete (tidak terdapat missing values)
# Visualise where missing values fall along the complete series.
ggplot_na_distribution(comp_data$rate)
# As the plot shows, there are no missing observations.
Data Missing
# Visualise where the missing values fall along the series with gaps.
ggplot_na_distribution(miss_data$rate)
Perform Missing Value Imputation: Apply all available imputation methods.
Data Missing
# Impute the gaps in miss_data$rate with each method in turn and plot the
# result of every method right after it is computed.

# Imputation method: linear interpolation
miss_lin <- na_interpolation(miss_data$rate, option = "linear")
ggplot_na_imputations(miss_data$rate, miss_lin)

# Imputation method: spline interpolation
miss_spline <- na_interpolation(miss_data$rate, option = "spline")
ggplot_na_imputations(miss_data$rate, miss_spline)

# Imputation method: Stineman interpolation
miss_stine <- na_interpolation(miss_data$rate, option = "stine")
ggplot_na_imputations(miss_data$rate, miss_stine)

# Imputation method: overall mean
miss_mean <- na_mean(miss_data$rate)
ggplot_na_imputations(miss_data$rate, miss_mean)

# Imputation method: random values. Seeded so the draws, and anything
# computed from them later, are reproducible from run to run.
set.seed(123)
miss_ran <- na_random(miss_data$rate)
ggplot_na_imputations(miss_data$rate, miss_ran)

# Imputation method: last observation carried forward
miss_locf <- na_locf(miss_data$rate)
ggplot_na_imputations(miss_data$rate, miss_locf)

# Imputation method: Kalman smoothing on a structural time series model
miss_kal <- na_kalman(miss_data$rate)
ggplot_na_imputations(miss_data$rate, miss_kal)

# Imputation method: moving average
miss_ma <- na_ma(miss_data$rate)
ggplot_na_imputations(miss_data$rate, miss_ma)

# Visualise the gap-size distribution of the missing values before imputation
ggplot_na_gapsize(miss_data$rate)
Analyze and Evaluate the Imputation Results: Compare the imputed values with the original (true) values.
#install.packages("Metrics")
library(Metrics) # untuk rmse()
## Warning: package 'Metrics' was built under R version 4.3.3
# Score every imputed series against the complete series with RMSE.
#
# The miss_* series produced in the imputation section are reused here
# instead of being recomputed: recomputing repeats the expensive fits and,
# for na_random(), draws fresh values, so the series being scored would no
# longer be the one that was plotted.

# rmse() compares element by element, so both datasets must be aligned
# row by row.
stopifnot(nrow(comp_data) == nrow(miss_data))

# NOTE: the RMSE is taken over all rows of the series, not only over the
# positions that were missing in miss_data.
rmse_lin <- rmse(comp_data$rate, miss_lin)
rmse_spline <- rmse(comp_data$rate, miss_spline)
rmse_stine <- rmse(comp_data$rate, miss_stine)
rmse_mean <- rmse(comp_data$rate, miss_mean)
rmse_ran <- rmse(comp_data$rate, miss_ran)
rmse_locf <- rmse(comp_data$rate, miss_locf)
rmse_kalman <- rmse(comp_data$rate, miss_kal)
rmse_ma <- rmse(comp_data$rate, miss_ma)

# Side-by-side comparison table, one row per imputation method.
data.frame(
  Method = c(
    "Linear Interpolation", "Spline Interpolation", "Stine Interpolation",
    "Mean", "Random", "LOCF", "Kalman", "Moving Average"
  ),
  RMSE = c(
    rmse_lin, rmse_spline, rmse_stine, rmse_mean,
    rmse_ran, rmse_locf, rmse_kalman, rmse_ma
  )
)
## Method RMSE
## 1 Linear Interpolation 1.1642699
## 2 Spline Interpolation 2.4043306
## 3 Stine Interpolation 1.1708524
## 4 Mean 6.8694113
## 5 Random 9.4782210
## 6 LOCF 1.0785092
## 7 Kalman 0.9927024
## 8 Moving Average 1.0134304