Importing Libraries
library(flexmix)
library(readxl)
library(VIM)
library(ggplot2)
library(patchwork)
Analisis Deskriptif
# Read the "df_full" sheet, discard columns 1 and 8 by position, and print
# summary statistics for every remaining column.
df_full <- read_excel(
  path = "Dataset/dataframe.xls",
  sheet = "df_full"
)[, -c(1, 8)]
summary(df_full)
## Ave. T. (ºC) Prec. (mm) S.L.Press./ Gheopot. Wind sp. (Km/h)
## Min. :24.20 Min. : 0.00 Min. :1006 Min. : 4.000
## 1st Qu.:27.40 1st Qu.: 0.00 1st Qu.:1008 1st Qu.: 7.000
## Median :27.90 Median : 1.00 Median :1009 Median : 8.000
## Mean :27.78 Mean : 10.86 Mean :1009 Mean : 8.536
## 3rd Qu.:28.20 3rd Qu.: 11.75 3rd Qu.:1010 3rd Qu.: 9.000
## Max. :30.10 Max. :257.00 Max. :1012 Max. :29.000
## Insolat. (hours) CC
## Min. : 0.000 Min. :0.5000
## 1st Qu.: 3.900 1st Qu.:0.7500
## Median : 6.100 Median :0.8750
## Mean : 5.741 Mean :0.8068
## 3rd Qu.: 7.900 3rd Qu.:0.8750
## Max. :10.700 Max. :1.0000
Pie Chart
# Pie chart built from the "pie" sheet: one slice per Observation level,
# sized by Perc and annotated with the text in Labels.
df_pie <- read_excel("Dataset/dataframe.xls", sheet = "pie")
df_pie$Observation <- as.factor(df_pie$Observation)
df_pie$Count <- as.integer(df_pie$Count)
# install.packages("ggplot2")
library(ggplot2)
ggplot(df_pie, aes(x = "", y = Perc, fill = Observation)) +
  geom_col(color = "black", linewidth = 0.7) +
  # A single colour value is recycled to every label, so the plot no longer
  # depends on df_pie having exactly two rows (a length-2 colour vector fails
  # for any other row count).
  geom_label(
    aes(label = Labels),
    color = "white",
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  guides(fill = guide_legend(title = "Observasi")) +
  scale_fill_manual(values = c("#4472C4", "#EE8640")) +
  # Polar coordinates on y turn the stacked column into a pie.
  coord_polar(theta = "y") +
  theme_void()

Imputasi KNN
# Import the dataset used to tune the imputation; cells containing "Tr" or "-"
# are read as missing values. Columns 1, 3 and 8 are dropped by position.
df_test <- read_excel("Dataset/df_imp.xls", na = c("Tr", "-"))
df_test <- df_test[, -c(1, 3, 8)]
# Percentage of missing values in each remaining column. Computed in one
# vectorised step so the result always has one entry per column of df_test
# instead of relying on a hard-coded column count.
percent_of_missing <- unname(100 * colMeans(is.na(df_test)))
library(Metrics)
# Actual values from the "yactual" sheet.
Y <- read_excel("Dataset/df_imp.xls", sheet = "yactual")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
# Largest neighbourhood size to try: round(sqrt(361)) = 19.
max_K <- round(sqrt(361))
set.seed(18013)
# RMSE between the actual values (Y$Y) and the kNN-imputed values found in
# rows 1-55 of column 6, for every k from 1 to max_K. The metric is called as
# Metrics::rmse() because the result vector is itself named `rmse`.
rmse <- vapply(
  seq_len(max_K),
  function(k) {
    imputed <- NADIA::autotune_VIM_kNN(df_test, percent_of_missing, k = k)
    # `[[6]][1:55]` always yields a plain vector, whether the imputer returns
    # a data.frame or a tibble.
    Metrics::rmse(actual = Y$Y, predicted = imputed[[6]][1:55])
  },
  numeric(1)
)
# Report only the odd values of k; labels are derived from the same sequence
# so they cannot drift out of sync with the selected entries.
odd_k <- seq(1, max_K, by = 2)
rmse_odd <- data.frame(t(rmse[odd_k]))
Nilai_K <- paste("K =", odd_k)
colnames(rmse_odd) <- Nilai_K
rmse_odd
## K = 1 K = 3 K = 5 K = 7 K = 9 K = 11 K = 13 K = 15
## 1 20.42935 16.06468 15.5786 16.86894 16.1876 16.22424 15.97876 15.99926
## K = 17 K = 19
## 1 16.01129 15.99574
# Import the raw dataset; cells containing "Tr" or "-" are read as missing
# values. Columns 1, 8 and 9 are dropped by position.
df_raw <- read_excel("Dataset/Dataset.xls", sheet = "df", na = c("Tr", "-"))
df_raw <- df_raw[, -c(1, 8, 9)]
# Percentage of missing values in each remaining column (vectorised; one entry
# per column of df_raw).
percent.of_missing <- unname(100 * colMeans(is.na(df_raw)))
set.seed(18013)
# Impute Y and X4 using 5 nearest neighbours. The result carries the indicator
# columns Y_imp and X4_imp marking which cells were imputed.
df <- VIM::kNN(df_raw, metric = "euclidean", variable = c("Y", "X4"), k = 5)
# Keep the rows in which at least one of the two variables was imputed; the
# indicator columns are used directly as logicals rather than compared with
# the string "TRUE".
hasil_impute <- subset(df, Y_imp | X4_imp)
hasil_impute
## Y X1 X2 X3 X4 X5 Y_imp X4_imp
## 2 5.0 27.8 1007.8 8 2.5 0.875 TRUE FALSE
## 6 1.0 26.1 1008.7 4 0.8 0.875 TRUE FALSE
## 17 1.0 26.7 1008.9 6 0.8 0.875 TRUE FALSE
## 20 5.0 27.4 1008.3 5 2.5 0.875 TRUE FALSE
## 22 0.8 27.1 1008.3 8 5.3 0.750 TRUE FALSE
## 23 2.0 27.6 1008.6 6 6.6 0.750 TRUE FALSE
## 36 0.5 27.4 1009.0 6 6.8 0.750 TRUE FALSE
## 41 2.0 28.0 1007.4 6 5.8 0.750 TRUE FALSE
## 53 0.3 27.4 1009.2 6 1.8 0.875 TRUE FALSE
## 60 4.0 27.4 1009.6 9 4.3 0.875 TRUE FALSE
## 61 0.5 27.6 1008.4 6 9.6 0.750 TRUE FALSE
## 63 21.0 27.0 1008.2 7 4.1 0.875 FALSE TRUE
## 80 0.0 28.1 1008.5 7 7.8 0.750 FALSE TRUE
## 88 1.0 28.1 1006.9 5 8.0 0.750 TRUE FALSE
## 95 11.3 28.6 1007.9 5 4.7 0.875 FALSE TRUE
## 108 4.0 28.0 1008.2 7 7.5 0.875 TRUE FALSE
## 110 0.0 28.4 1007.8 9 8.2 0.750 FALSE TRUE
## 111 0.0 28.2 1007.9 7 9.1 0.750 TRUE FALSE
## 130 0.0 28.1 1009.0 9 9.9 0.500 FALSE TRUE
## 138 0.0 28.6 1006.1 7 5.2 0.750 TRUE FALSE
## 143 6.0 28.0 1006.6 11 4.7 0.875 TRUE FALSE
## 154 0.0 28.4 1008.8 9 6.6 0.750 TRUE FALSE
## 165 4.0 26.6 1007.0 5 0.9 0.875 TRUE FALSE
## 166 35.0 28.1 1007.7 13 4.8 0.875 FALSE TRUE
## 171 3.0 27.9 1007.6 8 2.5 0.875 FALSE TRUE
## 172 2.0 27.2 1008.1 8 1.8 0.875 TRUE FALSE
## 181 2.0 28.3 1008.1 8 6.3 0.875 TRUE FALSE
## 187 0.0 28.2 1008.0 7 5.9 0.750 TRUE FALSE
## 189 2.0 27.4 1008.4 9 1.1 0.875 TRUE FALSE
## 212 0.3 27.3 1010.2 6 3.2 0.750 TRUE FALSE
## 219 0.0 28.1 1009.6 10 5.2 0.750 TRUE FALSE
## 224 13.0 27.3 1011.0 8 6.4 0.875 TRUE FALSE
## 226 0.0 27.6 1009.1 8 6.2 0.750 TRUE TRUE
## 227 0.0 28.0 1008.5 6 7.8 0.750 TRUE FALSE
## 250 1.0 28.4 1007.0 12 7.3 0.750 TRUE FALSE
## 251 2.0 27.7 1006.7 7 0.6 0.875 TRUE FALSE
## 257 1.0 27.2 1009.5 9 0.0 0.875 TRUE FALSE
## 259 2.0 27.8 1008.3 6 1.5 0.875 TRUE FALSE
## 262 1.0 28.3 1006.3 13 7.3 0.750 FALSE TRUE
## 263 5.0 27.4 1006.6 13 3.2 0.875 TRUE FALSE
## 272 0.0 28.4 1010.4 9 5.4 0.750 TRUE FALSE
## 273 1.0 27.6 1009.8 6 1.6 0.875 TRUE FALSE
## 280 0.0 27.7 1011.2 9 7.6 0.750 FALSE TRUE
## 287 33.0 26.6 1008.2 9 2.4 0.875 TRUE FALSE
## 295 2.0 28.2 1008.1 12 3.9 0.875 TRUE FALSE
## 300 1.0 27.1 1010.3 5 0.5 0.875 TRUE FALSE
## 310 3.0 27.4 1009.3 13 5.4 0.875 TRUE FALSE
## 314 0.0 29.0 1007.2 11 9.8 0.500 FALSE TRUE
## 323 2.0 26.5 1010.4 6 1.4 0.875 TRUE FALSE
## 324 6.0 26.7 1010.7 7 3.6 0.875 TRUE FALSE
## 333 2.0 27.9 1007.6 7 0.5 0.875 TRUE FALSE
## 334 0.0 28.1 1007.2 12 6.2 0.625 TRUE FALSE
## 337 0.0 26.4 1008.7 11 6.9 0.750 FALSE TRUE
## 343 13.0 26.4 1009.8 7 2.3 0.875 TRUE FALSE
## 354 0.0 28.5 1009.7 8 8.5 0.750 TRUE FALSE
# Scatter plots of Y_imp against each predictor in the "df_new" sheet.
df_sp <- read_excel("Dataset/Dataset.xls", sheet = "df_new", na = c("Tr", "-"))
# Shared canvas: same data and theme for every panel; only the point layer
# differs between plots.
scatter_base <- ggplot(df_sp) + theme_classic()
p1 <- scatter_base + geom_point(aes(X1, Y_imp))
p2 <- scatter_base + geom_point(aes(X2, Y_imp))
p3 <- scatter_base + geom_point(aes(X3, Y_imp))
p4 <- scatter_base + geom_point(aes(X4_imp, Y_imp))
p5 <- scatter_base + geom_point(aes(X5, Y_imp))
# Adding the plot objects together arranges them in one combined figure.
p1 + p2 + p3 + p4 + p5

Pemodelan Regresi Gerombol
# Load the "df_new" sheet and drop columns 1 and 8 through 11 by position.
df <- read_excel(
  path = "Dataset/Dataset.xls",
  sheet = "df_new"
)[, -c(1, 8:11)]
# Fix the random seed. No random call is visible in this chunk, so this
# presumably pins the mixture fit that follows -- confirm.
set.seed(1)
summary(df)
## Y_imp X1 X2 X3
## Min. : 0.000 Min. :24.20 Min. :1.006 Min. : 4.000
## 1st Qu.: 0.000 1st Qu.:27.40 1st Qu.:1.008 1st Qu.: 7.000
## Median : 1.000 Median :27.80 Median :1.009 Median : 8.000
## Mean : 9.768 Mean :27.76 Mean :1.009 Mean : 8.457
## 3rd Qu.: 9.000 3rd Qu.:28.20 3rd Qu.:1.010 3rd Qu.: 9.000
## Max. :257.000 Max. :30.10 Max. :1.012 Max. :29.000
## X4_imp X5
## Min. : 0.000 Min. :0.5000
## 1st Qu.: 3.600 1st Qu.:0.7500
## Median : 5.900 Median :0.8750
## Mean : 5.588 Mean :0.8068
## 3rd Qu.: 7.700 3rd Qu.:0.8750
## Max. :10.700 Max. :1.0000
# Fit a two-component flexmix model of Y_imp on X1, X2, X3, X4_imp and X5
# with a convergence tolerance of 1e-05, then report its BIC.
clr_model <- flexmix(
  formula = Y_imp ~ X1 + X2 + X3 + X4_imp + X5,
  data = df,
  k = 2,
  control = list(tol = 1e-05)
)
BIC(clr_model)
## [1] 2386.773
# Print the fitted components: regression coefficients and sigma for each one.
clr_model@components
## $Comp.1
## $Comp.1[[1]]
## $coef
## (Intercept) X1 X2 X3 X4_imp
## 179.83171028 -0.30685054 -173.62834434 0.05623727 0.01169788
## X5
## 5.37328569
##
## $sigma
## [1] 1.13853
##
##
##
## $Comp.2
## $Comp.2[[1]]
## $coef
## (Intercept) X1 X2 X3 X4_imp X5
## 1071.566422 -4.120788 -975.668894 3.399330 -1.846850 39.320410
##
## $sigma
## [1] 30.63592
# Print the model summary: prior weights, component sizes, log-likelihood,
# AIC and BIC.
summary(clr_model)
##
## Call:
## flexmix(formula = Y_imp ~ X1 + X2 + X3 + X4_imp + X5, data = df,
## k = 2, control = list(tol = 1e-05))
##
## prior size post>0 ratio
## Comp.1 0.664 246 265 0.928
## Comp.2 0.336 115 361 0.319
##
## 'log Lik.' -1149.22 (df=15)
## AIC: 2328.439 BIC: 2386.773
Pengujian Parameter
# Refit the model and print, per component, the coefficient table with
# estimates, standard errors, z values and p-values.
summary(refit(clr_model))
## $Comp.1
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.7961e+02 7.7913e+01 2.3052 0.02115 *
## X1 -2.8406e-01 1.4682e-01 -1.9347 0.05303 .
## X2 -1.7382e+02 7.5875e+01 -2.2909 0.02197 *
## X3 5.4343e-02 2.8375e-02 1.9152 0.05547 .
## X4_imp 7.5464e-03 3.9190e-02 0.1926 0.84730
## X5 5.0935e+00 1.2160e+00 4.1887 2.806e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $Comp.2
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1071.5623 2295.4286 0.4668 0.640625
## X1 -4.1262 3.8222 -1.0795 0.280348
## X2 -975.6727 2259.4509 -0.4318 0.665873
## X3 3.3221 1.1385 2.9180 0.003522 **
## X4_imp -1.7918 1.5881 -1.1283 0.259196
## X5 39.3349 46.4021 0.8477 0.396608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Attach each observation's cluster assignment from the fitted model, then
# split the data into one data frame per cluster.
df_cluster <- df
df_cluster[["Cluster"]] <- clr_model@cluster
# which() keeps only rows where the comparison is TRUE, as subset() would.
df_cluster1 <- df_cluster[which(df_cluster[["Cluster"]] == 1), ]
df_cluster2 <- df_cluster[which(df_cluster[["Cluster"]] == 2), ]
summary(df_cluster1)
## Y_imp X1 X2 X3
## Min. :0.0000 Min. :26.10 Min. :1.006 Min. : 4.000
## 1st Qu.:0.0000 1st Qu.:27.40 1st Qu.:1.008 1st Qu.: 7.000
## Median :0.0000 Median :27.90 Median :1.009 Median : 8.000
## Mean :0.8691 Mean :27.86 Mean :1.009 Mean : 8.419
## 3rd Qu.:1.7500 3rd Qu.:28.20 3rd Qu.:1.010 3rd Qu.: 9.000
## Max. :5.0000 Max. :30.10 Max. :1.012 Max. :29.000
## X4_imp X5 Cluster
## Min. : 0.000 Min. :0.5000 Min. :1
## 1st Qu.: 4.100 1st Qu.:0.7500 1st Qu.:1
## Median : 6.350 Median :0.7500 Median :1
## Mean : 5.953 Mean :0.7795 Mean :1
## 3rd Qu.: 8.175 3rd Qu.:0.8750 3rd Qu.:1
## Max. :10.700 Max. :1.0000 Max. :1
# Variance of each variable within cluster 1: the diagonal of the covariance
# matrix, with column 7 (the Cluster label) excluded.
diag(var(df_cluster1[,-7]))
## Y_imp X1 X2 X3 X4_imp X5
## 1.630062e+00 4.195177e-01 1.300412e-06 7.770914e+00 7.593196e+00 1.009727e-02
# Summary statistics for the observations assigned to cluster 2.
summary(df_cluster2)
## Y_imp X1 X2 X3
## Min. : 4.0 Min. :24.20 Min. :1.006 Min. : 4.000
## 1st Qu.: 10.0 1st Qu.:27.10 1st Qu.:1.008 1st Qu.: 7.000
## Median : 17.0 Median :27.70 Median :1.009 Median : 8.000
## Mean : 28.8 Mean :27.56 Mean :1.009 Mean : 8.539
## 3rd Qu.: 33.5 3rd Qu.:28.10 3rd Qu.:1.010 3rd Qu.: 9.500
## Max. :257.0 Max. :29.90 Max. :1.012 Max. :21.000
## X4_imp X5 Cluster
## Min. : 0.000 Min. :0.6250 Min. :2
## 1st Qu.: 2.750 1st Qu.:0.8750 1st Qu.:2
## Median : 4.800 Median :0.8750 Median :2
## Mean : 4.809 Mean :0.8652 Mean :2
## 3rd Qu.: 6.700 3rd Qu.:0.8750 3rd Qu.:2
## Max. :10.500 Max. :1.0000 Max. :2
# Variance of each variable within cluster 2: the diagonal of the covariance
# matrix, with column 7 (the Cluster label) excluded.
diag(var(df_cluster2[,-7]))
## Y_imp X1 X2 X3 X4_imp X5
## 1.111665e+03 7.840580e-01 1.696702e-06 6.794508e+00 6.625713e+00 5.248856e-03