setwd('D:/Rakha/Skripsi')
#####
# C Berjalan
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(missForest)
## Warning: package 'missForest' was built under R version 4.2.2
## 
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
## 
##     nrmse
load("Rdata Fix/df_simulasi.Rdata")
df <- df_simulasi[,-7]
set.seed(23)
df_2prsn <- prodNA(df, noNA = 0.02)

best_k <- function(data_mis, data, k_seq){
  library(VIM)
  library(missForest)
  listerr <- c() #Create a list in which to save BIC.
  for(i in 1:length(k_seq)){
    df_imp <- VIM::kNN(data_mis, k = k_seq[i])[,1:6]
    err <- mixError(df_imp, data_mis, data)
    listerr[i] <- err # save rmse into the vector
  }
  return(listerr) #Return the list of rmse.
}

k_seq <- seq(1,23,2)
k_terbaik <- best_k(df_2prsn, df, k_seq = k_seq)
plot(k_terbaik)

#####
# Proporsi NA Berjalan
library(missForest)
library(VIM)
library(flexmix)
## Loading required package: lattice
get_BIC <- function(proporsi_na, data){
  
  listBIC <- c() #Create a list in which to save BIC.
  
  for(i in 1:length(proporsi_na)){ #Loop through the numbers missing value proportion
    set.seed(23)
    df <- prodNA(data, noNA = proporsi_na[i])
    df_imp <- VIM::kNN(df, k = 5)[,1:6]
    set.seed(23)
    model <- flexmix(y ~ x1 + x2 + x3 + x4 + x5, data = df_imp, k = 2,
                     control = list(iter = 200, tol = 0.00001, verbose = 0))
    BIC_model <- BIC(model)
    listBIC[i] <- BIC_model # save BIC into the list
  }
  
  return(listBIC) #Return the list of BIC
}

na_prop <- seq(0.005, 0.1, by = 0.005)
BIC_all <- get_BIC(proporsi_na = na_prop, data = df)
plot(BIC_all)

# Import Library
library(flexmix)
library(readxl)

#Import Dataset
df_raw <- read_excel("Dataset.xls", sheet = "df", na = c("Tr", "-"))
df_raw <- df_raw[,-c(1,8,9)]

#Exploratory Data Analysis
summary(df_raw)
##        Y                X1              X2              X3        
##  Min.   :  0.00   Min.   :24.20   Min.   :1.006   Min.   : 4.000  
##  1st Qu.:  0.00   1st Qu.:27.40   1st Qu.:1.008   1st Qu.: 7.000  
##  Median :  1.00   Median :27.80   Median :1.009   Median : 8.000  
##  Mean   : 10.71   Mean   :27.76   Mean   :1.009   Mean   : 8.457  
##  3rd Qu.: 11.30   3rd Qu.:28.20   3rd Qu.:1.010   3rd Qu.: 9.000  
##  Max.   :257.00   Max.   :30.10   Max.   :1.012   Max.   :29.000  
##  NA's   :44                                                       
##        X4               X5        
##  Min.   : 0.000   Min.   :0.5000  
##  1st Qu.: 3.600   1st Qu.:0.7500  
##  Median : 5.800   Median :0.8750  
##  Mean   : 5.552   Mean   :0.8068  
##  3rd Qu.: 7.600   3rd Qu.:0.8750  
##  Max.   :10.700   Max.   :1.0000  
##  NA's   :12
#Impute Missing Value
library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.2.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'DMwR2'
## The following object is masked from 'package:VIM':
## 
##     kNN
load("Last/KNN Imputation.Rdata")
df <- knni(as.data.frame(df_raw), k = 19, scale = F)
df <- df[,-c(7,8)]
summary(df)
##        Y                X1              X2              X3        
##  Min.   :  0.00   Min.   :24.20   Min.   :1.006   Min.   : 4.000  
##  1st Qu.:  0.00   1st Qu.:27.40   1st Qu.:1.008   1st Qu.: 7.000  
##  Median :  2.00   Median :27.80   Median :1.009   Median : 8.000  
##  Mean   : 10.63   Mean   :27.76   Mean   :1.009   Mean   : 8.457  
##  3rd Qu.: 12.28   3rd Qu.:28.20   3rd Qu.:1.010   3rd Qu.: 9.000  
##  Max.   :257.00   Max.   :30.10   Max.   :1.012   Max.   :29.000  
##        X4               X5        
##  Min.   : 0.000   Min.   :0.5000  
##  1st Qu.: 3.700   1st Qu.:0.7500  
##  Median : 5.900   Median :0.8750  
##  Mean   : 5.577   Mean   :0.8068  
##  3rd Qu.: 7.600   3rd Qu.:0.8750  
##  Max.   :10.700   Max.   :1.0000
#Modelling
kontrol <- list(tolerance = 10^-5, iter.max = 40)
clr_model_step <- stepFlexmix(Y ~ X1 + X2 + X3 + X4 + X5, data = df, k = 1:6, nrep = 10, verbose = TRUE, control = kontrol)
## 1 : * * * * * * * * * *
## 2 : * * * * * * * * * *
## 3 : *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     34 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     34 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     33 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     34 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     33 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     34 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     34 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     34 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     33 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     33 Log-likelihood: NaN
## 
## 4 : *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     29 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     31 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     30 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     31 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     28 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     33 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     30 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     28 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     28 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     29 Log-likelihood: NaN
## 
## 5 : *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     25 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     23 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     25 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     25 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     26 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     26 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     25 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
## 
## 6 : *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     20 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     23 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     21 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     22 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     24 Log-likelihood: NaN
##  *Error in FLXfit(model = model, concomitant = concomitant, control = control,  : 
##     26 Log-likelihood: NaN
set.seed(23)
clr_model <- flexmix(Y ~ X1 + X2 + X3 + X4 + X5, data = df, k = 2)

clr_model@components$Comp.1
## [[1]]
## $coef
##   (Intercept)            X1            X2            X3            X4 
##  15.275147861  -0.109014031 -12.922446861  -0.001235282   0.016766274 
##            X5 
##   1.161101017 
## 
## $sigma
## [1] 0.3222568
clr_model@components$Comp.2
## [[1]]
## $coef
## (Intercept)          X1          X2          X3          X4          X5 
##  19.9903220  -5.0393227  61.3241855   1.8023989  -0.0250367  71.8848454 
## 
## $sigma
## [1] 26.392
summary(clr_model)
## 
## Call:
## flexmix(formula = Y ~ X1 + X2 + X3 + X4 + X5, data = df, k = 2)
## 
##        prior size post>0 ratio
## Comp.1 0.438  164    171 0.959
## Comp.2 0.562  197    361 0.546
## 
## 'log Lik.' -1219.73 (df=15)
## AIC: 2469.461   BIC: 2527.794
parameters(clr_model, component = 1)
##                         Comp.1
## coef.(Intercept)  15.275147861
## coef.X1           -0.109014031
## coef.X2          -12.922446861
## coef.X3           -0.001235282
## coef.X4            0.016766274
## coef.X5            1.161101017
## sigma              0.322256805
parameters(clr_model, component = 2)
##                      Comp.2
## coef.(Intercept) 19.9903220
## coef.X1          -5.0393227
## coef.X2          61.3241855
## coef.X3           1.8023989
## coef.X4          -0.0250367
## coef.X5          71.8848454
## sigma            26.3920010
#Uji Signifikansi Parameter
rmodel <- refit(clr_model)
summary(rmodel)
## $Comp.1
##                Estimate  Std. Error z value  Pr(>|z|)    
## (Intercept)  15.2722890  24.1262058  0.6330 0.5267228    
## X1           -0.1089509   0.0457321 -2.3824 0.0172014 *  
## X2          -12.9216838  23.6400780 -0.5466 0.5846531    
## X3           -0.0012378   0.0099958 -0.1238 0.9014443    
## X4            0.0169297   0.0143471  1.1800 0.2379964    
## X5            1.1582056   0.3198297  3.6213 0.0002931 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $Comp.2
##                Estimate  Std. Error z value Pr(>|z|)   
## (Intercept)   19.990337 1606.451393  0.0124 0.990072   
## X1            -5.039498    2.682002 -1.8790 0.060244 . 
## X2            61.324203 1580.512026  0.0388 0.969050   
## X3             1.801952    0.689642  2.6129 0.008978 **
## X4            -0.027047    0.960487 -0.0282 0.977534   
## X5            71.885330   29.253023  2.4574 0.013996 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Clustered_dataset <- cbind(df, clr_model@cluster)
colnames(Clustered_dataset) <- c("Y", "X1", "X2", "X3", "X4", "X5", "Cluster")

Gerombol1 <- Clustered_dataset[Clustered_dataset$Cluster == 1,]
Gerombol2 <- Clustered_dataset[Clustered_dataset$Cluster == 2,]

summary(Gerombol1[,-7]) 
##        Y                X1              X2              X3        
##  Min.   :0.0000   Min.   :26.10   Min.   :1.006   Min.   : 4.000  
##  1st Qu.:0.0000   1st Qu.:27.60   1st Qu.:1.008   1st Qu.: 7.000  
##  Median :0.0000   Median :27.90   Median :1.009   Median : 8.000  
##  Mean   :0.1847   Mean   :27.92   Mean   :1.009   Mean   : 8.372  
##  3rd Qu.:0.0000   3rd Qu.:28.30   3rd Qu.:1.010   3rd Qu.: 9.000  
##  Max.   :1.0000   Max.   :30.10   Max.   :1.012   Max.   :29.000  
##        X4               X5        
##  Min.   : 0.000   Min.   :0.5000  
##  1st Qu.: 5.100   1st Qu.:0.7500  
##  Median : 7.000   Median :0.7500  
##  Mean   : 6.524   Mean   :0.7553  
##  3rd Qu.: 8.525   3rd Qu.:0.8750  
##  Max.   :10.700   Max.   :0.8750
round(diag(var(Gerombol1)), 6)
##        Y       X1       X2       X3       X4       X5  Cluster 
## 0.129234 0.404278 0.000001 7.462031 6.857422 0.010612 0.000000
summary(Gerombol2[,-7])
##        Y                 X1              X2              X3        
##  Min.   :  1.307   Min.   :24.20   Min.   :1.006   Min.   : 4.000  
##  1st Qu.:  4.000   1st Qu.:27.20   1st Qu.:1.008   1st Qu.: 7.000  
##  Median : 10.988   Median :27.70   Median :1.009   Median : 8.000  
##  Mean   : 19.319   Mean   :27.63   Mean   :1.009   Mean   : 8.528  
##  3rd Qu.: 22.000   3rd Qu.:28.10   3rd Qu.:1.010   3rd Qu.:10.000  
##  Max.   :257.000   Max.   :29.90   Max.   :1.012   Max.   :21.000  
##        X4               X5        
##  Min.   : 0.000   Min.   :0.6250  
##  1st Qu.: 2.800   1st Qu.:0.7500  
##  Median : 5.100   Median :0.8750  
##  Mean   : 4.789   Mean   :0.8496  
##  3rd Qu.: 6.700   3rd Qu.:0.8750  
##  Max.   :10.500   Max.   :1.0000
round(diag(var(Gerombol2)), 6)
##          Y         X1         X2         X3         X4         X5    Cluster 
## 785.270767   0.640014   0.000001   7.454574   6.575821   0.005730   0.000000
boxplot(Y~ as.factor(Cluster), ylab = "Jumlah Curah Hujan Harian", xlab = "Gerombol", 
        data = Clustered_dataset, outline = F)

#writexl::write_xlsx(Clustered_dataset, "Last/Hasil Gerombol.xlsx")