# Read the raw time-series CSV from the working directory, then print a
# compact per-column overview (type and first values) of the loaded table.
df <- read.csv(file = "covid_19_indonesia_time_series_all.csv")
str(df)
## 'data.frame':    31822 obs. of  38 variables:
##  $ Date                       : chr  "3/1/2020" "3/2/2020" "3/2/2020" "3/2/2020" ...
##  $ Location.ISO.Code          : chr  "ID-JK" "ID-JK" "IDN" "ID-RI" ...
##  $ Location                   : chr  "DKI Jakarta" "DKI Jakarta" "Indonesia" "Riau" ...
##  $ New.Cases                  : int  2 2 2 1 2 0 1 0 2 0 ...
##  $ New.Deaths                 : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ New.Recovered              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ New.Active.Cases           : int  2 2 2 1 2 0 0 0 2 0 ...
##  $ Total.Cases                : int  39 41 2 1 43 2 1 1 45 2 ...
##  $ Total.Deaths               : int  20 20 0 0 20 0 1 0 20 0 ...
##  $ Total.Recovered            : int  75 75 0 1 75 0 60 1 75 0 ...
##  $ Total.Active.Cases         : int  -56 -54 2 0 -52 2 -60 0 -50 2 ...
##  $ Location.Level             : chr  "Province" "Province" "Country" "Province" ...
##  $ City.or.Regency            : logi  NA NA NA NA NA NA ...
##  $ Province                   : chr  "DKI Jakarta" "DKI Jakarta" "" "Riau" ...
##  $ Country                    : chr  "Indonesia" "Indonesia" "Indonesia" "Indonesia" ...
##  $ Continent                  : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ Island                     : chr  "Jawa" "Jawa" "" "Sumatera" ...
##  $ Time.Zone                  : chr  "UTC+07:00" "UTC+07:00" "" "UTC+07:00" ...
##  $ Special.Status             : chr  "Daerah Khusus Ibu Kota" "Daerah Khusus Ibu Kota" "" "" ...
##  $ Total.Regencies            : int  1 1 416 10 1 416 18 10 1 416 ...
##  $ Total.Cities               : int  5 5 98 2 5 98 9 2 5 98 ...
##  $ Total.Districts            : int  44 44 7230 169 44 7230 627 169 44 7230 ...
##  $ Total.Urban.Villages       : int  267 267 8488 268 267 8488 645 268 267 8488 ...
##  $ Total.Rural.Villages       : int  NA NA 74953 1591 NA 74953 5312 1591 NA 74953 ...
##  $ Area..km2.                 : int  664 664 1916907 87024 664 1916907 35378 87024 664 1916907 ...
##  $ Population                 : int  10846145 10846145 265185520 6074100 10846145 265185520 45161325 6074100 10846145 265185520 ...
##  $ Population.Density         : num  16334.3 16334.3 138.3 69.8 16334.3 ...
##  $ Longitude                  : num  107 107 114 102 107 ...
##  $ Latitude                   : num  -6.205 -6.205 -0.789 0.512 -6.205 ...
##  $ New.Cases.per.Million      : num  0.18 0.18 0.01 0.16 0.18 0 0.02 0 0.18 0 ...
##  $ Total.Cases.per.Million    : num  3.6 3.78 0.01 0.16 3.96 0.01 0.02 0.16 4.15 0.01 ...
##  $ New.Deaths.per.Million     : num  0 0 0 0 0 0 0.02 0 0 0 ...
##  $ Total.Deaths.per.Million   : num  1.84 1.84 0 0 1.84 0 0.02 0 1.84 0 ...
##  $ Total.Deaths.per.100rb     : num  0.18 0.18 0 0 0.18 0 0 0 0.18 0 ...
##  $ Case.Fatality.Rate         : chr  "51.28%" "48.78%" "0.00%" "0.00%" ...
##  $ Case.Recovered.Rate        : chr  "192.31%" "182.93%" "0.00%" "100.00%" ...
##  $ Growth.Factor.of.New.Cases : num  NA 1 NA NA 1 0 NA 0 1 1 ...
##  $ Growth.Factor.of.New.Deaths: num  NA 1 NA NA 1 1 NA 1 1 1 ...
# Preview the first six rows of the raw table.
head(df, n = 6L)
##       Date Location.ISO.Code    Location New.Cases New.Deaths New.Recovered
## 1 3/1/2020             ID-JK DKI Jakarta         2          0             0
## 2 3/2/2020             ID-JK DKI Jakarta         2          0             0
## 3 3/2/2020               IDN   Indonesia         2          0             0
## 4 3/2/2020             ID-RI        Riau         1          0             0
## 5 3/3/2020             ID-JK DKI Jakarta         2          0             0
## 6 3/3/2020               IDN   Indonesia         0          0             0
##   New.Active.Cases Total.Cases Total.Deaths Total.Recovered Total.Active.Cases
## 1                2          39           20              75                -56
## 2                2          41           20              75                -54
## 3                2           2            0               0                  2
## 4                1           1            0               1                  0
## 5                2          43           20              75                -52
## 6                0           2            0               0                  2
##   Location.Level City.or.Regency    Province   Country Continent   Island
## 1       Province              NA DKI Jakarta Indonesia      Asia     Jawa
## 2       Province              NA DKI Jakarta Indonesia      Asia     Jawa
## 3        Country              NA             Indonesia      Asia         
## 4       Province              NA        Riau Indonesia      Asia Sumatera
## 5       Province              NA DKI Jakarta Indonesia      Asia     Jawa
## 6        Country              NA             Indonesia      Asia         
##   Time.Zone         Special.Status Total.Regencies Total.Cities Total.Districts
## 1 UTC+07:00 Daerah Khusus Ibu Kota               1            5              44
## 2 UTC+07:00 Daerah Khusus Ibu Kota               1            5              44
## 3                                              416           98            7230
## 4 UTC+07:00                                     10            2             169
## 5 UTC+07:00 Daerah Khusus Ibu Kota               1            5              44
## 6                                              416           98            7230
##   Total.Urban.Villages Total.Rural.Villages Area..km2. Population
## 1                  267                   NA        664   10846145
## 2                  267                   NA        664   10846145
## 3                 8488                74953    1916907  265185520
## 4                  268                 1591      87024    6074100
## 5                  267                   NA        664   10846145
## 6                 8488                74953    1916907  265185520
##   Population.Density Longitude   Latitude New.Cases.per.Million
## 1           16334.31  106.8361 -6.2046990                  0.18
## 2           16334.31  106.8361 -6.2046990                  0.18
## 3             138.34  113.9213 -0.7892750                  0.01
## 4              69.80  101.8051  0.5116479                  0.16
## 5           16334.31  106.8361 -6.2046990                  0.18
## 6             138.34  113.9213 -0.7892750                  0.00
##   Total.Cases.per.Million New.Deaths.per.Million Total.Deaths.per.Million
## 1                    3.60                      0                     1.84
## 2                    3.78                      0                     1.84
## 3                    0.01                      0                     0.00
## 4                    0.16                      0                     0.00
## 5                    3.96                      0                     1.84
## 6                    0.01                      0                     0.00
##   Total.Deaths.per.100rb Case.Fatality.Rate Case.Recovered.Rate
## 1                   0.18             51.28%             192.31%
## 2                   0.18             48.78%             182.93%
## 3                   0.00              0.00%               0.00%
## 4                   0.00              0.00%             100.00%
## 5                   0.18             46.51%             174.42%
## 6                   0.00              0.00%               0.00%
##   Growth.Factor.of.New.Cases Growth.Factor.of.New.Deaths
## 1                         NA                          NA
## 2                          1                           1
## 3                         NA                          NA
## 4                         NA                          NA
## 5                          1                           1
## 6                          0                           1
# Keep only the province-level rows (the file also holds country-level rows).
# `%in%` is used instead of `==` because `==` returns NA for a missing
# Location.Level, and indexing rows with NA would insert all-NA rows.
df <- df[df$Location.Level %in% "Province", ]
# List the column names that are available for the analysis.
colnames(df)
##  [1] "Date"                        "Location.ISO.Code"          
##  [3] "Location"                    "New.Cases"                  
##  [5] "New.Deaths"                  "New.Recovered"              
##  [7] "New.Active.Cases"            "Total.Cases"                
##  [9] "Total.Deaths"                "Total.Recovered"            
## [11] "Total.Active.Cases"          "Location.Level"             
## [13] "City.or.Regency"             "Province"                   
## [15] "Country"                     "Continent"                  
## [17] "Island"                      "Time.Zone"                  
## [19] "Special.Status"              "Total.Regencies"            
## [21] "Total.Cities"                "Total.Districts"            
## [23] "Total.Urban.Villages"        "Total.Rural.Villages"       
## [25] "Area..km2."                  "Population"                 
## [27] "Population.Density"          "Longitude"                  
## [29] "Latitude"                    "New.Cases.per.Million"      
## [31] "Total.Cases.per.Million"     "New.Deaths.per.Million"     
## [33] "Total.Deaths.per.Million"    "Total.Deaths.per.100rb"     
## [35] "Case.Fatality.Rate"          "Case.Recovered.Rate"        
## [37] "Growth.Factor.of.New.Cases"  "Growth.Factor.of.New.Deaths"
# Narrow the table to the province name plus the daily counts, cumulative
# counts and case-fatality rate used further down.
df_selected <- df[
  ,
  c(
    "Location", "New.Cases", "New.Deaths",
    "Total.Cases", "Total.Deaths", "Case.Fatality.Rate"
  ),
  drop = FALSE
]
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-province averages of the daily counts, the cumulative counts and the
# case-fatality rate.
# Case.Fatality.Rate is read from the CSV as text such as "51.28%", so the
# percent sign is stripped and the remainder converted to a number before
# averaging; mean() on the raw strings only returns NA with a warning for
# every province. The resulting `cfr` is expressed in percent.
df_final <- df_selected %>%
  group_by(Location) %>%
  summarise(
    cases = mean(New.Cases, na.rm = TRUE),
    deaths = mean(New.Deaths, na.rm = TRUE),
    total_cases = mean(Total.Cases, na.rm = TRUE),
    total_deaths = mean(Total.Deaths, na.rm = TRUE),
    cfr = mean(
      as.numeric(sub("%", "", Case.Fatality.Rate, fixed = TRUE)),
      na.rm = TRUE
    )
  )
# Preview the summary; the printed table is regenerated when the script is
# re-run.
head(df_final)
# Rebuild the per-province summary with the four numeric averages only:
# one row per Location, each value a mean over that province's rows.
df_final <- summarise(
  group_by(df_selected, Location),
  cases = mean(New.Cases, na.rm = TRUE),
  deaths = mean(New.Deaths, na.rm = TRUE),
  total_cases = mean(Total.Cases, na.rm = TRUE),
  total_deaths = mean(Total.Deaths, na.rm = TRUE)
)
# Centre and scale every column except the first one (Location) so that all
# variables contribute on a comparable scale to the distance computations.
df_scaled <- scale(df_final[, -1])
# Elbow curve: total within-cluster sum of squares for k = 1..10.
# kmeans() starts from randomly chosen centres, so the seed is fixed to make
# the curve reproducible and several restarts (nstart) are used so that one
# poor start cannot distort a point on the curve. vapply() guarantees a
# numeric vector with exactly one value per k.
set.seed(123)
k_values <- 1:10
wss <- vapply(
  k_values,
  function(k) kmeans(df_scaled, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

plot(
  k_values, wss,
  type = "b",
  xlab = "Number of clusters (k)",
  ylab = "Total within-cluster sum of squares"
)

# Final k-means fit with three clusters, stored per province in df_final.
# The fixed seed and multiple restarts make the partition reproducible from
# run to run; the cluster numbers themselves are arbitrary labels.
set.seed(123)
km <- kmeans(df_scaled, centers = 3, nstart = 25)
df_final$cluster <- km$cluster

# Preview the provinces together with their k-means cluster labels.
head(df_final)
## # A tibble: 6 × 6
##   Location                    cases deaths total_cases total_deaths cluster
##   <chr>                       <dbl>  <dbl>       <dbl>        <dbl>   <int>
## 1 Aceh                         48.7  2.46       22082.        1085.       3
## 2 Bali                        182.   5.15       72087.        2195.       2
## 3 Banten                      361.   3.19      108872.        1544.       2
## 4 Bengkulu                     32.5  0.581      13735.         270.       3
## 5 DKI Jakarta                1520.  16.7       587520.        8259.       1
## 6 Daerah Istimewa Yogyakarta  245.   6.49       94191.        2719.       2
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
## 
##     bclust
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(dplyr)
# Fit four alternative clusterings on the same standardised matrix:
# k-medians (3 clusters), DBSCAN, mean shift and fuzzy c-means (3 centres).
# NOTE(review): kcca() and cmeans() presumably start from random centres, so
# results may vary between runs unless a seed is set first; confirm.
kmed <- flexclust::kcca(
  df_scaled,
  k = 3,
  family = flexclust::kccaFamily("kmedians")
)
db <- dbscan::dbscan(
  df_scaled,
  eps = 0.5,
  minPts = 3
)
ms <- meanShiftR::meanShift(df_scaled)
fcm <- e1071::cmeans(
  df_scaled,
  centers = 3
)
# Average silhouette width of each clustering, all measured on the same
# distance matrix of the standardised data (computed once and reused).
# NOTE(review): dbscan labels noise points as cluster 0; they are scored here
# as if they formed a cluster of their own. Confirm this is intended.
scaled_dist <- dist(df_scaled)
mean_sil_width <- function(labels) {
  # Column 3 of a silhouette object holds the per-observation widths.
  mean(silhouette(labels, scaled_dist)[, 3])
}
sil_km <- mean_sil_width(km$cluster)
sil_kmed <- mean_sil_width(clusters(kmed))
sil_fcm <- mean_sil_width(fcm$cluster)
sil_db <- mean_sil_width(db$cluster)
sil_ms <- mean_sil_width(ms$assignment)
# Comparison table: one row per clustering method with its average
# silhouette width. Scores are first collected in a named vector so that
# each method name sits next to its own score.
silhouette_by_method <- c(
  "K-means" = sil_km,
  "K-median" = sil_kmed,
  "DBSCAN" = sil_db,
  "Mean Shift" = sil_ms,
  "Fuzzy C-means" = sil_fcm
)
hasil <- data.frame(
  Metode = names(silhouette_by_method),
  Silhouette = unname(silhouette_by_method)
)

# Print the comparison table.
hasil
##          Metode Silhouette
## 1       K-means  0.5721922
## 2      K-median  0.5721922
## 3        DBSCAN  0.7971977
## 4    Mean Shift  0.9031243
## 5 Fuzzy C-means  0.9031243
# Label every province with its mean-shift cluster. ms$assignment comes back
# as a one-column matrix; it is flattened to a plain integer vector so the
# column is an ordinary `cluster` column rather than a matrix column printed
# as `cluster[,1]`.
df_final$cluster <- as.integer(ms$assignment)

# Cluster profiles: mean of each numeric summary per cluster. The numeric
# columns are named explicitly because `. ~ cluster` also pulls the text
# column Location into the left-hand side, which made every mean NA.
aggregate(
  cbind(cases, deaths, total_cases, total_deaths) ~ cluster,
  data = df_final,
  FUN = mean
)

# Provinces listed in cluster order.
df_final[order(df_final$cluster), ]

# Scatter plot of the standardised data coloured by mean-shift cluster
# (plot() on a matrix draws its first two columns, here cases and deaths).
# Printed results above are regenerated when the script is re-run.
plot(df_scaled, col = df_final$cluster)