# Read the Indonesian COVID-19 time series CSV (path relative to the working
# directory) into a data frame.
df <- read.csv(file = "covid_19_indonesia_time_series_all.csv")
# Print the structure of the loaded data: column names, types, first values.
str(object = df)
## 'data.frame': 31822 obs. of 38 variables:
## $ Date : chr "3/1/2020" "3/2/2020" "3/2/2020" "3/2/2020" ...
## $ Location.ISO.Code : chr "ID-JK" "ID-JK" "IDN" "ID-RI" ...
## $ Location : chr "DKI Jakarta" "DKI Jakarta" "Indonesia" "Riau" ...
## $ New.Cases : int 2 2 2 1 2 0 1 0 2 0 ...
## $ New.Deaths : int 0 0 0 0 0 0 1 0 0 0 ...
## $ New.Recovered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ New.Active.Cases : int 2 2 2 1 2 0 0 0 2 0 ...
## $ Total.Cases : int 39 41 2 1 43 2 1 1 45 2 ...
## $ Total.Deaths : int 20 20 0 0 20 0 1 0 20 0 ...
## $ Total.Recovered : int 75 75 0 1 75 0 60 1 75 0 ...
## $ Total.Active.Cases : int -56 -54 2 0 -52 2 -60 0 -50 2 ...
## $ Location.Level : chr "Province" "Province" "Country" "Province" ...
## $ City.or.Regency : logi NA NA NA NA NA NA ...
## $ Province : chr "DKI Jakarta" "DKI Jakarta" "" "Riau" ...
## $ Country : chr "Indonesia" "Indonesia" "Indonesia" "Indonesia" ...
## $ Continent : chr "Asia" "Asia" "Asia" "Asia" ...
## $ Island : chr "Jawa" "Jawa" "" "Sumatera" ...
## $ Time.Zone : chr "UTC+07:00" "UTC+07:00" "" "UTC+07:00" ...
## $ Special.Status : chr "Daerah Khusus Ibu Kota" "Daerah Khusus Ibu Kota" "" "" ...
## $ Total.Regencies : int 1 1 416 10 1 416 18 10 1 416 ...
## $ Total.Cities : int 5 5 98 2 5 98 9 2 5 98 ...
## $ Total.Districts : int 44 44 7230 169 44 7230 627 169 44 7230 ...
## $ Total.Urban.Villages : int 267 267 8488 268 267 8488 645 268 267 8488 ...
## $ Total.Rural.Villages : int NA NA 74953 1591 NA 74953 5312 1591 NA 74953 ...
## $ Area..km2. : int 664 664 1916907 87024 664 1916907 35378 87024 664 1916907 ...
## $ Population : int 10846145 10846145 265185520 6074100 10846145 265185520 45161325 6074100 10846145 265185520 ...
## $ Population.Density : num 16334.3 16334.3 138.3 69.8 16334.3 ...
## $ Longitude : num 107 107 114 102 107 ...
## $ Latitude : num -6.205 -6.205 -0.789 0.512 -6.205 ...
## $ New.Cases.per.Million : num 0.18 0.18 0.01 0.16 0.18 0 0.02 0 0.18 0 ...
## $ Total.Cases.per.Million : num 3.6 3.78 0.01 0.16 3.96 0.01 0.02 0.16 4.15 0.01 ...
## $ New.Deaths.per.Million : num 0 0 0 0 0 0 0.02 0 0 0 ...
## $ Total.Deaths.per.Million : num 1.84 1.84 0 0 1.84 0 0.02 0 1.84 0 ...
## $ Total.Deaths.per.100rb : num 0.18 0.18 0 0 0.18 0 0 0 0.18 0 ...
## $ Case.Fatality.Rate : chr "51.28%" "48.78%" "0.00%" "0.00%" ...
## $ Case.Recovered.Rate : chr "192.31%" "182.93%" "0.00%" "100.00%" ...
## $ Growth.Factor.of.New.Cases : num NA 1 NA NA 1 0 NA 0 1 1 ...
## $ Growth.Factor.of.New.Deaths: num NA 1 NA NA 1 1 NA 1 1 1 ...
# Preview the first six rows of the raw data.
head(df, n = 6L)
## Date Location.ISO.Code Location New.Cases New.Deaths New.Recovered
## 1 3/1/2020 ID-JK DKI Jakarta 2 0 0
## 2 3/2/2020 ID-JK DKI Jakarta 2 0 0
## 3 3/2/2020 IDN Indonesia 2 0 0
## 4 3/2/2020 ID-RI Riau 1 0 0
## 5 3/3/2020 ID-JK DKI Jakarta 2 0 0
## 6 3/3/2020 IDN Indonesia 0 0 0
## New.Active.Cases Total.Cases Total.Deaths Total.Recovered Total.Active.Cases
## 1 2 39 20 75 -56
## 2 2 41 20 75 -54
## 3 2 2 0 0 2
## 4 1 1 0 1 0
## 5 2 43 20 75 -52
## 6 0 2 0 0 2
## Location.Level City.or.Regency Province Country Continent Island
## 1 Province NA DKI Jakarta Indonesia Asia Jawa
## 2 Province NA DKI Jakarta Indonesia Asia Jawa
## 3 Country NA Indonesia Asia
## 4 Province NA Riau Indonesia Asia Sumatera
## 5 Province NA DKI Jakarta Indonesia Asia Jawa
## 6 Country NA Indonesia Asia
## Time.Zone Special.Status Total.Regencies Total.Cities Total.Districts
## 1 UTC+07:00 Daerah Khusus Ibu Kota 1 5 44
## 2 UTC+07:00 Daerah Khusus Ibu Kota 1 5 44
## 3 416 98 7230
## 4 UTC+07:00 10 2 169
## 5 UTC+07:00 Daerah Khusus Ibu Kota 1 5 44
## 6 416 98 7230
## Total.Urban.Villages Total.Rural.Villages Area..km2. Population
## 1 267 NA 664 10846145
## 2 267 NA 664 10846145
## 3 8488 74953 1916907 265185520
## 4 268 1591 87024 6074100
## 5 267 NA 664 10846145
## 6 8488 74953 1916907 265185520
## Population.Density Longitude Latitude New.Cases.per.Million
## 1 16334.31 106.8361 -6.2046990 0.18
## 2 16334.31 106.8361 -6.2046990 0.18
## 3 138.34 113.9213 -0.7892750 0.01
## 4 69.80 101.8051 0.5116479 0.16
## 5 16334.31 106.8361 -6.2046990 0.18
## 6 138.34 113.9213 -0.7892750 0.00
## Total.Cases.per.Million New.Deaths.per.Million Total.Deaths.per.Million
## 1 3.60 0 1.84
## 2 3.78 0 1.84
## 3 0.01 0 0.00
## 4 0.16 0 0.00
## 5 3.96 0 1.84
## 6 0.01 0 0.00
## Total.Deaths.per.100rb Case.Fatality.Rate Case.Recovered.Rate
## 1 0.18 51.28% 192.31%
## 2 0.18 48.78% 182.93%
## 3 0.00 0.00% 0.00%
## 4 0.00 0.00% 100.00%
## 5 0.18 46.51% 174.42%
## 6 0.00 0.00% 0.00%
## Growth.Factor.of.New.Cases Growth.Factor.of.New.Deaths
## 1 NA NA
## 2 1 1
## 3 NA NA
## 4 NA NA
## 5 1 1
## 6 0 1
# Keep only the province-level rows (the file also contains "Country" rows).
# `%in%` never returns NA, so a missing Location.Level cannot slip through as
# an all-NA row the way it does when `==` is used inside `[`.
df <- df[df$Location.Level %in% "Province", ]
# List the available columns before choosing the ones to analyse.
colnames(df)
## [1] "Date" "Location.ISO.Code"
## [3] "Location" "New.Cases"
## [5] "New.Deaths" "New.Recovered"
## [7] "New.Active.Cases" "Total.Cases"
## [9] "Total.Deaths" "Total.Recovered"
## [11] "Total.Active.Cases" "Location.Level"
## [13] "City.or.Regency" "Province"
## [15] "Country" "Continent"
## [17] "Island" "Time.Zone"
## [19] "Special.Status" "Total.Regencies"
## [21] "Total.Cities" "Total.Districts"
## [23] "Total.Urban.Villages" "Total.Rural.Villages"
## [25] "Area..km2." "Population"
## [27] "Population.Density" "Longitude"
## [29] "Latitude" "New.Cases.per.Million"
## [31] "Total.Cases.per.Million" "New.Deaths.per.Million"
## [33] "Total.Deaths.per.Million" "Total.Deaths.per.100rb"
## [35] "Case.Fatality.Rate" "Case.Recovered.Rate"
## [37] "Growth.Factor.of.New.Cases" "Growth.Factor.of.New.Deaths"
# Keep the location label plus the case, death and fatality-rate columns that
# the per-province summary uses.
df_selected <- df[c(
  "Location",
  "New.Cases",
  "New.Deaths",
  "Total.Cases",
  "Total.Deaths",
  "Case.Fatality.Rate"
)]
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Average the daily and cumulative figures per province.
#
# Case.Fatality.Rate is stored as text such as "51.28%", so calling mean() on
# the raw column returns NA with a warning for every group (the warnings and
# NA `cfr` column in the transcript recorded below come from that earlier
# run). Strip the percent sign and convert to numeric before averaging; the
# resulting `cfr` is expressed in percent.
df_final <- df_selected %>%
  group_by(Location) %>%
  summarise(
    cases = mean(New.Cases, na.rm = TRUE),
    deaths = mean(New.Deaths, na.rm = TRUE),
    total_cases = mean(Total.Cases, na.rm = TRUE),
    total_deaths = mean(Total.Deaths, na.rm = TRUE),
    cfr = mean(
      as.numeric(sub("%", "", Case.Fatality.Rate, fixed = TRUE)),
      na.rm = TRUE
    )
  )
## Warning: There were 34 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `cfr = mean(Case.Fatality.Rate, na.rm = TRUE)`.
## ℹ In group 1: `Location = "Aceh"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 33 remaining warnings.
head(df_final)
## # A tibble: 6 × 6
## Location cases deaths total_cases total_deaths cfr
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aceh 48.7 2.46 22082. 1085. NA
## 2 Bali 182. 5.15 72087. 2195. NA
## 3 Banten 361. 3.19 108872. 1544. NA
## 4 Bengkulu 32.5 0.581 13735. 270. NA
## 5 DKI Jakarta 1520. 16.7 587520. 8259. NA
## 6 Daerah Istimewa Yogyakarta 245. 6.49 94191. 2719. NA
# Per-province averages of the daily and cumulative case/death counts.
# The fatality-rate column is not summarised here.
df_final <- summarise(
  group_by(df_selected, Location),
  cases = mean(New.Cases, na.rm = TRUE),
  deaths = mean(New.Deaths, na.rm = TRUE),
  total_cases = mean(Total.Cases, na.rm = TRUE),
  total_deaths = mean(Total.Deaths, na.rm = TRUE)
)
# Standardise the numeric summaries (column 1 is Location and is dropped) so
# that each variable has mean 0 and standard deviation 1 before clustering.
df_scaled <- scale(df_final[, -1])

# Elbow curve: total within-cluster sum of squares for k = 1..10.
# kmeans() starts from randomly chosen centres, so fix the seed and use
# several restarts per k; with a single unseeded start the curve changes from
# run to run and is not guaranteed to decrease with k.
set.seed(123)
k_values <- 1:10
wss <- vapply(
  k_values,
  function(k) kmeans(df_scaled, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
plot(
  k_values, wss,
  type = "b",
  xlab = "Number of clusters (k)",
  ylab = "Total within-cluster sum of squares"
)

# Final k-means partition with three clusters.
# Seed the RNG and use several random restarts so the assignment is
# reproducible and not tied to one random initialisation. The cluster ids are
# arbitrary labels and may be numbered differently from an earlier run.
set.seed(123)
km <- kmeans(df_scaled, centers = 3, nstart = 25)
df_final$cluster <- km$cluster
head(df_final)
## # A tibble: 6 × 6
## Location cases deaths total_cases total_deaths cluster
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Aceh 48.7 2.46 22082. 1085. 3
## 2 Bali 182. 5.15 72087. 2195. 2
## 3 Banten 361. 3.19 108872. 1544. 2
## 4 Bengkulu 32.5 0.581 13735. 270. 3
## 5 DKI Jakarta 1520. 16.7 587520. 8259. 1
## 6 Daerah Istimewa Yogyakarta 245. 6.49 94191. 2719. 2
# Preview the first six provinces together with their cluster label.
head(df_final, n = 6L)
## # A tibble: 6 × 6
## Location cases deaths total_cases total_deaths cluster
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Aceh 48.7 2.46 22082. 1085. 3
## 2 Bali 182. 5.15 72087. 2195. 2
## 3 Banten 361. 3.19 108872. 1544. 2
## 4 Bengkulu 32.5 0.581 13735. 270. 3
## 5 DKI Jakarta 1520. 16.7 587520. 8259. 1
## 6 Daerah Istimewa Yogyakarta 245. 6.49 94191. 2719. 2
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
##
## bclust
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(dplyr)
# Fit the four alternative clusterings on the same standardised matrix.
# kcca() and cmeans() start from random centres, so seed the RNG first to
# make the comparison reproducible.
set.seed(123)
kmed <- kcca(df_scaled, k = 3, family = kccaFamily("kmedians"))
db <- dbscan(df_scaled, eps = 0.5, minPts = 3)
ms <- meanShift(df_scaled)
fcm <- cmeans(df_scaled, centers = 3)

# The pairwise distances are the same for every method; compute them once.
dist_scaled <- dist(df_scaled)

# Mean silhouette width of one labelling.
#   labels: one cluster id per row (a vector or a one-column matrix).
#   d:      "dist" object over the same rows, in the same order.
#   noise:  optional label that marks unclustered points; those rows are
#           dropped so they are not scored as a cluster of their own.
# Returns NA_real_ when fewer than two clusters remain, or when silhouette()
# does not return a width matrix for the labelling.
mean_silhouette <- function(labels, d, noise = NULL) {
  labels <- as.integer(labels)
  keep <- if (is.null(noise)) rep(TRUE, length(labels)) else labels != noise
  labels <- labels[keep]
  if (length(unique(labels)) < 2L) {
    return(NA_real_)
  }
  d_keep <- as.dist(as.matrix(d)[keep, keep, drop = FALSE])
  sil <- silhouette(labels, d_keep)
  if (!inherits(sil, "silhouette")) {
    return(NA_real_)
  }
  mean(sil[, "sil_width"])
}

sil_km <- mean_silhouette(km$cluster, dist_scaled)
sil_kmed <- mean_silhouette(clusters(kmed), dist_scaled)
sil_fcm <- mean_silhouette(fcm$cluster, dist_scaled)
# dbscan() reports noise points as cluster 0; exclude them from the score.
sil_db <- mean_silhouette(db$cluster, dist_scaled, noise = 0L)
sil_ms <- mean_silhouette(ms$assignment, dist_scaled)
# Collect the mean silhouette width of every method in one comparison table.
hasil <- local({
  skor <- c(
    "K-means" = sil_km,
    "K-median" = sil_kmed,
    "DBSCAN" = sil_db,
    "Mean Shift" = sil_ms,
    "Fuzzy C-means" = sil_fcm
  )
  data.frame(Metode = names(skor), Silhouette = unname(skor))
})
hasil
## Metode Silhouette
## 1 K-means 0.5721922
## 2 K-median 0.5721922
## 3 DBSCAN 0.7971977
## 4 Mean Shift 0.9031243
## 5 Fuzzy C-means 0.9031243
# Label each province with its mean-shift cluster. The assignment arrives as
# a one-column matrix (hence the `cluster[,1]` header in the printout recorded
# further down), so flatten it to a plain integer vector before storing it.
df_final$cluster <- as.integer(ms$assignment)

# Per-cluster means of the numeric summaries only. Location is a character
# column; pulling it in through `. ~ cluster` made every mean NA with a
# warning per column and cluster (see the warnings recorded below, which
# predate this fix), so the numeric columns are named explicitly.
aggregate(
  cbind(cases, deaths, total_cases, total_deaths) ~ cluster,
  data = df_final,
  FUN = mean
)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## cluster Location cases deaths total_cases total_deaths
## 1 1 NA NA NA NA NA
## 2 2 NA NA NA NA NA
## 3 3 NA NA NA NA NA
# Show the provinces sorted by cluster label.
df_final[order(df_final[["cluster"]]), ]
## # A tibble: 34 × 6
## Location cases deaths total_cases total_deaths cluster[,1]
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aceh 48.7 2.46 22082. 1085. 1
## 2 Bali 182. 5.15 72087. 2195. 1
## 3 Banten 361. 3.19 108872. 1544. 1
## 4 Bengkulu 32.5 0.581 13735. 270. 1
## 5 Daerah Istimewa Yogyakarta 245. 6.49 94191. 2719. 1
## 6 Gorontalo 15.7 0.548 7597. 263. 1
## 7 Jambi 42.5 0.977 17253. 402. 1
## 8 Kalimantan Barat 72.8 1.25 25975. 519. 1
## 9 Kalimantan Selatan 97.2 2.87 43872. 1397. 1
## 10 Kalimantan Tengah 64.4 1.73 28216. 787. 1
## # ℹ 24 more rows
# Scatter plot of the standardised data, coloured by cluster label.
plot(x = df_scaled, col = df_final[["cluster"]])
