Clustering Analysis for COVID-19 in Peru

R Markdown

Note: This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

# Los siguientes 2 paquetes son para Cluster
# Analysis por K-means: install.packages('rattle')
library(rattle)

## Loading required package: tibble

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(cluster)

# **********************************************************************
# PAQUETES NECESARIOS PARA EL ANALISIS
library(tidyverse)

## -- Attaching packages --------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4

## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(broom)
library(knitr)
library(readxl)
library(cowplot)
library(nycflights13)

## 
## Attaching package: 'nycflights13'

## The following object is masked from 'package:rattle':
## 
##     weather

library(dplyr)
library(stringr)
library(infer)
library(datasets)
library(corrplot)

## corrplot 0.84 loaded

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(skimr)
library(tableone)
library(wooldridge)

## 
## Attaching package: 'wooldridge'

## The following objects are masked from 'package:rattle':
## 
##     audit, wine

library(estimatr)
library(margins)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(ggpubr)

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:cowplot':
## 
##     get_legend

library(datasets)
library(FinCal)

## 
## Attaching package: 'FinCal'

## The following objects are masked from 'package:psych':
## 
##     geometric.mean, harmonic.mean

library(arm)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:wooldridge':
## 
##     cement

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: lme4

## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car

## 
## arm (Version 1.11-2, built: 2020-7-27)

## Working directory is F:/06-B Base de Analisis en R 2DA_OLA/20.11.27.01 Nuevo Cluster Analysis

## 
## Attaching package: 'arm'

## The following objects are masked from 'package:psych':
## 
##     logit, rescale, sim

## The following object is masked from 'package:corrplot':
## 
##     corrplot

library(survival)
library(rms)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## 
## Attaching package: 'rms'

## The following object is masked from 'package:lmtest':
## 
##     lrtest

library(eeptools)
library(formatR)
library(ggfortify)

## Registered S3 method overwritten by 'ggfortify':
##   method      from    
##   autoplot.lm eeptools

# Fin de Paquetes 1

# para las graficas de series de tiempo:
# install.packages('TTR')
library(TTR)
# para la lectura de archivos DTA de Stata:
library(haven)
# para exportar a Excel file:
library(xlsx)
# Fin de Paquetes 2 para hacer ReShape (Pivot
# Tables):
library(tidyr)
# library(reshape2) Fin de Paquetes 3 - by November
# 26, 2020. install.packages('contrib.url')
# library(contrib.url)

# **********************************************************************
# * INICIO DEL PRIMER ANALISIS
# **********************************************************************
# * Cargando la base de datos: Update Viernes 27 de
# Noviembre 2020: Version02
# C:/15_STATA_DTA_files/20.11.26.03_BASE_DE_DATOS_21_NOV/Tabla_de_Picos_y_Fechas_xDpto.xlsx
# library(haven) #para la lectura de archivos DTA
# de Stata para exportar a Excel file:
# library(xlsx) library(readxl)

# Tabla_de_Picos_y_Fechas_xDpto <-
# read_excel('C:/15_STATA_DTA_files/20.11.26.03_BASE_DE_DATOS_21_NOV/Tabla_de_Picos_y_Fechas_xDpto.xlsx')
# Update Viernes 04 de Diciembre 2020: Version03 :
Tabla_de_Hitos_y_Fechas_xDpto <- read_excel("C:/15_STATA_DTA_files/20.11.26.03_BASE_DE_DATOS_21_NOV/Tabla_de_Hitos_y_Fechas_xDpto.xlsx")

# View(Tabla_de_Hitos_y_Fechas_xDpto)
summary(Tabla_de_Hitos_y_Fechas_xDpto)

##    DPTO_NRO         Vector_lugar       Vector_Semana_de_inicio_de_ola
##  Length:25          Length:25          Min.   : 4.00                 
##  Class :character   Class :character   1st Qu.: 6.00                 
##  Mode  :character   Mode  :character   Median : 7.00                 
##                                        Mean   : 8.16                 
##                                        3rd Qu.: 9.00                 
##                                        Max.   :17.00                 
##  Vector_Fecha_de_inicio_de_cresta Vector_Fecha_de_mayor_tasa
##  Min.   : 6.00                    Min.   : 9.00             
##  1st Qu.: 7.00                    1st Qu.:13.00             
##  Median :11.00                    Median :19.00             
##  Mean   :11.24                    Mean   :17.84             
##  3rd Qu.:14.00                    3rd Qu.:22.00             
##  Max.   :20.00                    Max.   :24.00             
##  Vector_Fecha_de_fin_de_cresta Tiempo_al_pico  Tiempo_reduccion_80pc
##  Min.   :16.00                 Min.   : 3.00   Min.   : 4.0         
##  1st Qu.:29.00                 1st Qu.: 7.00   1st Qu.: 8.0         
##  Median :29.00                 Median : 9.00   Median : 9.0         
##  Mean   :28.24                 Mean   : 9.68   Mean   :10.4         
##  3rd Qu.:30.00                 3rd Qu.:12.00   3rd Qu.:12.0         
##  Max.   :33.00                 Max.   :17.00   Max.   :20.0         
##     Duracion    
##  Min.   :10.00  
##  1st Qu.:17.00  
##  Median :21.00  
##  Mean   :20.08  
##  3rd Qu.:24.00  
##  Max.   :27.00

print(is.data.frame(Tabla_de_Hitos_y_Fechas_xDpto))

## [1] TRUE

# attach(Tabla_de_Hitos_y_Fechas_xDpto)
head(Tabla_de_Hitos_y_Fechas_xDpto)

names(Tabla_de_Hitos_y_Fechas_xDpto)

## [1] "DPTO_NRO"                         "Vector_lugar"                    
## [3] "Vector_Semana_de_inicio_de_ola"   "Vector_Fecha_de_inicio_de_cresta"
## [5] "Vector_Fecha_de_mayor_tasa"       "Vector_Fecha_de_fin_de_cresta"   
## [7] "Tiempo_al_pico"                   "Tiempo_reduccion_80pc"           
## [9] "Duracion"

# CONTENIDO DE TABLA: DTAsinadef_V01 es la tabla
# donde cada registro es un fallecido, la mas
# voluminosa, la original.

FEATURE ENGINEERING Y ESTANDARIZACION DE VARIABLES

Se hace la selección de variables óptima para las necesidad del clustering de olas:

# **********************************************************************
# PASO de ESTANDARIZACION DE VARIABLES (unas van de
# 1200 a 1400, otras de 0.3 a 4.5, ahora iran de -1
# a 1)
Tabla_de_Hitos_y_Fechas_xDpto.sinNombres = Tabla_de_Hitos_y_Fechas_xDpto[, 
    !grepl("Vector_lugar", names(Tabla_de_Hitos_y_Fechas_xDpto))]

# Tabla_de_Picos_y_Fechas_xDpto.sinNombres =
# #Tabla_de_Picos_y_Fechas_xDpto.sinNombres[,!grepl('Vector_Max_tm_sem',names(Tabla_de_Picos_y_Fechas_xDpto.sinNombres))]
# Tabla_de_Picos_y_Fechas_xDpto.sinNombres =
# #Tabla_de_Picos_y_Fechas_xDpto.sinNombres[,!grepl('Vector_Wave_Start_mortalityRate',names(Tabla_de_Picos_y_Fechas_xDpto.sinNombres))]

# Tabla_de_Picos_y_Fechas_xDpto.standarized <-
# scale(Tabla_de_Picos_y_Fechas_xDpto.sinNombres[-1])
# # To standarize the variables Update Viernes 04
# de Diciembre 2020: Version03 :
Tabla_de_Hitos_y_Fechas_xDpto.standarized <- scale(Tabla_de_Hitos_y_Fechas_xDpto.sinNombres[-1])  # To standarize the variables

# View(Tabla_de_Hitos_y_Fechas_xDpto.standarized)
head(Tabla_de_Hitos_y_Fechas_xDpto.standarized)

##      Vector_Semana_de_inicio_de_ola Vector_Fecha_de_inicio_de_cresta
## [1,]                      0.2453317                        0.3779052
## [2,]                     -0.6308530                       -0.9104080
## [3,]                     -0.3387914                       -0.9104080
## [4,]                      0.2453317                        0.5926241
## [5,]                     -0.3387914                       -0.4809703
## [6,]                      1.1215165                        1.0220618
##      Vector_Fecha_de_mayor_tasa Vector_Fecha_de_fin_de_cresta Tiempo_al_pico
## [1,]                  0.2268847                     0.4234719     0.07766472
## [2,]                 -0.7510666                     0.4234719    -0.40773977
## [3,]                  1.0092457                     1.1452990     1.53387817
## [4,]                  0.6180652                     0.1828629     0.56306920
## [5,]                  1.2048360                     1.1452990     1.77658042
## [6,]                  0.4224749                     0.1828629    -0.40773977
##      Tiempo_reduccion_80pc   Duracion
## [1,]            0.13278800  0.1819139
## [2,]            1.23935467  0.7751113
## [3,]           -0.08852533  1.1705763
## [4,]           -0.53115200 -0.0158186
## [5,]           -0.30983867  1.1705763
## [6,]           -0.30983867 -0.6090161

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# **********************************************************************
# PASO de crea el Objeto FIT con el reporte de
# cuales son los CLUSTERS creados, y cuales son K
# centroides de los CLUSTERS: K-Means K = 3
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized
# <-
# kmeans(Tabla_de_Picos_y_Fechas_xDpto.standarized,
# K) # k = 6
# attributes(k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized)
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized$size

# Update Viernes 04 de Diciembre 2020: Version03 :
K = 6
k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto <- kmeans(Tabla_de_Hitos_y_Fechas_xDpto.standarized, 
    K, nstart = 100)  # k = 6
attributes(k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto)

## $names
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## $class
## [1] "kmeans"

k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto

## K-means clustering with 6 clusters of sizes 2, 7, 2, 6, 5, 3
## 
## Cluster means:
##   Vector_Semana_de_inicio_de_ola Vector_Fecha_de_inicio_de_cresta
## 1                     -0.6308530                       -0.9104080
## 2                      0.5791164                        0.8380171
## 3                      2.5818244                        1.8809373
## 4                     -0.2901145                       -0.1946784
## 5                     -0.8060900                       -0.9962956
## 6                     -0.7282069                       -0.5525432
##   Vector_Fecha_de_mayor_tasa Vector_Fecha_de_fin_de_cresta Tiempo_al_pico
## 1                 -1.7290178                   -2.82475002    -1.62125098
## 2                  0.3945335                   -0.09211889     0.00832122
## 3                  1.1070408                    0.30316738    -0.77179313
## 4                  0.9114506                    0.66408092     1.37207668
## 5                 -1.1422471                    0.47159370    -0.74752291
## 6                 -0.4250828                   -0.21815219     0.07766472
##   Tiempo_reduccion_80pc   Duracion
## 1            -0.6418087 -1.8942772
## 2            -0.5311520 -0.4677786
## 3            -0.9737787 -1.4988122
## 4            -0.4204953  0.7421559
## 5             1.7262440  0.9332973
## 6             0.2803302  0.3137355
## 
## Clustering vector:
##  [1] 2 5 4 2 4 2 5 3 4 4 6 4 6 5 5 1 2 2 4 5 3 2 2 6 1
## 
## Within cluster sum of squares by cluster:
## [1] 0.07298522 9.45755269 0.09707538 6.17002528 2.33104812 4.01903814
##  (between_SS / total_SS =  86.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$size

## [1] 2 7 2 6 5 3

ANALISIS DE CENTROIDES

# Centroids:
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized$centers
k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$centers

##   Vector_Semana_de_inicio_de_ola Vector_Fecha_de_inicio_de_cresta
## 1                     -0.6308530                       -0.9104080
## 2                      0.5791164                        0.8380171
## 3                      2.5818244                        1.8809373
## 4                     -0.2901145                       -0.1946784
## 5                     -0.8060900                       -0.9962956
## 6                     -0.7282069                       -0.5525432
##   Vector_Fecha_de_mayor_tasa Vector_Fecha_de_fin_de_cresta Tiempo_al_pico
## 1                 -1.7290178                   -2.82475002    -1.62125098
## 2                  0.3945335                   -0.09211889     0.00832122
## 3                  1.1070408                    0.30316738    -0.77179313
## 4                  0.9114506                    0.66408092     1.37207668
## 5                 -1.1422471                    0.47159370    -0.74752291
## 6                 -0.4250828                   -0.21815219     0.07766472
##   Tiempo_reduccion_80pc   Duracion
## 1            -0.6418087 -1.8942772
## 2            -0.5311520 -0.4677786
## 3            -0.9737787 -1.4988122
## 4            -0.4204953  0.7421559
## 5             1.7262440  0.9332973
## 6             0.2803302  0.3137355

ANALISIS DE LOS CLUSTERS CREADOS

# Clusters:
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized$cluster
k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$cluster

##  [1] 2 5 4 2 4 2 5 3 4 4 6 4 6 5 5 1 2 2 4 5 3 2 2 6 1

# Cluster size:
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized$size
k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$size

## [1] 2 7 2 6 5 3

REPORTE FINAL DE CLUSTERS

wssplot <- function(data, nc = 15, seed = 1234) {
    wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
    for (i in 2:nc) {
        set.seed(seed)
        wss[i] <- sum(kmeans(data, centers = i)$withinss)
    }
    plot(1:nc, wss, type = "b", xlab = "Number of Clusters", 
        ylab = "Within groups sum of squares")
}

# wssplot(Tabla_de_Picos_y_Fechas_xDpto.standarized,
# nc=8)
wssplot(Tabla_de_Hitos_y_Fechas_xDpto.standarized, 
    nc = 8)

# lapply(k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized,
# nc=8, wssplot)

PLOTEO DE CLUSTERS

#**********************************************************************
#*# Publication quality graphs require 600dpi
dpi=600    #pixels per square inch
carpeta = "C:/15_STATA_DTA_files/20.12.04.01_Figura_TIF_de_Clustering/"
archivo = paste( "COV06_Clustering_K_de_" , K,"-means", sep="")
time = Sys.time()
time = gsub(":", "-", time)
carpeta_y_archivo = paste(carpeta,archivo,time,".tif")
nombre_de_tif = carpeta_y_archivo
tiff(nombre_de_tif, width=6*dpi, height=5*dpi, res=dpi)
#**********************************************************************

plot_de_cluster <- clusplot(Tabla_de_Hitos_y_Fechas_xDpto.standarized, 
         k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$cluster,
         #main='2D representation of the Cluster solution',
         
         main=paste('CLUSTERING, K = ',K),
         color=TRUE, shade=TRUE,
         labels=2, lines=0)
plot_de_cluster

## $Distances
## NULL
## 
## $Shading
## [1]        NA  7.435505        NA 15.546509 14.244555 11.773431

#**********************************************************************
dev.off()

## png 
##   2

print(paste("Finalizado procesamiento de CLUSTER NRO. ",time))

## [1] "Finalizado procesamiento de CLUSTER NRO.  2020-12-29 18-05-35"

#**********************************************************************
plot_de_cluster <- clusplot(Tabla_de_Hitos_y_Fechas_xDpto.standarized, 
         k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$cluster,
         #main='2D representation of the Cluster solution',
         
         main=paste('CLUSTERING, K = ',K),
         color=TRUE, shade=TRUE,
         labels=2, lines=0)

plot_de_cluster

## $Distances
## NULL
## 
## $Shading
## [1]        NA  7.435505        NA 15.546509 14.244555 11.773431

# FIN

EXPORTAR CLUSTERS

# str(k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized)

# Tabla_de_Cluster <- data.frame(
# Tabla_de_Picos_y_Fechas_xDpto$Vector_lugar,
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized$cluster
# , Tabla_de_Picos_y_Fechas_xDpto$DPTO_NRO, id =
# 1:25, stringsAsFactors=FALSE)

Reporte_de_Clustering <- Tabla_de_Hitos_y_Fechas_xDpto

# Reporte_de_Clustering$cluster <-
# k.means.fit.04.de.Tabla_de_Picos_y_Fechas_xDpto.standarized$cluster
Reporte_de_Clustering$cluster <- k.means.FIT.05.de.Tabla_de_Hitos_y_Fechas_xDpto$cluster

carpeta = "C:/15_STATA_DTA_files/20.12.04.03_Reporte_de_Clustering/"
archivo = paste("ClustersK=", K, "_Reporte_", sep = "")
time = Sys.time()
time = gsub(":", "-", time)
carpeta_y_archivo = paste(carpeta, archivo, time, ".xlsx")
write.xlsx(Reporte_de_Clustering, file = carpeta_y_archivo)