Carga y limpieza preliminar de los datos
Los datos que se van a analizar en este documento proceden de la compilación hecha por usuarios de Kaggle. La fecha del análisis inicia el 11 de septiembre del 2020.
# Python (pandas) equivalent, kept for reference only:
#   import pandas as pd
#   datos = pd.read_csv()
# Load the comma-separated COVID-19 compilation into a data frame.
datos <- read.csv("C:/cursoanalisisdatoscovid/covid_19_clean_complete.csv")
# Preview the first ten rows as a table.
kable(head(datos, n = 10))
Province.State
|
Country.Region
|
Lat
|
Long
|
Date
|
Confirmed
|
Deaths
|
Recovered
|
|
Afghanistan
|
33.0000
|
65.0000
|
1/22/20
|
0
|
0
|
0
|
|
Albania
|
41.1533
|
20.1683
|
1/22/20
|
0
|
0
|
0
|
|
Algeria
|
28.0339
|
1.6596
|
1/22/20
|
0
|
0
|
0
|
|
Andorra
|
42.5063
|
1.5218
|
1/22/20
|
0
|
0
|
0
|
|
Angola
|
-11.2027
|
17.8739
|
1/22/20
|
0
|
0
|
0
|
|
Antigua and Barbuda
|
17.0608
|
-61.7964
|
1/22/20
|
0
|
0
|
0
|
|
Argentina
|
-38.4161
|
-63.6167
|
1/22/20
|
0
|
0
|
0
|
|
Armenia
|
40.0691
|
45.0382
|
1/22/20
|
0
|
0
|
0
|
Australian Capital Territory
|
Australia
|
-35.4735
|
149.0124
|
1/22/20
|
0
|
0
|
0
|
New South Wales
|
Australia
|
-33.8688
|
151.2093
|
1/22/20
|
0
|
0
|
0
|
Estructura de los datos
# Inspect the dimensions and column types of the freshly loaded data frame.
str(datos)
## 'data.frame': 21484 obs. of 8 variables:
## $ Province.State: chr "" "" "" "" ...
## $ Country.Region: chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ Lat : num 33 41.2 28 42.5 -11.2 ...
## $ Long : num 65 20.17 1.66 1.52 17.87 ...
## $ Date : chr "1/22/20" "1/22/20" "1/22/20" "1/22/20" ...
## $ Confirmed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Recovered : int 0 0 0 0 0 0 0 0 0 0 ...
# List the original (English) column names before renaming them.
colnames(datos)
## [1] "Province.State" "Country.Region" "Lat" "Long"
## [5] "Date" "Confirmed" "Deaths" "Recovered"
# Give the columns Spanish names; the order follows the original CSV header.
colnames(datos) <- c(
  "Provincia_estado",
  "Pais_region",
  "Latitud",   # north is positive, south is negative
  "Longitud",  # east is positive, west is negative
  "Fecha",
  "Casos_Confirmados",
  "Casos_Muertos",
  "Casos_Recuperados"
)

# Render the first ten rows as a styled table.
datos %>%
  head(n = 10) %>%
  kable() %>%
  kable_styling()
Provincia_estado
|
Pais_region
|
Latitud
|
Longitud
|
Fecha
|
Casos_Confirmados
|
Casos_Muertos
|
Casos_Recuperados
|
|
Afghanistan
|
33.0000
|
65.0000
|
1/22/20
|
0
|
0
|
0
|
|
Albania
|
41.1533
|
20.1683
|
1/22/20
|
0
|
0
|
0
|
|
Algeria
|
28.0339
|
1.6596
|
1/22/20
|
0
|
0
|
0
|
|
Andorra
|
42.5063
|
1.5218
|
1/22/20
|
0
|
0
|
0
|
|
Angola
|
-11.2027
|
17.8739
|
1/22/20
|
0
|
0
|
0
|
|
Antigua and Barbuda
|
17.0608
|
-61.7964
|
1/22/20
|
0
|
0
|
0
|
|
Argentina
|
-38.4161
|
-63.6167
|
1/22/20
|
0
|
0
|
0
|
|
Armenia
|
40.0691
|
45.0382
|
1/22/20
|
0
|
0
|
0
|
Australian Capital Territory
|
Australia
|
-35.4735
|
149.0124
|
1/22/20
|
0
|
0
|
0
|
New South Wales
|
Australia
|
-33.8688
|
151.2093
|
1/22/20
|
0
|
0
|
0
|
- Variables cualitativas se convierten con “factor” o bien “as.factor”
- Ordinales se convierten con “ordered”
- Cuantitativos se convierten con “as.numeric”
# Province/state and country/region are categorical, so store them as factors.
datos[["Provincia_estado"]] <- factor(datos[["Provincia_estado"]])
datos[["Pais_region"]] <- factor(datos[["Pais_region"]])
Procesar
# Re-inspect the structure to confirm the two columns are now factors.
str(datos)
## 'data.frame': 21484 obs. of 8 variables:
## $ Provincia_estado : Factor w/ 81 levels "","Alberta","Anguilla",..: 1 1 1 1 1 1 1 1 6 49 ...
## $ Pais_region : Factor w/ 185 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 9 ...
## $ Latitud : num 33 41.2 28 42.5 -11.2 ...
## $ Longitud : num 65 20.17 1.66 1.52 17.87 ...
## $ Fecha : chr "1/22/20" "1/22/20" "1/22/20" "1/22/20" ...
## $ Casos_Confirmados: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Casos_Muertos : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Casos_Recuperados: int 0 0 0 0 0 0 0 0 0 0 ...
# Print the first ten rows with the renamed columns.
head(datos,10)
## Provincia_estado Pais_region Latitud Longitud Fecha
## 1 Afghanistan 33.0000 65.0000 1/22/20
## 2 Albania 41.1533 20.1683 1/22/20
## 3 Algeria 28.0339 1.6596 1/22/20
## 4 Andorra 42.5063 1.5218 1/22/20
## 5 Angola -11.2027 17.8739 1/22/20
## 6 Antigua and Barbuda 17.0608 -61.7964 1/22/20
## 7 Argentina -38.4161 -63.6167 1/22/20
## 8 Armenia 40.0691 45.0382 1/22/20
## 9 Australian Capital Territory Australia -35.4735 149.0124 1/22/20
## 10 New South Wales Australia -33.8688 151.2093 1/22/20
## Casos_Confirmados Casos_Muertos Casos_Recuperados
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7 0 0 0
## 8 0 0 0
## 9 0 0 0
## 10 0 0 0
# Verificar datos anómalos en los registros \[\text{Casos confirmados} = \text{muertos} + \text{recuperados} + \text{enfermos}\]
* Añadir una columna al set de datos
# Derive the number of active cases: confirmed minus deaths minus recoveries.
datos <- datos %>%
  mutate(
    casos_enfermos = Casos_Confirmados - Casos_Muertos - Casos_Recuperados
  )

# Show the first ten records that exceed 10,000 confirmed cases.
datos %>%
  filter(Casos_Confirmados > 10000) %>%
  head(n = 10) %>%
  kable()
Provincia_estado
|
Pais_region
|
Latitud
|
Longitud
|
Fecha
|
Casos_Confirmados
|
Casos_Muertos
|
Casos_Recuperados
|
casos_enfermos
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/2/20
|
11177
|
350
|
295
|
10532
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/3/20
|
13522
|
414
|
386
|
12722
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/4/20
|
16678
|
479
|
522
|
15677
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/5/20
|
19665
|
549
|
633
|
18483
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/6/20
|
22112
|
618
|
817
|
20677
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/7/20
|
24953
|
699
|
1115
|
23139
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/8/20
|
27100
|
780
|
1439
|
24881
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/9/20
|
29631
|
871
|
1795
|
26965
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/10/20
|
31728
|
974
|
2222
|
28532
|
Hubei
|
China
|
30.9756
|
112.2707
|
2/11/20
|
33366
|
1068
|
2639
|
29659
|
* Revisar la consistencia de los datos
# Consistency check: list every record whose active-case count is negative,
# i.e. deaths plus recoveries exceed the confirmed cases.
datos %>%
  filter(casos_enfermos < 0) %>%
  kable()
Provincia_estado
|
Pais_region
|
Latitud
|
Longitud
|
Fecha
|
Casos_Confirmados
|
Casos_Muertos
|
Casos_Recuperados
|
casos_enfermos
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/22/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/23/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/24/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/24/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/25/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/25/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/26/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/26/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/27/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/27/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/28/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/28/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/29/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/29/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/30/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/30/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/31/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/31/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
4/1/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/1/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/2/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/3/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/4/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/5/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/6/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/7/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/8/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/9/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/10/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/11/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/12/20
|
-1
|
1
|
0
|
-2
|
# Same consistency query as earlier in the document: records with a negative
# active-case count, rendered as a table.
datos %>%
  filter(casos_enfermos < 0) %>%
  kable()
Provincia_estado
|
Pais_region
|
Latitud
|
Longitud
|
Fecha
|
Casos_Confirmados
|
Casos_Muertos
|
Casos_Recuperados
|
casos_enfermos
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/22/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/23/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/24/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/24/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/25/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/25/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/26/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/26/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/27/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/27/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/28/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/28/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/29/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/29/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/30/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/30/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/31/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
3/31/20
|
0
|
1
|
0
|
-1
|
Hainan
|
China
|
19.1959
|
109.7453
|
4/1/20
|
168
|
6
|
168
|
-6
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/1/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/2/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/3/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/4/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/5/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/6/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/7/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/8/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/9/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/10/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/11/20
|
0
|
1
|
0
|
-1
|
Diamond Princess
|
Canada
|
0.0000
|
0.0000
|
4/12/20
|
-1
|
1
|
0
|
-2
|
# Preview a correction for the Hainan rows whose active-case count is negative.
# The negative balance is added to the recovered count (lowering it) and the
# active cases are reset to zero, so confirmed = deaths + recovered again.
# The pipeline is not assigned back, so `datos` itself is left unchanged.
datos %>%
  filter(Provincia_estado == "Hainan", casos_enfermos < 0) %>%
  # Fecha holds m/d/yy text; parse it so rows sort chronologically rather than
  # alphabetically (as text, "3/9/20" would sort after "3/24/20").
  arrange(Provincia_estado, as.Date(Fecha, format = "%m/%d/%y")) %>%
  mutate(
    Casos_Recuperados = Casos_Recuperados + casos_enfermos,
    casos_enfermos = 0
  ) %>%
  kable()
Provincia_estado
|
Pais_region
|
Latitud
|
Longitud
|
Fecha
|
Casos_Confirmados
|
Casos_Muertos
|
Casos_Recuperados
|
casos_enfermos
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/24/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/25/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/26/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/27/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/28/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/29/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/30/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
3/31/20
|
168
|
6
|
162
|
0
|
Hainan
|
China
|
19.1959
|
109.7453
|
4/1/20
|
168
|
6
|
162
|
0
|
* Corregir la inconsistencia
# Show the first ten records reported for Paraguay.
datos %>%
  filter(Pais_region == "Paraguay") %>%
  head(n = 10) %>%
  kable()
Provincia_estado
|
Pais_region
|
Latitud
|
Longitud
|
Fecha
|
Casos_Confirmados
|
Casos_Muertos
|
Casos_Recuperados
|
casos_enfermos
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/22/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/23/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/24/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/25/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/26/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/27/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/28/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/29/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/30/20
|
0
|
0
|
0
|
0
|
|
Paraguay
|
-23.4425
|
-58.4438
|
1/31/20
|
0
|
0
|
0
|
0
|
Análisis gráfico
# Filter the data for Europe.
# NOTE(review): disabled draft; to select rows the index needs a trailing comma,
# i.e. datos[<condition>, ] — confirm the intended bounds before enabling.
#datos_europa=datos[datos$Latitud>38 & datos$Longitud<30]