La base de datos analizada contiene la información del contagio de COVID-19 a nivel mundial. Los datos fueron obtenidos de la página Our World in Data y contienen las siguientes variables:
# Download the Our World in Data COVID-19 table from its published CSV and
# print the structure (column names and types) of the resulting tibble.
library(readr)
url <- "https://covid.ourworldindata.org/data/ecdc/full_data.csv"
covid <- read_csv(file = url)
str(object = covid)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 17085 obs. of 6 variables:
## $ date : Date, format: "2019-12-31" "2020-01-01" ...
## $ location : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ new_cases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_cases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_deaths: num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. location = col_character(),
## .. new_cases = col_double(),
## .. new_deaths = col_double(),
## .. total_cases = col_double(),
## .. total_deaths = col_double()
## .. )
# Size of the table as (rows, columns).
c(nrow(covid), ncol(covid))
## [1] 17085 6
# First five records.
head(covid, n = 5)
## # A tibble: 5 x 6
## date location new_cases new_deaths total_cases total_deaths
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2019-12-31 Afghanistan 0 0 0 0
## 2 2020-01-01 Afghanistan 0 0 0 0
## 3 2020-01-02 Afghanistan 0 0 0 0
## 4 2020-01-03 Afghanistan 0 0 0 0
## 5 2020-01-04 Afghanistan 0 0 0 0
# Last five records.
tail(covid, n = 5)
## # A tibble: 5 x 6
## date location new_cases new_deaths total_cases total_deaths
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2020-05-10 Zimbabwe 1 0 36 4
## 2 2020-05-11 Zimbabwe 0 0 36 4
## 3 2020-05-12 Zimbabwe 1 0 37 4
## 4 2020-05-13 Zimbabwe 0 0 37 4
## 5 2020-05-14 Zimbabwe 0 0 37 4
library(tibble)
# Compact preview: one line per column with its type and first values.
glimpse(x = covid)
## Observations: 17,085
## Variables: 6
## $ date <date> 2019-12-31, 2020-01-01, 2020-01-02, 2020-01-03, 2020-...
## $ location <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanis...
## $ new_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
library(Hmisc)
# Per-variable summary: counts, missing values, distinct values, quantiles
# and the lowest/highest observed values.
Hmisc::describe(covid)
## covid
##
## 6 Variables 17085 Observations
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 17085 0 136 1 2020-03-25 40.95 2020-01-12
## .10 .25 .50 .75 .90 .95
## 2020-01-25 2020-03-02 2020-04-03 2020-04-24 2020-05-06 2020-05-10
##
## lowest : 2019-12-31 2020-01-01 2020-01-02 2020-01-03 2020-01-04
## highest: 2020-05-10 2020-05-11 2020-05-12 2020-05-13 2020-05-14
## --------------------------------------------------------------------------------
## location
## n missing distinct
## 17085 0 210
##
## lowest : Afghanistan Albania Algeria Andorra Angola
## highest: Western Sahara World Yemen Zambia Zimbabwe
## --------------------------------------------------------------------------------
## new_cases
## n missing distinct Info Mean Gmd .05 .10
## 17085 0 1560 0.924 504.4 972.7 0 0
## .25 .50 .75 .90 .95
## 0 2 38 320 1057
##
## lowest : -2461 -1480 -713 -161 -105, highest: 90139 90156 91742 94185 101445
## --------------------------------------------------------------------------------
## new_deaths
## n missing distinct Info Mean Gmd .05 .10
## 17085 0 518 0.635 34.73 67.84 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 0.0 1.0 10.0 52.8
##
## lowest : -6 0 1 2 3, highest: 7604 7663 8568 8709 10520
## --------------------------------------------------------------------------------
## total_cases
## n missing distinct Info Mean Gmd .05 .10
## 17085 0 4409 0.994 14361 27745 0 0
## .25 .50 .75 .90 .95
## 4 64 870 8104 27189
##
## lowest : 0 1 2 3 4
## highest: 3989894 4066883 4137915 4223701 4308809
## --------------------------------------------------------------------------------
## total_deaths
## n missing distinct Info Mean Gmd .05 .10
## 17085 0 1493 0.898 952 1864 0 0
## .25 .50 .75 .90 .95
## 0 1 18 215 1163
##
## lowest : 0 1 2 3 4, highest: 278951 282361 285930 291696 296680
## --------------------------------------------------------------------------------
# Show the rows whose daily case count is negative.
covid[covid[["new_cases"]] < 0, ]
## # A tibble: 8 x 6
## date location new_cases new_deaths total_cases total_deaths
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2020-05-07 Ecuador -2461 49 29420 1618
## 2 2020-05-09 Ecuador -1480 50 28818 1704
## 3 2020-05-12 Ecuador -50 18 29509 2145
## 4 2020-03-10 International -9 1 696 7
## 5 2020-04-29 Lithuania -105 3 1344 44
## 6 2020-05-03 Portugal -161 16 25190 1023
## 7 2020-05-11 San Marino -9 0 628 41
## 8 2020-04-19 Spain -713 410 193252 20453
# Replace every daily case count with its absolute value so that no negative
# figures remain in `new_cases`.
# NOTE(review): this turns a negative entry into a positive count of the same
# size, and `new_deaths` (whose minimum in the summary above is -6) is left
# untouched; confirm this is the intended treatment.
covid[["new_cases"]] <- abs(covid[["new_cases"]])
# Confirm there are no missing values in `new_cases`.
covid[which(is.na(covid[["new_cases"]])), ]
## # A tibble: 0 x 6
## # ... with 6 variables: date <date>, location <chr>, new_cases <dbl>,
## # new_deaths <dbl>, total_cases <dbl>, total_deaths <dbl>
# which() returns the numeric positions of the TRUE elements.
# is.na() flags the missing (not available) values.
# Keep only the rows of the "World" aggregate and preview the first five.
covidmundo <- covid[covid[["location"]] == "World", ]
head(covidmundo, n = 5)
## # A tibble: 5 x 6
## date location new_cases new_deaths total_cases total_deaths
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2019-12-31 World 27 0 27 0
## 2 2020-01-01 World 0 0 27 0
## 3 2020-01-02 World 0 0 27 0
## 4 2020-01-03 World 17 0 44 0
## 5 2020-01-04 World 0 0 44 0
# Most recent record of the "World" series.
tail(covidmundo, n = 1)
## # A tibble: 1 x 6
## date location new_cases new_deaths total_cases total_deaths
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2020-05-14 World 85108 4984 4308809 296680
# Five-number summary plus mean of the world's daily new cases.
summary(covidmundo[["new_cases"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1066 4180 31682 72666 101445
# Same summary for the world's daily new deaths.
summary(covidmundo[["new_deaths"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 45.25 182.00 2181.47 4894.25 10520.00
library(dplyr)
# World records between 2020-04-09 and 2020-05-09, both ends included.
mundofecha <- filter(
  covidmundo,
  date >= "2020-04-09",
  date <= "2020-05-09"
)
mundofecha
## # A tibble: 31 x 6
## date location new_cases new_deaths total_cases total_deaths
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-09 World 84479 6436 1489045 89219
## 2 2020-04-10 World 85805 7445 1574850 96664
## 3 2020-04-11 World 89320 7221 1664170 103885
## 4 2020-04-12 World 80770 6026 1744940 109911
## 5 2020-04-13 World 71254 5266 1816194 115177
## 6 2020-04-14 World 66017 5371 1882211 120548
## 7 2020-04-15 World 76602 7604 1958813 128152
## 8 2020-04-16 World 79748 10520 2038561 138672
## 9 2020-04-17 World 82962 8709 2121523 147381
## 10 2020-04-18 World 81894 8568 2203417 155949
## # ... with 21 more rows
# Highest number of new cases the world series reports for a single day.
with(covidmundo, new_cases[which.max(new_cases)])
## [1] 101445
# Highest number of new deaths the world series reports for a single day.
with(covidmundo, new_deaths[which.max(new_deaths)])
## [1] 10520
library(ggplot2)
# World daily new cases over time (red line with point markers).
ggplot(data = covidmundo, mapping = aes(x = date, y = new_cases)) +
  geom_line(colour = "red") +
  geom_point(colour = "red") +
  labs(x = "Fecha", y = "Casos Nuevos") +
  theme_light()
# World daily new deaths over time (blue line with point markers).
ggplot(data = covidmundo, mapping = aes(x = date, y = new_deaths)) +
  geom_line(colour = "blue") +
  geom_point(colour = "blue") +
  labs(x = "Fecha", y = "Muertes Reportadas") +
  theme_light()
library(DT)
# Every record except the "World" aggregate, shown as an interactive table.
paises <- covid[covid[["location"]] != "World", ]
datatable(data = paises, class = "cell-border stripe")
# Largest single-day case count among those records ...
max(paises[["new_cases"]])
## [1] 48529
# ... and the location that reported it.
with(paises, location[which.max(new_cases)])
## [1] "United States"
# Largest single-day death count reported by an individual location.
# The maximum is taken over `paises` (the data without the "World" aggregate)
# so that it matches the location looked up below. Taking it over `covid`
# returned 10520, which is the maximum of the "World" series rather than a
# per-location figure; re-run this chunk to obtain the per-location value.
max(paises$new_deaths)
# Location that reported that maximum.
paises$location[which.max(paises$new_deaths)]
## [1] "United States"
# Daily records for Italy. The column is referenced by its bare name:
# filter() evaluates its conditions inside the data frame, so `paises$location`
# is unnecessary there and would not respect any grouping on the data.
italia <- filter(paises, location == "Italy")
datatable(italia, class = 'cell-border stripe')
# Italy: daily reported cases over time.
ggplot(italia, aes(date, new_cases)) +
  geom_point(color = "green") +
  geom_line(color = "green") +
  labs(x = "Fecha", y = "Casos Reportados") +
  theme_light()
# Italy: daily reported deaths over time.
ggplot(italia, aes(date, new_deaths)) +
  geom_point(color = "black") +
  geom_line(color = "black") +
  labs(x = "Fechas", y = "Muertes Reportadas") +
  theme_light()
library(dplyr)
# Mean and standard deviation of the daily new cases and new deaths,
# computed separately for each location.
promsd <- summarise(
  group_by(paises, location),
  media_casos = mean(new_cases),
  desv_casos = sd(new_cases),
  media_muertes = mean(new_deaths),
  desv_muertes = sd(new_deaths)
)
datatable(data = promsd, class = "cell-border stripe")
# Ten locations with the highest mean of daily new cases, as a plain
# data frame, drawn as bars sorted from highest to lowest mean.
paisesprome <- top_n(promsd, 10, media_casos)
paisesprome <- data.frame(paisesprome)
ggplot(
  data = paisesprome,
  mapping = aes(x = reorder(location, -media_casos), y = media_casos)
) +
  geom_col(colour = "paleturquoise4", fill = "paleturquoise4", alpha = 0.5) +
  labs(x = "País", y = "Promedio Casos") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Ten locations with the highest mean of daily new deaths, drawn as bars
# sorted from highest to lowest mean.
paisesmuert <- top_n(promsd, 10, media_muertes)
ggplot(
  data = paisesmuert,
  mapping = aes(x = reorder(location, -media_muertes), y = media_muertes)
) +
  geom_col(colour = "lightsalmon4", fill = "lightsalmon4", alpha = 0.5) +
  labs(x = "País", y = "Promedio Muertes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Cumulative cases and deaths per location, taken from the last row of each
# location's group (this relies on the rows being in chronological order).
casostotal <- paises %>%
  group_by(location) %>%
  summarise(
    Casos = tail(total_cases, n = 1),
    Muertes = tail(total_deaths, n = 1)
  ) %>%
  rename(`País` = location)
head(casostotal, n = 5)
## # A tibble: 5 x 3
## País Casos Muertes
## <chr> <dbl> <dbl>
## 1 Afghanistan 5226 130
## 2 Albania 880 31
## 3 Algeria 6253 522
## 4 Andorra 760 49
## 5 Angola 45 2
library(ggplot2)
# Five locations with the most cumulative cases.
casos5 <- top_n(casostotal, 5, Casos)
# Daily new cases of those locations from 2020-03-01 onwards, one coloured
# line (with point markers) per location.
paises %>%
  filter(location %in% casos5$País, date >= "2020-03-01") %>%
  ggplot(mapping = aes(x = date, y = new_cases, colour = location)) +
  geom_line() +
  geom_point() +
  scale_color_manual(
    values = c("darkmagenta", "navy", "darkgreen", "darkgoldenrod", "darkred")
  ) +
  labs(x = "Fecha", y = "Casos Confirmados") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  )
# Five locations with the most cumulative deaths.
muertes5 <- top_n(casostotal, 5, Muertes)
# Daily new deaths of those locations from 2020-03-01 onwards, one coloured
# line (with point markers) per location.
paises %>%
  filter(location %in% muertes5$País, date >= "2020-03-01") %>%
  ggplot(mapping = aes(x = date, y = new_deaths, colour = location)) +
  geom_line() +
  geom_point() +
  scale_color_manual(
    values = c("darkmagenta", "navy", "darkgreen", "darkgoldenrod", "darkred")
  ) +
  labs(x = "Fecha", y = "Muertes Confirmadas") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  )
(Base de datos adquirida de la página web Our World in Data).
# Location metadata (continent and population) from Our World in Data;
# preview the first five rows.
continentes <- read.csv(
  file = "https://covid.ourworldindata.org/data/ecdc/locations.csv"
)
head(continentes, n = 5)
## countriesAndTerritories location continent population_year population
## 1 Afghanistan Afghanistan Asia 2020 38928341
## 2 Albania Albania Europe 2020 2877800
## 3 Algeria Algeria Africa 2020 43851043
## 4 Andorra Andorra Europe 2020 77265
## 5 Angola Angola Africa 2020 32866268
# Last five rows of the location metadata.
tail(continentes, n = 5)
## countriesAndTerritories location continent population_year population
## 205 Vietnam Vietnam Asia 2020 97338583
## 206 Western_Sahara Western Sahara Africa 2020 597330
## 207 Yemen Yemen Asia 2020 29825968
## 208 Zambia Zambia Africa 2020 18383956
## 209 Zimbabwe Zimbabwe Africa 2020 14862927
# Number of locations listed under each continent value.
table(continentes[["continent"]])
##
## Africa Asia Europe North America
## 1 54 46 51 36
## Oceania South America
## 8 13
# Keep only the location/continent pairs ...
continentes2 <- select(continentes, location, continent)
library(dplyr)
# ... and attach every daily record to them, matching on the location name.
bases2 <- left_join(x = continentes2, y = paises, by = "location")
datatable(
  data = bases2,
  rownames = TRUE,
  options = list(pageLength = 5, dom = "tip")
)
# Five South American locations with the most cumulative cases, taken from
# the last row of each location's group.
suramerica5 <- top_n(
  summarise(
    group_by(filter(bases2, continent == "South America"), location),
    Casos = tail(total_cases, n = 1)
  ),
  5,
  Casos
)
datatable(data = suramerica5, class = "cell-border stripe")