library(readr)
# Read the COVID-19 CSV export into a base data frame
covid19_df <- read.csv(
  file = "E:/DATA SCIENCE STORE/Ireneaus Nyame/COVID19_WORLD/covid19.csv"
)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dimensions of the dataset as c(rows, columns)
c(nrow(covid19_df), ncol(covid19_df))
## [1] 27641 12
# Keep the column names in a character vector and print them
vector_col <- names(covid19_df)
print(vector_col)
## [1] "Date" "Country_Region" "Province_State" "positive"
## [5] "active" "hospitalized" "hospitalizedCurr" "recovered"
## [9] "death" "total_tested" "daily_tested" "daily_positive"
# Preview the first six rows of the dataset
head(covid19_df, n = 6L)
## Date Country_Region Province_State positive active hospitalized
## 1 2020-01-16 Iceland All States 3 NA NA
## 2 2020-01-17 Iceland All States 4 NA NA
## 3 2020-01-18 Iceland All States 7 NA NA
## 4 2020-01-20 South Korea All States 1 NA NA
## 5 2020-01-22 United States All States 0 NA NA
## 6 2020-01-22 United States Massachusetts 0 NA NA
## hospitalizedCurr recovered death total_tested daily_tested daily_positive
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA 1
## 3 NA NA NA NA NA 3
## 4 NA NA NA 4 NA NA
## 5 NA NA 0 0 NA NA
## 6 NA NA 0 0 NA NA
# Print the data frame as a tibble: dimensions, column types and first rows
as_tibble(covid19_df)
## # A tibble: 27,641 × 12
## Date Country_Region Province_State positive active hospitalized
## <chr> <chr> <chr> <int> <int> <int>
## 1 2020-01-16 Iceland All States 3 NA NA
## 2 2020-01-17 Iceland All States 4 NA NA
## 3 2020-01-18 Iceland All States 7 NA NA
## 4 2020-01-20 South Korea All States 1 NA NA
## 5 2020-01-22 United States All States 0 NA NA
## 6 2020-01-22 United States Massachusetts 0 NA NA
## 7 2020-01-22 United States Washington 0 NA NA
## 8 2020-01-23 United States All States 0 NA NA
## 9 2020-01-23 United States Massachusetts 0 NA NA
## 10 2020-01-23 United States Washington 0 NA NA
## # ℹ 27,631 more rows
## # ℹ 6 more variables: hospitalizedCurr <int>, recovered <int>, death <int>,
## # total_tested <dbl>, daily_tested <int>, daily_positive <int>
# One line per column: name, type and the first few values
covid19_df %>% glimpse()
## Rows: 27,641
## Columns: 12
## $ Date <chr> "2020-01-16", "2020-01-17", "2020-01-18", "2020-01-20…
## $ Country_Region <chr> "Iceland", "Iceland", "Iceland", "South Korea", "Unit…
## $ Province_State <chr> "All States", "All States", "All States", "All States…
## $ positive <int> 3, 4, 7, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, NA, NA, NA,…
## $ active <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalized <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalizedCurr <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ recovered <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death <int> NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA…
## $ total_tested <dbl> NA, NA, NA, 4, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, NA, NA,…
## $ daily_tested <int> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 5, 0, 0, 0, NA, …
## $ daily_positive <int> NA, 1, 3, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA…
The dataset contains 12 columns and 27,641
rows. It provides the daily and cumulative numbers of COVID-19
positive cases, deaths, tests performed, and hospitalizations
for each country. The column names are stored in
the variable vector_col,
which is a character vector.
Calling the function glimpse() is a good very first
operation because it shows not only the dimensions of the
dataset but also the names of the columns, their types,
and their first values. It can replace the three previous operations:
dim(), colnames(), and
head().
Selecting the rows related to "All States" and removing
the Province_State column
# Keep only the country-level rows ("All States"), then drop the
# Province_State column, which is constant after the filter
covid19_df_all_states <- select(
  filter(covid19_df, Province_State == "All States"),
  -Province_State
)
We can remove the Province_State column without losing
information because, after filtering, the column contains only the
value "All States".
Creating a dataset for the daily columns from the
covid19_df_all_states dataframe
Let’s recall the description of the dataset’s columns:
- Date: Date
- Country_Region: Country names
- Province_State: States/province names; value is All States when state/provincial level data is not available
- positive: Cumulative number of positive cases reported.
- active: Number of active cases on that day.
- hospitalized: Cumulative number of hospitalized cases reported.
- hospitalizedCurr: Number of actively hospitalized cases on that day.
- recovered: Cumulative number of recovered cases reported.
- death: Cumulative number of deaths reported.
- total_tested: Cumulative number of tests conducted.
- daily_tested: Number of tests conducted on the day; if daily data is unavailable, daily tested is averaged across number of days in between.
- daily_positive: Number of positive cases reported on the day; if daily data is unavailable, daily positive is averaged across number of days in between.
#Selecting columns of interest
covid19_df_all_states_daily <- select(
  covid19_df_all_states,
  Date, Country_Region,         # identifiers
  active, hospitalizedCurr,     # counts for that day
  daily_tested, daily_positive  # daily test and positive counts
)
# Preview the first six rows of the daily subset
head(covid19_df_all_states_daily, n = 6L)
## Date Country_Region active hospitalizedCurr daily_tested daily_positive
## 1 2020-01-16 Iceland NA NA NA NA
## 2 2020-01-17 Iceland NA NA NA 1
## 3 2020-01-18 Iceland NA NA NA 3
## 4 2020-01-20 South Korea NA NA NA NA
## 5 2020-01-22 United States NA NA NA NA
## 6 2020-01-23 United States NA NA 0 0
# Summary by the Country_Region column
# Sum each daily measure per country (missing values are ignored) and sort
# the countries from most to fewest tests.
# NOTE(review): the output column `hospitalzed` is misspelled, but the later
# `covid19_top_10$hospitalzed` lookup depends on it, so the name is kept.
covid19_df_all_states_daily_sum <- covid19_df_all_states_daily %>%
  group_by(Country_Region) %>%
  summarize(tested = sum(daily_tested, na.rm = TRUE),
            positive = sum(daily_positive, na.rm = TRUE),
            active = sum(active, na.rm = TRUE),
            hospitalzed = sum(hospitalizedCurr, na.rm = TRUE)) %>%
  arrange(desc(tested)) #this is similar to `arrange(-tested)`
covid19_df_all_states_daily_sum
## # A tibble: 146 × 5
## Country_Region tested positive active hospitalzed
## <chr> <int> <int> <int> <int>
## 1 United States 136937092 9850413 0 0
## 2 India 106267322 60959 0 0
## 3 Italy 17370389 934875 17176595 2401146
## 4 Russia 11319603 432269 7621860 0
## 5 Canada 9873530 259992 1354390 0
## 6 Australia 8874298 0 394222 36384
## 7 Israel 4915043 402 0 22726
## 8 Turkey 4351655 221499 4025622 0
## 9 Peru 3578707 59497 0 0
## 10 Brazil 3474441 10321 0 0
## # ℹ 136 more rows
# Keep the ten countries with the most tests; the summary table is already
# sorted by `tested` in descending order
covid19_top_10 <- covid19_df_all_states_daily_sum %>% head(n = 10)
covid19_top_10
## # A tibble: 10 × 5
## Country_Region tested positive active hospitalzed
## <chr> <int> <int> <int> <int>
## 1 United States 136937092 9850413 0 0
## 2 India 106267322 60959 0 0
## 3 Italy 17370389 934875 17176595 2401146
## 4 Russia 11319603 432269 7621860 0
## 5 Canada 9873530 259992 1354390 0
## 6 Australia 8874298 0 394222 36384
## 7 Israel 4915043 402 0 22726
## 8 Turkey 4351655 221499 4025622 0
## 9 Peru 3578707 59497 0 0
## 10 Brazil 3474441 10321 0 0
# Pull each summary column out of the top-10 table as a vector whose
# elements are named by country
countries <- covid19_top_10[["Country_Region"]]
tested_cases <- setNames(covid19_top_10[["tested"]], countries)
positive_cases <- setNames(covid19_top_10[["positive"]], countries)
active_cases <- setNames(covid19_top_10[["active"]], countries)
hospitalized_cases <- setNames(covid19_top_10[["hospitalzed"]], countries)
positive_cases
## United States India Italy Russia Canada
## 9850413 60959 934875 432269 259992
## Australia Israel Turkey Peru Brazil
## 0 402 221499 59497 10321
# Total positive cases across the top-10 countries
sum(positive_cases)
## [1] 11830227
# Average positive cases per country
mean(positive_cases)
## [1] 1183023
# Each country's share of the top-10 positive cases
prop.table(positive_cases)
## United States India Italy Russia Canada
## 8.326478e-01 5.152817e-03 7.902427e-02 3.653937e-02 2.197692e-02
## Australia Israel Turkey Peru Brazil
## 0.000000e+00 3.398075e-05 1.872314e-02 5.029236e-03 8.724262e-04
# Positive cases per test performed, by country
positive_cases / tested_cases
## United States India Italy Russia Canada
## 7.193386e-02 5.736382e-04 5.382004e-02 3.818765e-02 2.633222e-02
## Australia Israel Turkey Peru Brazil
## 0.000000e+00 8.178972e-05 5.089994e-02 1.662528e-02 2.970550e-03
# Positive-to-tested ratio for the top-10 countries; keep the three highest,
# rounded to two decimals. The ranking is computed from the vectors rather
# than typed by hand so it always agrees with the ratios printed above
# (India's ratio is 5.736382e-04, so it does not belong in the top 3).
positive_ratio <- positive_cases / tested_cases
positive_tested_top_3 <- round(
  head(sort(positive_ratio, decreasing = TRUE), n = 3),
  digits = 2
)
top_3_countries <- names(positive_tested_top_3)
# Creating a matrix for the top 3 countries: one row per country, one column
# per measure, looked up by country name in the named vectors
covid_mat <- cbind(
  Ratio = positive_tested_top_3,
  Tested = tested_cases[top_3_countries],
  Positive = positive_cases[top_3_countries],
  Active = active_cases[top_3_countries],
  Hospitalized = hospitalized_cases[top_3_countries]
)
# Row labels use underscores in place of spaces (e.g. United_States)
rownames(covid_mat) <- gsub(" ", "_", top_3_countries, fixed = TRUE)
# Viewing the matrix created
covid_mat
## Ratio Tested Positive Active Hospitalized
## United_States 0.07 136937092 9850413 0 0
## Italy 0.05 17370389 934875 17176595 2401146
## Turkey 0.05 4351655 221499 4025622 0
# Bundle the question, its answer and every data structure built so far
question <- "Which countries have had the highest number of positive cases against the number of tests?"
answer <- c("Positive tested cases" = positive_tested_top_3)
datasets <- list(
  original = covid19_df,
  allstates = covid19_df_all_states,
  daily = covid19_df_all_states_daily,
  top_10 = covid19_top_10
)
matrices <- list(covid_mat)
vectors <- list(vector_col, countries)
data_structure_list <- list("dataframe" = datasets, "matrix" = matrices, "vector" = vectors)
covid_analysis_list <- list(question, answer, data_structure_list)
# The second element of the list is the answer
covid_analysis_list[[2]]
## Positive tested cases.United States Positive tested cases.Italy
## 0.07 0.05
## Positive tested cases.Turkey
## 0.05