# Understanding the Data
## Loading the dataset from the `covid19.csv` CSV file and quick exploration
``` r
library(readr)
# Loading the dataset
covid_df <- read.csv("C:/Users/litan/Downloads/COVID 19 PROJECT/covid19.csv")
# Displaing the dimension of the data:
dim(covid_df)
## [1] 27641 12
# Storing the column names in a variable
vector_cols <- colnames(covid_df)
# Displaing the variable vector_cols
vector_cols
## [1] "Date" "Country_Region" "Province_State" "positive"
## [5] "active" "hospitalized" "hospitalizedCurr" "recovered"
## [9] "death" "total_tested" "daily_tested" "daily_positive"
# Showing the first few rows of the dataset
head(covid_df)
## Date Country_Region Province_State positive active hospitalized
## 1 2020-01-16 Iceland All States 3 NA NA
## 2 2020-01-17 Iceland All States 4 NA NA
## 3 2020-01-18 Iceland All States 7 NA NA
## 4 2020-01-20 South Korea All States 1 NA NA
## 5 2020-01-22 United States All States 0 NA NA
## 6 2020-01-22 United States Massachusetts 0 NA NA
## hospitalizedCurr recovered death total_tested daily_tested daily_positive
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA 1
## 3 NA NA NA NA NA 3
## 4 NA NA NA 4 NA NA
## 5 NA NA 0 0 NA NA
## 6 NA NA 0 0 NA NA
# Showing a global view of the dataset.
library(tibble)
glimpse(covid_df)
## Rows: 27,641
## Columns: 12
## $ Date <chr> "2020-01-16", "2020-01-17", "2020-01-18", "2020-01-20…
## $ Country_Region <chr> "Iceland", "Iceland", "Iceland", "South Korea", "Unit…
## $ Province_State <chr> "All States", "All States", "All States", "All States…
## $ positive <int> 3, 4, 7, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, NA, NA, NA,…
## $ active <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalized <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalizedCurr <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ recovered <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death <int> NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA…
## $ total_tested <dbl> NA, NA, NA, 4, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, NA, NA,…
## $ daily_tested <int> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 5, 0, 0, 0, NA, …
## $ daily_positive <int> NA, 1, 3, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA…
The dataset contains 14 columns and 10,903
rows. This database provides information on the numbers (per day and
cumulatively) of COVID-19 positive cases, deaths, tests performed and
hospitalizations for each country through the column’s names store in
the variable vector_cols.
This variable contains a character vector.
The use of the function glimpse() is the very first
operation to do because we don’t only learn about the dimensions of the
database but also about the names of the first columns and their types
and content. It can replace the three previous operations:
dim(), colnames(), and
head().
"All States" and
removing the Province_State.library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Filter the "All States" Province states and remove the `Province_State` column
covid_df_all_states <- covid_df %>%
filter(Province_State == "All States") %>%
select(-Province_State)
Province_State without loosing
information because after the filtering step this column only contains
the value "All States".covid_df_all_states dataframeLet’s recall the description of the dataset’s columns.
Date: DateContinent_Name: Continent namesTwo_Letter_Country_Code: Country codesCountry_Region: Country namesProvince_State: States/province names; value is
All States when state/provincial level data is not
availablepositive: Cumulative number of positive cases
reported.active: Number of actively cases on that
day.hospitalized: Cumulative number of hospitalized cases
reported.hospitalizedCurr: Number of actively hospitalized cases
on that day.recovered: Cumulative number of recovered cases
reported.death: Cumulative number of deaths reported.total_tested: Cumulative number of tests
conducted.daily_tested: Number of tests conducted on the
day; if daily data is unavailable, daily tested is
averaged across number of days in between.daily_positive: Number of positive cases reported on
the day; if daily data is unavailable, daily positive
is averaged across number of days in.# Selecting the columns with cumulative numbers
covid_df_all_states_daily <- covid_df_all_states %>%
select(Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive)
head(covid_df_all_states_daily)
## Date Country_Region active hospitalizedCurr daily_tested daily_positive
## 1 2020-01-16 Iceland NA NA NA NA
## 2 2020-01-17 Iceland NA NA NA 1
## 3 2020-01-18 Iceland NA NA NA 3
## 4 2020-01-20 South Korea NA NA NA NA
## 5 2020-01-22 United States NA NA NA NA
## 6 2020-01-23 United States NA NA 0 0
Country_Region
column.covid_df_all_states_daily_sum <- covid_df_all_states_daily %>%
group_by(Country_Region) %>%
summarise(tested = sum(daily_tested),
positive = sum(daily_positive),
active = sum(active),
hospitalized = sum(hospitalizedCurr)) %>%
arrange(desc(tested)) #this is equivalent to `arrange(-tested)`
covid_df_all_states_daily_sum
## # A tibble: 146 × 5
## Country_Region tested positive active hospitalized
## <chr> <int> <int> <int> <int>
## 1 Afghanistan NA NA NA NA
## 2 Albania NA NA NA NA
## 3 Algeria NA NA NA NA
## 4 Argentina NA NA NA NA
## 5 Armenia NA NA 1846922 NA
## 6 Australia NA NA NA NA
## 7 Austria NA NA NA NA
## 8 Azerbaijan NA NA NA NA
## 9 Bahrain NA NA NA NA
## 10 Bangladesh NA NA 14479558 NA
## # ℹ 136 more rows
#Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive
covid_top_10 <- head(covid_df_all_states_daily_sum, 10)
covid_top_10
## # A tibble: 10 × 5
## Country_Region tested positive active hospitalized
## <chr> <int> <int> <int> <int>
## 1 Afghanistan NA NA NA NA
## 2 Albania NA NA NA NA
## 3 Algeria NA NA NA NA
## 4 Argentina NA NA NA NA
## 5 Armenia NA NA 1846922 NA
## 6 Australia NA NA NA NA
## 7 Austria NA NA NA NA
## 8 Azerbaijan NA NA NA NA
## 9 Bahrain NA NA NA NA
## 10 Bangladesh NA NA 14479558 NA
countries <- covid_top_10$Country_Region
tested_cases <- covid_top_10$tested
positive_cases <- covid_top_10$positive
active_cases <- covid_top_10$active
hospitalized_cases <- covid_top_10$hospitalized
names(positive_cases) <- countries
names(tested_cases) <- countries
names(active_cases) <- countries
names(hospitalized_cases) <- countries
positive_cases
## Afghanistan Albania Algeria Argentina Armenia Australia
## NA NA NA NA NA NA
## Austria Azerbaijan Bahrain Bangladesh
## NA NA NA NA
sum(positive_cases)
## [1] NA
mean(positive_cases)
## [1] NA
positive_cases/sum(positive_cases)
## Afghanistan Albania Algeria Argentina Armenia Australia
## NA NA NA NA NA NA
## Austria Azerbaijan Bahrain Bangladesh
## NA NA NA NA
positive_cases/tested_cases
## Afghanistan Albania Algeria Argentina Armenia Australia
## NA NA NA NA NA NA
## Austria Azerbaijan Bahrain Bangladesh
## NA NA NA NA
positive_tested_top_3 <- c("United Kingdom" = 0.11, "United States" = 0.10, "Turkey" = 0.08)
# Creating vectors
united_kingdom <- c(0.11, 1473672, 166909, 0, 0)
united_states <- c(0.10, 17282363, 1877179, 0, 0)
turkey <- c(0.08, 2031192, 163941, 2980960, 0)
# Creating the matrix covid_mat
covid_mat <- rbind(united_kingdom, united_states, turkey)
# Naming columns
colnames(covid_mat) <- c("Ratio", "tested", "positive", "active", "hospitalized")
#d Displaying the matrix
covid_mat
## Ratio tested positive active hospitalized
## united_kingdom 0.11 1473672 166909 0 0
## united_states 0.10 17282363 1877179 0 0
## turkey 0.08 2031192 163941 2980960 0
question <- "Which countries have had the highest number of positive cases against the number of tests?"
answer <- c("Positive tested cases" = positive_tested_top_3)
datasets <- list(
original = covid_df,
allstates = covid_df_all_states,
daily = covid_df_all_states_daily,
top_10 = covid_top_10
)
matrices <- list(covid_mat)
vectors <- list(vector_cols, countries)
data_structure_list <- list("dataframe" = datasets, "matrix" = matrices, "vector" = vectors)
covid_analysis_list <- list(question, answer, data_structure_list)
covid_analysis_list[[2]]
## Positive tested cases.United Kingdom Positive tested cases.United States
## 0.11 0.10
## Positive tested cases.Turkey
## 0.08