Understanding the data

Loading the data from the ‘covid19.csv’ CSV file

library(readr)

# Read the raw COVID-19 dataset into a base data frame with read.csv().
# NOTE(review): the path is absolute and machine-specific; confirm it before
# running this on another machine.
covid19_df <- read.csv(
  file = "E:/DATA SCIENCE STORE/Ireneaus Nyame/COVID19_WORLD/covid19.csv"
)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Number of rows and columns in the dataset
print(dim(covid19_df))
## [1] 27641    12
# Keep the column names in a character vector, then display them
vector_col <- names(covid19_df)
print(vector_col)
##  [1] "Date"             "Country_Region"   "Province_State"   "positive"        
##  [5] "active"           "hospitalized"     "hospitalizedCurr" "recovered"       
##  [9] "death"            "total_tested"     "daily_tested"     "daily_positive"
# Preview the first six rows of the dataset
head(covid19_df, n = 6)
##         Date Country_Region Province_State positive active hospitalized
## 1 2020-01-16        Iceland     All States        3     NA           NA
## 2 2020-01-17        Iceland     All States        4     NA           NA
## 3 2020-01-18        Iceland     All States        7     NA           NA
## 4 2020-01-20    South Korea     All States        1     NA           NA
## 5 2020-01-22  United States     All States        0     NA           NA
## 6 2020-01-22  United States  Massachusetts        0     NA           NA
##   hospitalizedCurr recovered death total_tested daily_tested daily_positive
## 1               NA        NA    NA           NA           NA             NA
## 2               NA        NA    NA           NA           NA              1
## 3               NA        NA    NA           NA           NA              3
## 4               NA        NA    NA            4           NA             NA
## 5               NA        NA     0            0           NA             NA
## 6               NA        NA     0            0           NA             NA
# Show the dataset as a tibble: a compact print with the first rows and the
# type of every column
as_tibble(covid19_df)
## # A tibble: 27,641 × 12
##    Date       Country_Region Province_State positive active hospitalized
##    <chr>      <chr>          <chr>             <int>  <int>        <int>
##  1 2020-01-16 Iceland        All States            3     NA           NA
##  2 2020-01-17 Iceland        All States            4     NA           NA
##  3 2020-01-18 Iceland        All States            7     NA           NA
##  4 2020-01-20 South Korea    All States            1     NA           NA
##  5 2020-01-22 United States  All States            0     NA           NA
##  6 2020-01-22 United States  Massachusetts         0     NA           NA
##  7 2020-01-22 United States  Washington            0     NA           NA
##  8 2020-01-23 United States  All States            0     NA           NA
##  9 2020-01-23 United States  Massachusetts         0     NA           NA
## 10 2020-01-23 United States  Washington            0     NA           NA
## # ℹ 27,631 more rows
## # ℹ 6 more variables: hospitalizedCurr <int>, recovered <int>, death <int>,
## #   total_tested <dbl>, daily_tested <int>, daily_positive <int>
# Transposed overview: one line per column with its type and first values
covid19_df %>% glimpse()
## Rows: 27,641
## Columns: 12
## $ Date             <chr> "2020-01-16", "2020-01-17", "2020-01-18", "2020-01-20…
## $ Country_Region   <chr> "Iceland", "Iceland", "Iceland", "South Korea", "Unit…
## $ Province_State   <chr> "All States", "All States", "All States", "All States…
## $ positive         <int> 3, 4, 7, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, NA, NA, NA,…
## $ active           <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalized     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalizedCurr <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ recovered        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death            <int> NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA…
## $ total_tested     <dbl> NA, NA, NA, 4, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, NA, NA,…
## $ daily_tested     <int> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 5, 0, 0, 0, NA, …
## $ daily_positive   <int> NA, 1, 3, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA…

The dataset contains 12 columns and 27,641 rows. For each country, it provides the daily and cumulative numbers of COVID-19 positive cases, deaths, tests performed and hospitalizations. The column names are stored in the variable vector_col.

  1. This variable contains a character vector.

  2. The use of the function glimpse() is the very first operation to do because we don’t only learn about the dimensions of the database but also about the names of the first columns and their types and content. It can replace the three previous operations: dim(), colnames(), and head().

Isolating the Rows We Need

Selecting the rows related to "All States" and removing the Province_State column

# Keep only the rows whose Province_State is "All States", then drop the
# Province_State column, which holds a single value after the filter.
covid19_df_all_states <- covid19_df %>%
  filter(Province_State %in% "All States") %>%
  select(!Province_State)

We can remove the Province_State column without losing information because, after filtering, it contains only the value "All States".

Isolating the Column We Need

Creating a dataset for the daily columns from the covid19_df_all_states dataframe

Let’s recall the description of the dataset’s columns

  1. Date: Date
  2. Country_Region: Country names
  3. Province_State: States/province names; value is All States when state/provincial level data is not available
  4. positive: Cumulative number of positive cases reported.
  5. active: Number of active cases on that day.
  6. hospitalized: Cumulative number of hospitalized cases reported.
  7. hospitalizedCurr: Number of actively hospitalized cases on that day.
  8. recovered: Cumulative number of recovered cases reported.
  9. death: Cumulative number of deaths reported.
  10. total_tested: Cumulative number of tests conducted.
  11. daily_tested: Number of tests conducted on the day; if daily data is unavailable, daily tested is averaged across number of days in between.
  12. daily_positive: Number of positive cases reported on the day; if daily data is unavailable, daily positive is averaged across number of days in between.
# Keep the date, the country and the four day-level measures
covid19_df_all_states_daily <- select(
  covid19_df_all_states,
  Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive
)

# Preview the first six rows of the reduced dataset
head(covid19_df_all_states_daily, n = 6)
##         Date Country_Region active hospitalizedCurr daily_tested daily_positive
## 1 2020-01-16        Iceland     NA               NA           NA             NA
## 2 2020-01-17        Iceland     NA               NA           NA              1
## 3 2020-01-18        Iceland     NA               NA           NA              3
## 4 2020-01-20    South Korea     NA               NA           NA             NA
## 5 2020-01-22  United States     NA               NA           NA             NA
## 6 2020-01-23  United States     NA               NA            0              0

Extracting the Top Ten Countries in the number of tested cases

Summarizing the data based on the Country_Region column

# Per-country totals of the four measures (missing values ignored), ordered
# from the most tested country to the least.
# The `hospitalzed` spelling is kept on purpose: the column is read under that
# exact name further down (covid19_top_10$hospitalzed).
covid19_df_all_states_daily_sum <- covid19_df_all_states_daily %>%
  group_by(Country_Region) %>%
  summarise(
    tested = sum(daily_tested, na.rm = TRUE),
    positive = sum(daily_positive, na.rm = TRUE),
    active = sum(active, na.rm = TRUE),
    hospitalzed = sum(hospitalizedCurr, na.rm = TRUE)
  ) %>%
  arrange(desc(tested)) # this is similar to `arrange(-tested)`

print(covid19_df_all_states_daily_sum)
## # A tibble: 146 × 5
##    Country_Region    tested positive   active hospitalzed
##    <chr>              <int>    <int>    <int>       <int>
##  1 United States  136937092  9850413        0           0
##  2 India          106267322    60959        0           0
##  3 Italy           17370389   934875 17176595     2401146
##  4 Russia          11319603   432269  7621860           0
##  5 Canada           9873530   259992  1354390           0
##  6 Australia        8874298        0   394222       36384
##  7 Israel           4915043      402        0       22726
##  8 Turkey           4351655   221499  4025622           0
##  9 Peru             3578707    59497        0           0
## 10 Brazil           3474441    10321        0           0
## # ℹ 136 more rows
#Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive

Isolating the Top Ten

# The summary is already sorted by `tested` in decreasing order, so its first
# ten rows are the ten countries with the most tests.
covid19_top_10 <- covid19_df_all_states_daily_sum %>%
  head(n = 10)
print(covid19_top_10)
## # A tibble: 10 × 5
##    Country_Region    tested positive   active hospitalzed
##    <chr>              <int>    <int>    <int>       <int>
##  1 United States  136937092  9850413        0           0
##  2 India          106267322    60959        0           0
##  3 Italy           17370389   934875 17176595     2401146
##  4 Russia          11319603   432269  7621860           0
##  5 Canada           9873530   259992  1354390           0
##  6 Australia        8874298        0   394222       36384
##  7 Israel           4915043      402        0       22726
##  8 Turkey           4351655   221499  4025622           0
##  9 Peru             3578707    59497        0           0
## 10 Brazil           3474441    10321        0           0

Identifying the Highest Positive Against Tested Cases

Getting the vectors

# Pull each column of the top-ten table out as a plain vector
countries <- covid19_top_10[["Country_Region"]]
tested_cases <- covid19_top_10[["tested"]]
positive_cases <- covid19_top_10[["positive"]]
active_cases <- covid19_top_10[["active"]]
hospitalized_cases <- covid19_top_10[["hospitalzed"]]

Naming the vectors

# Label every measure with its country so that results print with names and
# can be looked up by country
positive_cases <- setNames(positive_cases, countries)
tested_cases <- setNames(tested_cases, countries)
active_cases <- setNames(active_cases, countries)
hospitalized_cases <- setNames(hospitalized_cases, countries)

Identifying

# Positive cases per country
print(positive_cases)
## United States         India         Italy        Russia        Canada 
##       9850413         60959        934875        432269        259992 
##     Australia        Israel        Turkey          Peru        Brazil 
##             0           402        221499         59497         10321
# Total over the ten countries
sum(positive_cases)
## [1] 11830227
# Average per country
mean(positive_cases)
## [1] 1183023
# Share of the total held by each country
prop.table(positive_cases)
## United States         India         Italy        Russia        Canada 
##  8.326478e-01  5.152817e-03  7.902427e-02  3.653937e-02  2.197692e-02 
##     Australia        Israel        Turkey          Peru        Brazil 
##  0.000000e+00  3.398075e-05  1.872314e-02  5.029236e-03  8.724262e-04

Calculating the positive case rate

The rate at which a tested case will be a positive case

# Element-wise ratio of positive cases to tests, one value per country.
# The values print in scientific notation, so read the exponent carefully:
# e.g. 5.736382e-04 is about 0.0006, not 0.06.
positive_cases / tested_cases
## United States         India         Italy        Russia        Canada 
##  7.193386e-02  5.736382e-04  5.382004e-02  3.818765e-02  2.633222e-02 
##     Australia        Israel        Turkey          Peru        Brazil 
##  0.000000e+00  8.178972e-05  5.089994e-02  1.662528e-02  2.970550e-03

Conclusion

# Positive-to-tested ratio of the ten most-tested countries, highest first.
# The top three are taken from the computed values instead of being copied by
# hand from the printed output: India's ratio is 5.736382e-04 (about 0.0006,
# not 0.06), so the three highest ratios belong to the United States, Italy
# and Turkey.
positive_ratio_ranked <- sort(positive_cases / tested_cases, decreasing = TRUE)
positive_tested_top_3 <- round(head(positive_ratio_ranked, 3), 2)

Keeping relevant information about the top 3 countries with positive cases for covid19

# Creating a matrix for the top 3 countries: one row per country, with the
# figures looked up by country name in the named vectors
top_3_countries <- names(positive_tested_top_3)
covid_mat <- cbind(
  positive_tested_top_3,
  tested_cases[top_3_countries],
  positive_cases[top_3_countries],
  active_cases[top_3_countries],
  hospitalized_cases[top_3_countries]
)

# Naming the rows and columns of the matrix created
# (spaces in country names become underscores, e.g. "United_States")
rownames(covid_mat) <- gsub(" ", "_", top_3_countries, fixed = TRUE)
colnames(covid_mat) <- c("Ratio", "Tested", "Positive", "Active", "Hospitalized")

# Viewing the matrix created
covid_mat
##               Ratio    Tested Positive   Active Hospitalized
## United_States  0.07 136937092  9850413        0            0
## Italy          0.05  17370389   934875 17176595      2401146
## Turkey         0.05   4351655   221499  4025622            0

Finally…

# The question the analysis answers
question <- "Which countries have had the highest number of positive cases against the number of tests?"

# The answer: the three highest positive-to-tested ratios, with each name
# prefixed by "Positive tested cases."
answer <- c("Positive tested cases" = positive_tested_top_3)

# Every data frame built along the way, from the raw data to the top ten
datasets <- list(
  original = covid19_df,
  allstates = covid19_df_all_states,
  daily = covid19_df_all_states_daily,
  top_10 = covid19_top_10
)

# The matrix and the character vectors created during the analysis
matrices <- list(covid_mat)
vectors <- list(vector_col, countries)

# Grouping the objects by kind of data structure
data_structure_list <- list("dataframe" = datasets, "matrix" = matrices, "vector" = vectors)

# Question, answer and supporting objects in a single list
covid_analysis_list <- list(question, answer, data_structure_list)

# Displaying the answer (second element of the list)
covid_analysis_list[[2]]
## Positive tested cases.United States         Positive tested cases.Italy 
##                                0.07                                0.05 
##        Positive tested cases.Turkey 
##                                0.05