# Understanding the Data
## Loading the dataset from the `covid19.csv` CSV file and quick exploration

``` r
library(readr)

# Read the raw COVID-19 CSV into a data frame.
# NOTE(review): the path is absolute and machine-specific; the file will only
# be found on a machine that has this exact directory layout.
covid_df <- read.csv(
  file = "C:/Users/litan/Downloads/COVID 19 PROJECT/covid19.csv"
)

# Displaying the dimensions of the data (rows, columns)
dim(covid_df)

## [1] 27641    12

# Keep the column names of covid_df in a character vector
vector_cols <- names(covid_df)

# Print the stored column names
vector_cols

##  [1] "Date"             "Country_Region"   "Province_State"   "positive"        
##  [5] "active"           "hospitalized"     "hospitalizedCurr" "recovered"       
##  [9] "death"            "total_tested"     "daily_tested"     "daily_positive"

# Preview the first six rows of the dataset
head(covid_df, n = 6L)

##         Date Country_Region Province_State positive active hospitalized
## 1 2020-01-16        Iceland     All States        3     NA           NA
## 2 2020-01-17        Iceland     All States        4     NA           NA
## 3 2020-01-18        Iceland     All States        7     NA           NA
## 4 2020-01-20    South Korea     All States        1     NA           NA
## 5 2020-01-22  United States     All States        0     NA           NA
## 6 2020-01-22  United States  Massachusetts        0     NA           NA
##   hospitalizedCurr recovered death total_tested daily_tested daily_positive
## 1               NA        NA    NA           NA           NA             NA
## 2               NA        NA    NA           NA           NA              1
## 3               NA        NA    NA           NA           NA              3
## 4               NA        NA    NA            4           NA             NA
## 5               NA        NA     0            0           NA             NA
## 6               NA        NA     0            0           NA             NA

# Compact overview of the dataset: dimensions, column types and first values.
library(tibble)

tibble::glimpse(covid_df)

## Rows: 27,641
## Columns: 12
## $ Date             <chr> "2020-01-16", "2020-01-17", "2020-01-18", "2020-01-20…
## $ Country_Region   <chr> "Iceland", "Iceland", "Iceland", "South Korea", "Unit…
## $ Province_State   <chr> "All States", "All States", "All States", "All States…
## $ positive         <int> 3, 4, 7, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, NA, NA, NA,…
## $ active           <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalized     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalizedCurr <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ recovered        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death            <int> NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA…
## $ total_tested     <dbl> NA, NA, NA, 4, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, NA, NA,…
## $ daily_tested     <int> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 5, 0, 0, 0, NA, …
## $ daily_positive   <int> NA, 1, 3, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA…

The dataset contains 12 columns and 27,641 rows. It provides information on the numbers (per day and cumulatively) of COVID-19 positive cases, deaths, tests performed and hospitalizations for each country. The column names are stored in the variable vector_cols.

This variable contains a character vector.
Calling glimpse() is a good first step because it shows not only the dimensions of the dataset but also the column names, their types, and their first values. It can replace the three previous operations: dim(), colnames(), and head().

Isolating the Rows We Need

Selecting only the rows related to "All States" and removing the Province_State.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep only the country-level rows (Province_State equal to "All States"),
# then drop the Province_State column.
covid_df_all_states <- select(
  filter(covid_df, Province_State == "All States"),
  -Province_State
)

We can remove Province_State without losing information because after the filtering step this column only contains the value "All States".

Isolating the Columns We Need

Creating a dataset for the daily columns from covid_df_all_states dataframe

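Let’s recall the description of the dataset’s columns. Note that Continent_Name and Two_Letter_Country_Code appear in this description but are not among the 12 columns loaded above.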

Date: Date
Continent_Name: Continent names
Two_Letter_Country_Code: Country codes
Country_Region: Country names
Province_State: States/province names; value is All States when state/provincial level data is not available
positive: Cumulative number of positive cases reported.
active: Number of active cases on that day.
hospitalized: Cumulative number of hospitalized cases reported.
hospitalizedCurr: Number of actively hospitalized cases on that day.
recovered: Cumulative number of recovered cases reported.
death: Cumulative number of deaths reported.
total_tested: Cumulative number of tests conducted.
daily_tested: Number of tests conducted on the day; if daily data is unavailable, daily tested is averaged across number of days in between.
daily_positive: Number of positive cases reported on the day; if daily data is unavailable, daily positive is averaged across number of days in between.

# Keep the identifying columns (Date, Country_Region) together with the
# per-day measures: active, hospitalizedCurr, daily_tested, daily_positive.
covid_df_all_states_daily <- select(
  covid_df_all_states,
  Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive
)

# Preview the first rows of the daily dataset
head(covid_df_all_states_daily)

##         Date Country_Region active hospitalizedCurr daily_tested daily_positive
## 1 2020-01-16        Iceland     NA               NA           NA             NA
## 2 2020-01-17        Iceland     NA               NA           NA              1
## 3 2020-01-18        Iceland     NA               NA           NA              3
## 4 2020-01-20    South Korea     NA               NA           NA             NA
## 5 2020-01-22  United States     NA               NA           NA             NA
## 6 2020-01-23  United States     NA               NA            0              0

Extracting the Top Ten countries in the number of tested cases

Summarizing the data based on the `Country_Region` column.

# Aggregate the daily figures per country and rank countries by number of
# tests, highest first.
# na.rm = TRUE is required: the daily columns contain NA values, and a single
# NA would turn a country's whole total into NA, which makes the ordering by
# `tested` (and therefore any "top 10") meaningless.
covid_df_all_states_daily_sum <- covid_df_all_states_daily %>%
  group_by(Country_Region) %>%
  summarise(tested = sum(daily_tested, na.rm = TRUE),
            positive = sum(daily_positive, na.rm = TRUE),
            active = sum(active, na.rm = TRUE),
            hospitalized = sum(hospitalizedCurr, na.rm = TRUE)) %>%
  arrange(desc(tested)) # this is equivalent to `arrange(-tested)`

# Print the per-country summary
covid_df_all_states_daily_sum

## # A tibble: 146 × 5
##    Country_Region tested positive   active hospitalized
##    <chr>           <int>    <int>    <int>        <int>
##  1 Afghanistan        NA       NA       NA           NA
##  2 Albania            NA       NA       NA           NA
##  3 Algeria            NA       NA       NA           NA
##  4 Argentina          NA       NA       NA           NA
##  5 Armenia            NA       NA  1846922           NA
##  6 Australia          NA       NA       NA           NA
##  7 Austria            NA       NA       NA           NA
##  8 Azerbaijan         NA       NA       NA           NA
##  9 Bahrain            NA       NA       NA           NA
## 10 Bangladesh         NA       NA 14479558           NA
## # ℹ 136 more rows

#Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive

Taking the top 10

# Keep the first ten rows of the per-country summary
covid_top_10 <- head(covid_df_all_states_daily_sum, n = 10L)

# Print the ten selected rows
covid_top_10

## # A tibble: 10 × 5
##    Country_Region tested positive   active hospitalized
##    <chr>           <int>    <int>    <int>        <int>
##  1 Afghanistan        NA       NA       NA           NA
##  2 Albania            NA       NA       NA           NA
##  3 Algeria            NA       NA       NA           NA
##  4 Argentina          NA       NA       NA           NA
##  5 Armenia            NA       NA  1846922           NA
##  6 Australia          NA       NA       NA           NA
##  7 Austria            NA       NA       NA           NA
##  8 Azerbaijan         NA       NA       NA           NA
##  9 Bahrain            NA       NA       NA           NA
## 10 Bangladesh         NA       NA 14479558           NA

Identifying the Highest Positive Against Tested Cases

Getting vectors

# Pull each column of covid_top_10 out into its own vector
countries <- covid_top_10[["Country_Region"]]
tested_cases <- covid_top_10[["tested"]]
positive_cases <- covid_top_10[["positive"]]
active_cases <- covid_top_10[["active"]]
hospitalized_cases <- covid_top_10[["hospitalized"]]

Naming vectors

# Label every measure vector with the country it belongs to
positive_cases <- setNames(positive_cases, countries)
tested_cases <- setNames(tested_cases, countries)
active_cases <- setNames(active_cases, countries)
hospitalized_cases <- setNames(hospitalized_cases, countries)

Identifying

# Named vector of positive-case totals for the selected countries
positive_cases

## Afghanistan     Albania     Algeria   Argentina     Armenia   Australia 
##          NA          NA          NA          NA          NA          NA 
##     Austria  Azerbaijan     Bahrain  Bangladesh 
##          NA          NA          NA          NA

# Total over the selected countries
sum(positive_cases)

## [1] NA

# Average per country
mean(positive_cases)

## [1] NA

# Share of each country in the total (same as positive_cases / sum(positive_cases))
prop.table(positive_cases)

## Afghanistan     Albania     Algeria   Argentina     Armenia   Australia 
##          NA          NA          NA          NA          NA          NA 
##     Austria  Azerbaijan     Bahrain  Bangladesh 
##          NA          NA          NA          NA

# Ratio of positive cases to tests, per country
positive_cases / tested_cases

## Afghanistan     Albania     Algeria   Argentina     Armenia   Australia 
##          NA          NA          NA          NA          NA          NA 
##     Austria  Azerbaijan     Bahrain  Bangladesh 
##          NA          NA          NA          NA

Conclusion

# Hard-coded positive/tested ratios for the three retained countries
positive_tested_top_3 <- c(
  "United Kingdom" = 0.11,
  "United States" = 0.10,
  "Turkey" = 0.08
)

Keeping relevant information

# One numeric vector per country, in the order:
# ratio, tested, positive, active, hospitalized
united_kingdom <- c(0.11, 1473672, 166909, 0, 0)
united_states <- c(0.10, 17282363, 1877179, 0, 0)
turkey <- c(0.08, 2031192, 163941, 2980960, 0)

# Build covid_mat with one row per country; byrow = TRUE lays each country
# vector out as a row, and dimnames sets the row and column labels at once.
covid_mat <- matrix(
  c(united_kingdom, united_states, turkey),
  nrow = 3,
  byrow = TRUE,
  dimnames = list(
    c("united_kingdom", "united_states", "turkey"),
    c("Ratio", "tested", "positive", "active", "hospitalized")
  )
)

# Displaying the matrix
covid_mat

##                Ratio   tested positive  active hospitalized
## united_kingdom  0.11  1473672   166909       0            0
## united_states   0.10 17282363  1877179       0            0
## turkey          0.08  2031192   163941 2980960            0

Putting all together

# The question this analysis answers
question <- "Which countries have had the highest number of positive cases against the number of tests?"

# The answer: the top-3 ratios, with names prefixed by "Positive tested cases."
answer <- c("Positive tested cases" = positive_tested_top_3)

# Group the objects created so far by kind of data structure
datasets <- list(
  original = covid_df,
  allstates = covid_df_all_states,
  daily = covid_df_all_states_daily,
  top_10 = covid_top_10
)
matrices <- list(covid_mat)
vectors <- list(vector_cols, countries)

data_structure_list <- list(
  dataframe = datasets,
  matrix = matrices,
  vector = vectors
)

# Bundle question, answer and data structures into a single list
covid_analysis_list <- list(question, answer, data_structure_list)

# Second element of the list, i.e. the answer
covid_analysis_list[[2]]

## Positive tested cases.United Kingdom  Positive tested cases.United States 
##                                 0.11                                 0.10 
##         Positive tested cases.Turkey 
##                                 0.08

Data Structures in R: Guided Project Solutions

Dataquest

6/6/2020

Isolating the Rows We Need

Isolating the Columns We Need

Extracting the Top Ten countries in the number of tested cases

Summarizing the data based on the `Country_Region` column.

Taking the top 10

Identifying the Highest Positive Against Tested Cases

Getting vectors

Naming vectors

Identifying

Conclusion

Keeping relevant information

Putting all together

Data Structures in R: Guided Project Solutions

Dataquest

6/6/2020

Isolating the Rows We Need

Isolating the Columns We Need

Extracting the Top Ten countries in the number of tested cases

Summarizing the data based on the Country_Region column.

Taking the top 10

Identifying the Highest Positive Against Tested Cases

Getting vectors

Naming vectors

Identifying

Conclusion

Keeping relevant information

Putting all together

Summarizing the data based on the `Country_Region` column.