library(readr)
# Loading the CSV file dataset
covid_df <- read_csv("covid19.csv")
## Parsed with column specification:
## cols(
## Date = col_date(format = ""),
## Continent_Name = col_character(),
## Two_Letter_Country_Code = col_character(),
## Country_Region = col_character(),
## Province_State = col_character(),
## positive = col_double(),
## hospitalized = col_double(),
## recovered = col_double(),
## death = col_double(),
## total_tested = col_double(),
## active = col_double(),
## hospitalizedCurr = col_double(),
## daily_tested = col_double(),
## daily_positive = col_double()
## )
# Determine the dimension of the dataframe and show
dim(covid_df)
## [1] 10903 14
# Determine and store the column names of dataframe
vector_cols <- colnames(covid_df)
# Display the content of vector_cols
vector_cols # Is a character vector
## [1] "Date" "Continent_Name"
## [3] "Two_Letter_Country_Code" "Country_Region"
## [5] "Province_State" "positive"
## [7] "hospitalized" "recovered"
## [9] "death" "total_tested"
## [11] "active" "hospitalizedCurr"
## [13] "daily_tested" "daily_positive"
# Display first few rows of dataset
head(covid_df)
## # A tibble: 6 x 14
## Date Continent_Name Two_Letter_Coun~ Country_Region Province_State
## <date> <chr> <chr> <chr> <chr>
## 1 2020-01-20 Asia KR South Korea All States
## 2 2020-01-22 North America US United States All States
## 3 2020-01-22 North America US United States Washington
## 4 2020-01-23 North America US United States All States
## 5 2020-01-23 North America US United States Washington
## 6 2020-01-24 Asia KR South Korea All States
## # ... with 9 more variables: positive <dbl>, hospitalized <dbl>,
## # recovered <dbl>, death <dbl>, total_tested <dbl>, active <dbl>,
## # hospitalizedCurr <dbl>, daily_tested <dbl>, daily_positive <dbl>
#Display total summary of dataset using library(tibble)
library(tibble)
glimpse(covid_df) # function `glimpse()` shows dimensions, column names, column types, and its rows
## Rows: 10,903
## Columns: 14
## $ Date <date> 2020-01-20, 2020-01-22, 2020-01-22, 2020-0...
## $ Continent_Name <chr> "Asia", "North America", "North America", "...
## $ Two_Letter_Country_Code <chr> "KR", "US", "US", "US", "US", "KR", "US", "...
## $ Country_Region <chr> "South Korea", "United States", "United Sta...
## $ Province_State <chr> "All States", "All States", "Washington", "...
## $ positive <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 4, 0, 3, 0, 0, 0, 0...
## $ hospitalized <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ recovered <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ death <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ total_tested <dbl> 4, 1, 1, 1, 1, 27, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ active <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ hospitalizedCurr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ daily_tested <dbl> 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ daily_positive <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Filter rows related to "All States" under `Province_States` and remove `Province_States` column
covid_df_all_states <- covid_df %>%
filter(Province_State == "All States") %>%
select(-Province_State)
# We can remove `Province_State` without losing relevant information because we filtered for `All States` first
# Extract the columns related to the daily measures
covid_df_all_states_daily <- covid_df_all_states %>%
select(Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive)
# Extract the top ten cases countries data
covid_df_all_states_daily_sum <- covid_df_all_states_daily %>% group_by(Country_Region) %>%
summarise(
tested = sum(daily_tested),
positive = sum(daily_positive),
active = sum(active),
hospitalized = sum(hospitalizedCurr)) %>%
arrange(-tested)
## `summarise()` ungrouping output (override with `.groups` argument)
covid_df_all_states_daily_sum # Display
## # A tibble: 108 x 5
## Country_Region tested positive active hospitalized
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 United States 17282363 1877179 0 0
## 2 Russia 10542266 406368 6924890 0
## 3 Italy 4091291 251710 6202214 1699003
## 4 India 3692851 60959 0 0
## 5 Turkey 2031192 163941 2980960 0
## 6 Canada 1654779 90873 56454 0
## 7 United Kingdom 1473672 166909 0 0
## 8 Australia 1252900 7200 134586 6655
## 9 Peru 976790 59497 0 0
## 10 Poland 928256 23987 538203 0
## # ... with 98 more rows
covid_top_10 <- head(covid_df_all_states_daily_sum, 10) # inserting ", 10" displays top 10 countries
# Which countries have had the highest number of positive cases against the number of tests?
# creating vectors
countries <- covid_top_10$Country_Region
tested_cases <- covid_top_10$tested
positive_cases <- covid_top_10$positive
active_cases <- covid_top_10$active
hospitalized_cases <- covid_top_10$hospitalized
# naming vectors
names(tested_cases) <- countries
names(positive_cases) <- countries
names(active_cases) <- countries
names(hospitalized_cases) <- countries
positive_cases
## United States Russia Italy India Turkey
## 1877179 406368 251710 60959 163941
## Canada United Kingdom Australia Peru Poland
## 90873 166909 7200 59497 23987
sum(positive_cases)
## [1] 3108623
mean(positive_cases) # why is mean relevant here?
## [1] 310862.3
positive_cases / sum(positive_cases)
## United States Russia Italy India Turkey
## 0.603861903 0.130722831 0.080971543 0.019609647 0.052737498
## Canada United Kingdom Australia Peru Poland
## 0.029232557 0.053692262 0.002316138 0.019139342 0.007716278
positive_cases / tested_cases
## United States Russia Italy India Turkey
## 0.108618191 0.038546552 0.061523368 0.016507300 0.080711720
## Canada United Kingdom Australia Peru Poland
## 0.054915490 0.113260617 0.005746668 0.060910738 0.025840932
positive_tested_top_3 <- c("United Kingdom" = 0.11, "United States" = 0.10, "Turkey" = 0.08)
# Creating additional vectors...
united_kingdom <- c(0.11, 1473672, 166909, 0, 0)
united_states <- c(0.10, 17282363, 1877179, 0, 0)
turkey <- c(0.08, 2031192, 163941, 2980960, 0)
# Create matrix combining additional vectors using rbind()
covid_mat <- rbind(united_kingdom, united_states, turkey)
# Naming columns
colnames(covid_mat) <- c("Ratio", "tested", "positive", "active", "hospitalized")
covid_mat # Display matrix
## Ratio tested positive active hospitalized
## united_kingdom 0.11 1473672 166909 0 0
## united_states 0.10 17282363 1877179 0 0
## turkey 0.08 2031192 163941 2980960 0
question <- "Which countries have had the highest number of positive cases against the number of tests?"
answer <- c("Positive tested cases" = positive_tested_top_3)
dataset <- list(
original = covid_df,
AllStates = covid_df_all_states,
Daily = covid_df_all_states_daily,
top_10 = covid_top_10
)
matrices <- list(covid_mat)
vectors <- list(vector_cols, countries)
data_structure_list <- list("dataframe" = dataset, "matrix" = matrices, "vector" = vectors)
covid_analysis_list <- list(question, answer, data_structure_list)
covid_analysis_list[[2]]
## Positive tested cases.United Kingdom Positive tested cases.United States
## 0.11 0.10
## Positive tested cases.Turkey
## 0.08