## Used pacman to load the tidyverse and lubridate packages; lubridate is needed later to work with the dates variable in the dataset
pacman::p_load(tidyverse, lubridate)
## Loaded in dataset. The file is read through its full path instead of calling
## setwd(), so running this script does not change the working directory of the
## R session.
country_vaccinations <- read_csv(
  file.path(path.expand("~/DATA 110/R_Datasets"), "country_vaccinations.csv")
)
##
## -- Column specification --------------------------------------------------------
## cols(
## country = col_character(),
## iso_code = col_character(),
## date = col_date(format = ""),
## total_vaccinations = col_double(),
## people_vaccinated = col_double(),
## people_fully_vaccinated = col_double(),
## daily_vaccinations_raw = col_double(),
## daily_vaccinations = col_double(),
## total_vaccinations_per_hundred = col_double(),
## people_vaccinated_per_hundred = col_double(),
## people_fully_vaccinated_per_hundred = col_double(),
## daily_vaccinations_per_million = col_double(),
## vaccines = col_character(),
## source_name = col_character(),
## source_website = col_character()
## )
## Dropped every row that contains an NA in any column (na.omit works row-wise),
## then previewed the first rows of the cleaned table
vac_nona <- country_vaccinations %>%
  na.omit()
head(vac_nona, n = 6)
## # A tibble: 6 x 15
## country iso_code date total_vaccinati~ people_vaccinat~ people_fully_va~
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 Albania ALB 2021-02-18 3049 2438 611
## 2 Argent~ ARG 2021-01-21 265724 249372 16352
## 3 Argent~ ARG 2021-01-22 279602 254456 25146
## 4 Argent~ ARG 2021-01-23 288064 258876 29188
## 5 Argent~ ARG 2021-01-24 292023 260036 31987
## 6 Argent~ ARG 2021-01-25 292386 260122 32264
## # ... with 9 more variables: daily_vaccinations_raw <dbl>,
## # daily_vaccinations <dbl>, total_vaccinations_per_hundred <dbl>,
## # people_vaccinated_per_hundred <dbl>,
## # people_fully_vaccinated_per_hundred <dbl>,
## # daily_vaccinations_per_million <dbl>, vaccines <chr>, source_name <chr>,
## # source_website <chr>
## Kept only the variables of interest (country, date and daily vaccinations)
## with dplyr's select and stored the result as 'data1'. The goal is to see the
## relationship over time between countries and daily vaccinations.
data1 <- select(vac_nona, country, date, daily_vaccinations)
head(data1, n = 6)
## # A tibble: 6 x 3
## country date daily_vaccinations
## <chr> <date> <dbl>
## 1 Albania 2021-02-18 254
## 2 Argentina 2021-01-21 11704
## 3 Argentina 2021-01-22 11263
## 4 Argentina 2021-01-23 11124
## 5 Argentina 2021-01-24 10342
## 6 Argentina 2021-01-25 9046
## Because there are so many countries, the analysis is restricted to 8 that,
## after quick research, were ahead in covid vaccinations: Germany, the United
## Kingdom, USA, Canada, France, Israel, Brazil, and Chile.
data2 <- filter(
  data1,
  country %in% c(
    "Germany", "United Kingdom", "United States", "Canada",
    "France", "Israel", "Chile", "Brazil"
  )
)
head(data2, n = 6)
## # A tibble: 6 x 3
## country date daily_vaccinations
## <chr> <date> <dbl>
## 1 Brazil 2021-02-06 199739
## 2 Brazil 2021-02-07 211375
## 3 Brazil 2021-02-08 211604
## 4 Brazil 2021-02-09 218237
## 5 Brazil 2021-02-10 228375
## 6 Brazil 2021-02-11 190540
## Inspected the structure of data2 to check how each column, in particular
## 'date', is stored
data2 %>%
  str()
## tibble [377 x 3] (S3: tbl_df/tbl/data.frame)
## $ country : chr [1:377] "Brazil" "Brazil" "Brazil" "Brazil" ...
## $ date : Date[1:377], format: "2021-02-06" "2021-02-07" ...
## $ daily_vaccinations: num [1:377] 199739 211375 211604 218237 228375 ...
## - attr(*, "na.action")= 'omit' Named int [1:3590] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:3590] "1" "2" "3" "4" ...
## Passed the 'date' column through lubridate's ymd() (lubridate was loaded via
## pacman) so it is in year-month-day Date form before filtering on dates.
## NOTE(review): the read_csv column specification and the str() output above
## show 'date' as a Date already, so this looks like a safeguard - confirm.
data2[["date"]] <- ymd(data2[["date"]])
## Used dplyr 'filter' command in order to choose only February 2021 dates, so
## as to make the final visualization less daunting. The year is checked along
## with the month: month(date) == 2 on its own would also match February of any
## other year present in the data, which would not fit a plot labelled
## "Feb 2021".
feb_vacc <- data2 %>%
  filter(year(date) == 2021, month(date) == 2)
## Line graph (with point markers) of daily covid vaccinations during February
## for the 8 selected countries, one coloured line per country. The title and
## both axis labels are set explicitly.
plot1 <- ggplot(
  feb_vacc,
  aes(x = date, y = daily_vaccinations, group = country, colour = country)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "COVID Vaccinations in 8 Countries (Feb 2021)",
    x = "February 2021",
    y = "Daily Vaccinations"
  )
plot1
## To see how the vaccination process has been doing more recently, this takes
## the most updated date in the data set that was available for all countries,
## March 3rd, 2021. The total vaccinations per hundred variable is used because
## it is adjusted to population and allows a more objective comparison. Rows
## are narrowed with dplyr 'filter' and columns with dplyr 'select'.
data3 <- vac_nona %>%
  filter(
    country %in% c(
      "Canada", "Germany", "United Kingdom", "United States",
      "Israel", "Brazil", "Chile", "France"
    ),
    date == as.Date("2021-03-03")
  ) %>%
  select(country, date, total_vaccinations_per_hundred)
head(data3, n = 6)
## # A tibble: 6 x 3
## country date total_vaccinations_per_hundred
## <chr> <date> <dbl>
## 1 Brazil 2021-03-03 4.36
## 2 Canada 2021-03-03 5.54
## 3 Chile 2021-03-03 20.8
## 4 France 2021-03-03 7.4
## 5 Germany 2021-03-03 8.13
## 6 Israel 2021-03-03 97.5
## I decided that I wanted to visualize this data as a barplot to make it easier to compare the countries. So, I created a data frame with the countries and their corresponding total vaccination numbers (adjusted to population) and named this data frame 'vac_adj'
library(RColorBrewer)
library(ggplot2)
## Built 'vac_adj' directly from 'data3' instead of typing the numbers in by
## hand, so the plotted values always match the dataset. The two long country
## names are shortened to "UK" and "US" for the axis labels, and the result is
## converted to a plain data frame.
vac_adj <- data3 %>%
  mutate(
    country = recode(country, "United Kingdom" = "UK", "United States" = "US")
  ) %>%
  select(country, total_vaccinations_per_hundred) %>%
  as.data.frame()
head(vac_adj)
## country total_vaccinations_per_hundred
## 1 Brazil 4.36
## 2 Canada 5.54
## 3 Chile 20.83
## 4 France 7.40
## 5 Germany 8.13
## 6 Israel 97.52
## Bar plot of total vaccinations per hundred people, one bar per country,
## drawn with ggplot2. Bars are filled with the ColorBrewer "Set3" palette to
## make the plot more visually appealing, and the title and both axis labels
## are set explicitly.
ggplot(
  vac_adj,
  aes(x = country, y = total_vaccinations_per_hundred, fill = country)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Total COVID Vaccinations per Hundred People (as of March 3rd)",
    x = "Country",
    y = "Total Vaccinations/Hundred"
  )
Short Essay - Asma Tariq
DATA 110 - Project 1
For this project I wanted to explore the status of the world when it comes to COVID-19 vaccinations. I decided to use the dataset titled “COVID-19 World Vaccination Progress – Daily and Total Vaccination for COVID-19 in the World” published by Gabriel Preda on Kaggle. I chose this dataset because of the large amount of information it offers, including total vaccinations, people fully vaccinated, people vaccinated per hundred, and the vaccines each country is administering, as well as many other variables. The dataset includes numerical variables, which encompass all the vaccination numbers, and categorical variables, which include the country name, country code, the type of vaccine, and the source of information for each country. This is a very large dataset with a lot of information, and to make it more digestible I knew I had to narrow down the countries I was most interested in exploring. I chose to focus on eight countries that have been most active in administering COVID-19 vaccinations, which are the United States, United Kingdom, France, Germany, Canada, Israel, Chile, and Brazil.
This dataset unfortunately had many N/A values, and because I did not want to work with countries with mostly unavailable information, I used the ‘na.omit’ command in order to remove every row containing an N/A value. This condensed the dataset from 5140 rows to 1550 rows and narrowed down the list of countries I could choose from further. For my first visualization, I decided to explore the progress of daily vaccinations in the month of February. After a quick str command on the dataset, I realized that the ‘date’ column was defined as a character, which would make it harder to work with since I wanted to create a line graph showing daily vaccinations over time. I decided to use the lubridate package (loaded through pacman), and specifically its ymd (year, month, day) command, to define the dates as date values rather than text. With the use of the pipe operator and dplyr commands, I selected the specific variables I wanted to look at, both for my first visualization and my second one. Using ggplot2, I created a line graph describing the relationship over time between countries and daily vaccinations. I was intrigued by the difference between the United States and the other seven countries: according to many media articles and reporting, the U.S. was allegedly doing poorly in its vaccine distribution. Yet, in my visualization, it seemed that the U.S. was doing astronomically better than other nations. I then considered the fact that the variable I used, daily vaccinations, was not adjusted to population. The U.S. is much larger than countries like Israel and Canada, so of course, daily vaccinations would be much higher. In order to view this data more objectively and see how well the U.S. was doing compared to other countries, I chose to create a bar plot that showed the total number of vaccinations per one hundred people. I was also interested in seeing how these countries were doing more recently, so I filtered the dates to view the most current date in the dataset: March 3rd, 2021.
Unsurprisingly, the visualization allowed me to see that the U.S. was in fact not doing as well as it looked in the line graph. Israel and the U.K. were both doing much better in vaccine administration.
If it were possible, I would have loved to explore the progress of COVID-19 vaccinations in countries with fewer resources and less media attention. Unfortunately, many of these countries had too many N/A values, therefore making it very difficult to make visualizations or track any progress. I am also disappointed that I could not include China in my analysis because most of its data is unreported—I’m interested in seeing how the most populous country in the world is doing with regard to its response to COVID-19.
Sources: https://www.bbc.com/news/world-56025355 https://www.visualcapitalist.com/most-populous-countries/