Data Wrangling Assessment Task 3: Dataset challenge

Required packages

library(dplyr)
library(tidyr) 
library(knitr) 
library(magrittr)
library(lubridate)
library(ggplot2)
library(forecast)

Data, Tidy & Manipulate Data I (Steps 1, 2, 6 - Get, Tidy and Merge Data)

These two data sets contain information related to COVID-19 in Australia, retrieved from covid19data.com.au.

The first data set contains the daily COVID-19 case counts.

Variables include:

Date - Date from 1/08/2021 to 12/10/2021
State - The Australian state or territory
Cases - The number of new daily Covid19 confirmed cases

Source

Covid19data, 2021, Cases: States and Territories, Covid19data, viewed 13 October 2021, <https://www.covid19data.com.au/states-and-territories>

The second data set contains the number of double-vaccinated people.

Variables include:

Date - Date from 1/08/2021 to 12/10/2021
State - The Australian state or territory
DoubleVaxxed - The total number of double vaccinated people

Source

Covid19data, 2021, COVID-19 Vaccinations in Australia, Covid19data, viewed 13 October 2021, <https://www.covid19data.com.au/vaccines>

# Get, tidy and merge the two data sets.
#
# Both CSV files were downloaded manually from covid19data and are read from
# the working directory. Each file is wide: one row per date and one column
# per state.

# Import --------------------------------------------------------------------
# The first header used to come through as `ï..Date`, which is what a UTF-8
# byte-order mark looks like when it is decoded with the wrong encoding.
# Declaring the encoding strips the mark; naming the first column explicitly
# means the rest of the script never depends on how the header was decoded.
cases <- read.csv("daily cases.csv", fileEncoding = "UTF-8-BOM")
names(cases)[1] <- "Date"
head(cases)

doses <- read.csv("2 Doses.csv", fileEncoding = "UTF-8-BOM")
names(doses)[1] <- "Date"
head(doses)

# Tidy ----------------------------------------------------------------------
# Tidy data principles: each variable has its own column, each observation
# its own row and each value its own cell. The state column headers are
# values, not variable names, so both tables are converted from wide to long.
cases <- cases %>%
  pivot_longer(-Date, names_to = "State", values_to = "Cases")

# The state columns of `doses` do not all share one class (counts written
# with comma separators are read as text), and pivot_longer() needs a single
# class for the value column. Strip the commas and convert every state
# column to numeric before reshaping.
doses <- doses %>%
  mutate(across(-Date, ~ as.numeric(gsub(",", "", .x)))) %>%
  pivot_longer(-Date, names_to = "State", values_to = "DoubleVaxxed")

# Merge ---------------------------------------------------------------------
# Join on the shared keys rather than binding columns by position: a
# positional bind silently pairs the wrong rows if the two files differ in
# row or column order. Joining also avoids the duplicated Date/State columns
# that then had to be dropped and renamed. Row order follows `cases`.
CovidData <- left_join(cases, doses, by = c("Date", "State"))
head(CovidData)

CovidData

Understand (Steps 3, 4, 5 - Inspect, convert variable classes)

# Inspect the structure of the merged data before converting anything.
str(CovidData)

## tibble [584 x 4] (S3: tbl_df/tbl/data.frame)
##  $ Date        : chr [1:584] "1/08/2021" "1/08/2021" "1/08/2021" "1/08/2021" ...
##  $ State       : chr [1:584] "NSW" "VIC" "QLD" "SA" ...
##  $ Cases       : int [1:584] 241 4 9 3 0 0 0 0 209 2 ...
##  $ DoubleVaxxed: num [1:584] 1252304 1038227 753447 275925 340905 ...

# Date and State are stored as text and Cases as integer. Convert all four
# columns in one step: Date becomes a Date (day/month/year input), State a
# factor, and both count columns plain numerics.
CovidData <- CovidData %>%
  mutate(
    Date = as.Date(Date, format = "%d/%m/%Y"),
    State = as.factor(State),
    Cases = as.numeric(Cases),
    DoubleVaxxed = as.numeric(DoubleVaxxed)
  )
levels(CovidData$State)

## [1] "ACT" "NSW" "NT"  "QLD" "SA"  "TAS" "VIC" "WA"

# Confirm the class of every column after the conversions.
lapply(CovidData, class)

## $Date
## [1] "Date"
## 
## $State
## [1] "factor"
## 
## $Cases
## [1] "numeric"
## 
## $DoubleVaxxed
## [1] "numeric"

CovidData

Tidy & Manipulate Data II (STEP 7 - Create new variable)

# Create a new variable: the percentage of each state's population that is
# double vaccinated. This needs the total population of each state.
# (Source: Australian Bureau of Statistics, National, state and territory
# population March 2021)

# State codes and their populations, held as a list of two parallel vectors.
mylist <- list(
  c("NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"),
  c(8176400, 6648600, 5206400, 1771700, 2675800, 542000, 247000, 431800)
)
mylist

## [[1]]
## [1] "NSW" "VIC" "QLD" "SA"  "WA"  "TAS" "NT"  "ACT"
## 
## [[2]]
## [1] 8176400 6648600 5206400 1771700 2675800  542000  247000  431800

# Convert to a lookup data frame with one row per state.
mydf <- data.frame(
  State = mylist[[1]],
  Population = mylist[[2]],
  stringsAsFactors = FALSE
)

# CovidData$State is a factor at this point. Joining a factor key to a
# character key coerces the result to character, which would undo the
# earlier conversion, so give the lookup key the same type and levels.
if (is.factor(CovidData$State)) {
  mydf$State <- factor(mydf$State, levels = levels(CovidData$State))
}

# Join to the main data frame. Naming the key makes the join explicit
# instead of relying on whichever column names happen to be shared.
CovidData <- left_join(CovidData, mydf, by = "State")
CovidData

# Calculate percentage of people who are double vaccinated, per state.
CovidData <- CovidData %>%
  mutate(Percentage_Fully_Vaxxed = (DoubleVaxxed / Population) * 100)
head(CovidData)

Scan I (Step 8 - Missing Values)

# Count missing values per column. This shows which variables have missing
# values and how many.
colSums(is.na(CovidData))

##                    Date                   State                   Cases 
##                       0                       0                       3 
##            DoubleVaxxed              Population Percentage_Fully_Vaxxed 
##                       0                       0                       0

# Locate the rows with missing Cases. They all fall on 2021-10-12.
missing_case_rows <- which(is.na(CovidData$Cases))
missing_case_rows

## [1] 580 581 583

CovidData[missing_case_rows, ]

# The most likely reason for the missing values is that some states had not
# yet published figures for the most recent day.
# To deal with them, drop every row for any date that has a missing Cases
# value (here only 2021-10-12), so each remaining date has all states. The
# rows are selected by date rather than by position, so this still works if
# the data is re-downloaded or re-ordered.
incomplete_dates <- unique(CovidData$Date[missing_case_rows])
CovidData <- CovidData[!(CovidData$Date %in% incomplete_dates), ]
CovidData

Scan II (Step 9 - Outliers)

# Scan for outliers in Cases and DoubleVaxxed.

# Histograms of Cases and DoubleVaxxed give a visual idea of the
# distribution. Overall the data is not normally distributed, so the z-score
# method is not used. `bins = 30` is the ggplot2 default, stated explicitly
# so geom_histogram() does not emit its "pick a better value" message.
ggplot(CovidData, aes(x = Cases)) +
  geom_histogram(bins = 30, color = "grey30", fill = "white") +
  facet_grid(State ~ ., scales = "free")

ggplot(CovidData, aes(x = DoubleVaxxed)) +
  geom_histogram(bins = 30, color = "grey30", fill = "white") +
  facet_grid(State ~ ., scales = "free")

# Boxplots are used to scan for outliers.
# No outliers are visible in DoubleVaxxed when grouped by state.
ggplot(CovidData) +
  aes(x = State, y = DoubleVaxxed) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

# Boxplot of Cases as a whole (all states pooled). The return value is kept
# so the points drawn as outliers are available in `outliers`.
bp <- boxplot(CovidData$Cases)
outliers <- bp$out

# Tukey fences: values more than 1.5 * IQR beyond the quartiles are treated
# as outliers. quantile() is used instead of indexing summary() by position,
# which is easy to get wrong and returned rounded values in R < 4.0.
case_quartiles <- quantile(
  CovidData$Cases,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
q1 <- case_quartiles[1]
q3 <- case_quartiles[2]
iqr <- q3 - q1
lower_fence <- q1 - (1.5 * iqr)
upper_fence <- q3 + (1.5 * iqr)
up_outliers <- which(CovidData$Cases > upper_fence)
low_outliers <- which(CovidData$Cases < lower_fence)

# Impute the outliers with the mean of Cases. The mean is computed once,
# before any value is replaced, so upper and lower outliers receive the same
# value regardless of the order in which they are replaced.
cases_mean <- mean(CovidData$Cases, na.rm = TRUE)
CovidData$Cases[c(up_outliers, low_outliers)] <- cases_mean

# Boxplot of Cases after imputation.
boxplot(CovidData$Cases)

# No outliers are visible in the time series line graph of DoubleVaxxed.
CovidData %>%
  ggplot(aes(x = Date, y = DoubleVaxxed, group = State, color = State)) +
  geom_line() +
  ggtitle("Total number of Double Vaccinated People per State") +
  ylab("No. of People Double Vaccinated") +
  theme_minimal()

# Outliers here could be due to data entry errors, which can arise from human
# error during collection or recording.
# Outliers here could also be due to sampling error: the new daily cases only
# come from the small part of the population that actually gets tested.
# This could also be why the DoubleVaxxed variable has no outliers; it is not
# subject to the sampling error described above.

Transform (Step 10 - Data transformation)

# Transform DoubleVaxxed with a Box-Cox transformation, with the aim of
# reducing skewness and bringing the distribution closer to normal.

# Histogram of the raw data, for comparison.
hist(CovidData$DoubleVaxxed)

# lambda = "auto" lets the forecast package choose the Box-Cox parameter.
boxcox_vaxxed <- CovidData$DoubleVaxxed %>%
  BoxCox(lambda = "auto")

# The chosen lambda is stored as an attribute on the transformed vector.
attr(boxcox_vaxxed, "lambda")

## [1] 0.05255673

# Histogram of the transformed data.
hist(boxcox_vaxxed)