Quiz 4 Getting and Cleaning Data

📚 Specialization: Data Science: Foundations using R Specialization
📖 Course: Getting and Cleaning Data
- 🧑‍🏫 Instructor: Jeffrey Leek
📆 Week 4
- Start: 2022/05/23
- Finish: 2022/05/24
📦 Github Repository: Static Document

# Checking if the subfolder already exists.
if (!dir.exists("data")) {
    
    # Creating a subfolder to store the data.
    dir.create(path = "./data")
}

Question 1

# Downloading the file "The American Community Survey".
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv",
                     destfile = "./data/acs.csv",
                     mode = "wb")

# Downloading the code book.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf",
                     destfile = "./data/code_book.pdf",
                     mode = "wb")

Importing the downloaded file into R Object.

# Loading the ACS data.
acs <- utils::read.csv(file = "./data/acs.csv")

# Performing the given expression.
strsplit(x = colnames(acs), "wgtp")[123]

## [[1]]
## [1] ""   "15"

Question 2

# Downloading the GDP data.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv",
                     destfile = "./data/gdp.csv")

# Loading GDP data.
df_gdp <- utils::read.csv(file = "./data/gdp.csv",
                          skip = 3,                   # The first rows of this database is blank
                          header = TRUE)              # Forcing the read.csv readh the first rows as header.

# First rows of GDP data.
    # CASE: github_document
    if(!knitr::is_html_output()) {
    
        # Static table using Kable Package.
        df_gdp %>%
            head() %>%
            kableExtra::kbl() %>%
            kableExtra::kable_styling()
    
    # CASE: hmtl_document
    } else {
    
        # Interactive table using DT package.
        DT::datatable(df_gdp)
    }

# Data Cleaning
df_gdp_tidy <- df_gdp %>%
    mutate(US.dollars. = sub(pattern = " ", replacement = "", x = US.dollars.)) %>%
    mutate(US.dollars. = gsub(pattern = ",", replacement = "", x = US.dollars.)) %>%
    mutate(gdp = as.numeric(US.dollars.)) %>%
    mutate(ranking = as.numeric(Ranking)) %>%
    select(-c(X.1, X.2, X.3, X.4, X.5, X.6)) %>%
    na.omit() %>%
    rename("CountryName" = Economy)

After the data manipulation, I have calculated the GDP average as follows:

# Calculating the GDP Average
mean(df_gdp_tidy$gdp, na.rm = TRUE)

## [1] 377652.4

Question 3

The pattern used to filter countries that start with “United” is: ^United.

# Readings: https://r4ds.had.co.nz/strings.html
#    ^ to match the start of the string.
#
# Solution 1:
grep(pattern = "^United", x = df_gdp_tidy$CountryName)

## [1]  1  6 32

# Solution 2:
df_gdp_tidy %>% filter(grepl(pattern = "^United", x = CountryName))

##     X Ranking          CountryName US.dollars.      gdp ranking
## 1 USA       1        United States   16244600  16244600       1
## 2 GBR       6       United Kingdom    2471784   2471784       6
## 3 ARE      32 United Arab Emirates     348595    348595      32

Both solutions have the same 3 countries, which starts with the United.

Question 4

# Downloading the GDP data.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv",
                     destfile = "./data/gdp.csv")

# Downloading the Educational data.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv",
                     destfile = "./data/education.csv")

# Loading Education data.
df_education <- utils::read.csv(file = "./data/education.csv")

# First rows os Education Data.
    # CASE: github_document
    if(!knitr::is_html_output()) {
    
        # Static table using Kable Package.
        df_education %>%
            head() %>%
            kableExtra::kbl() %>%
            kableExtra::kable_styling()
    
    # CASE: hmtl_document
    } else {
    
        # Interactive table using DT package.
        DT::datatable(df_education)
    }

# Loading GDP data.
df_gdp <- utils::read.csv(file = "./data/gdp.csv",
                          skip = 3,                   # The first rows of this database is blank
                          header = TRUE)              # Forcing the read.csv readh the first rows as header.

# CLEANING
df_gdp <- df_gdp %>%
    mutate(US.dollars. = sub(pattern = " ", replacement = "", x = US.dollars.)) %>%
    mutate(US.dollars. = gsub(pattern = ",", replacement = "", x = US.dollars.)) %>%
    mutate(US.dollars. = as.numeric(US.dollars.)) %>%
    mutate(Ranking = as.numeric(Ranking))

# CLEANING
df_gdp_tidy <- df_gdp %>%
    select(-c(X.1, X.2, X.3, X.4, X.5, X.6)) %>%
    rename(country_shortcode = X, ranking = Ranking, country = Economy, gdp = US.dollars.) %>%
    mutate(gdp = as.numeric(gdp)) %>%
    na.omit() %>%
    filter(ranking %in% 1:190)

# CLEANING
df_education_tidy <- df_education %>%
    rename(country_shortcode = CountryCode)

# Merging
df_gdp_edu <- merge(x = df_education_tidy, y = df_gdp_tidy)

In the Special.Notes column, it is possible to track when the Fiscal Year ends. I have used the expression “June 30” to find all the countries with their fiscal year ending on this date.

# Printing the countries with the fiscal year ending on 30 June (end of June).
    # CASE: github_document
    if(!knitr::is_html_output()) {
    
        # Static table using Kable Package.
        df_gdp_edu %>%
            select(country, Special.Notes) %>%
            filter(grepl(pattern = "June 30", x = Special.Notes)) %>%
            kableExtra::kbl() %>% kableExtra::kable_styling()
    
    # CASE: hmtl_document
    } else {
    
        # Interactive table using DT package.
        df_gdp_edu %>%
            select(country, Special.Notes) %>%
            filter(grepl(pattern = "June 30", x = Special.Notes)) %>%
            DT::datatable()
    }

Counting the number of countries.

# Filtering and counting the countries with fiscal year ending on June.
df_gdp_edu %>%
    select(country, Special.Notes) %>%
    filter(grepl(pattern = "June 30", x = Special.Notes)) %>%
    nrow()

## [1] 13

Question 5

Executing the given code:

# Loading the quantmod package
library(quantmod)

# Gathering data from Amazon.
amzn = getSymbols("AMZN",auto.assign=FALSE)

# Gathering the dates from each data.
sampleTimes = index(amzn)

I have converted it into a tibble dataframe to be able to use the tidyverse package.

# Converting the amzn object into Tibble.
data_q5 <- as_tibble(amzn)

# Adding the date column.
data_q5['date'] <- sampleTimes

# Filtering data only from 2012.
data_q5_2012 <- data_q5 %>%
    filter(date >= "2012-01-01" & date <= "2012-12-31")

# Showing the data gathered.
    # CASE: github_document
    if(!knitr::is_html_output()) {
    
        # Static table using Kable Package.
        data_q5_2012 %>%
            head() %>%
            kableExtra::kbl() %>%
            kableExtra::kable_styling()
    
    # CASE: hmtl_document
    } else {
    
        # Interactive table using DT package.
        DT::datatable(data_q5_2012)
    }

The number of observations in 2012:

# How many values in 2012
nrow(data_q5_2012)

## [1] 250

I have used the Lubridate package to work with a date type. In addition, I have added a column to store the wee day.

# Ensuring to show the Week days in English.s
Sys.setlocale("LC_ALL","English")

## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

# Adding a new column to store the wee days.
data_q5_2012_wdays <- data_q5_2012 %>%
    mutate(wday = lubridate::wday(x = lubridate::ymd(date), label=TRUE))

# Showing the data with the new column of week day.
    # CASE: github_document
    if(!knitr::is_html_output()) {
    
        # Static table using Kable Package.
        data_q5_2012_wdays %>%
            head() %>%
            kableExtra::kbl() %>%
            kableExtra::kable_styling()
    
    # CASE: hmtl_document
    } else {
    
        # Interactive table using DT package.
        DT::datatable(data_q5_2012_wdays)
    }

Finally, the number of Monday in 2012:

# Number of observations in 2012 on Mondays.
data_q5_2012_wdays%>%
    filter(wday == "Mon") %>%
    nrow()

## [1] 47

`Quiz 4` Getting and Cleaning Data

👨🏻‍💻 Anderson H Uyekita

Question 1

Question 2

Question 3

Question 4

Question 5