Quiz 4 Getting and Cleaning
Data# Checking if the subfolder already exists.
if (!dir.exists("data")) {
# Creating a subfolder to store the data.
dir.create(path = "./data")
}
# Downloading the file "The American Community Survey".
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv",
destfile = "./data/acs.csv",
mode = "wb")
# Downloading the code book.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf",
destfile = "./data/code_book.pdf",
mode = "wb")
Importing the downloaded file into R Object.
# Loading the ACS data.
acs <- utils::read.csv(file = "./data/acs.csv")
# Performing the given expression.
strsplit(x = colnames(acs), "wgtp")[123]
## [[1]]
## [1] "" "15"
# Downloading the GDP data.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv",
destfile = "./data/gdp.csv")
# Loading GDP data.
df_gdp <- utils::read.csv(file = "./data/gdp.csv",
skip = 3, # The first rows of this database is blank
header = TRUE) # Forcing the read.csv readh the first rows as header.
# First rows of GDP data.
# CASE: github_document
if(!knitr::is_html_output()) {
# Static table using Kable Package.
df_gdp %>%
head() %>%
kableExtra::kbl() %>%
kableExtra::kable_styling()
# CASE: hmtl_document
} else {
# Interactive table using DT package.
DT::datatable(df_gdp)
}
# Data Cleaning
df_gdp_tidy <- df_gdp %>%
mutate(US.dollars. = sub(pattern = " ", replacement = "", x = US.dollars.)) %>%
mutate(US.dollars. = gsub(pattern = ",", replacement = "", x = US.dollars.)) %>%
mutate(gdp = as.numeric(US.dollars.)) %>%
mutate(ranking = as.numeric(Ranking)) %>%
select(-c(X.1, X.2, X.3, X.4, X.5, X.6)) %>%
na.omit() %>%
rename("CountryName" = Economy)
After the data manipulation, I have calculated the GDP average as follows:
# Calculating the GDP Average
mean(df_gdp_tidy$gdp, na.rm = TRUE)
## [1] 377652.4
The pattern used to filter countries that start with βUnitedβ is:
^United.
# Readings: https://r4ds.had.co.nz/strings.html
# ^ to match the start of the string.
#
# Solution 1:
grep(pattern = "^United", x = df_gdp_tidy$CountryName)
## [1] 1 6 32
# Solution 2:
df_gdp_tidy %>% filter(grepl(pattern = "^United", x = CountryName))
## X Ranking CountryName US.dollars. gdp ranking
## 1 USA 1 United States 16244600 16244600 1
## 2 GBR 6 United Kingdom 2471784 2471784 6
## 3 ARE 32 United Arab Emirates 348595 348595 32
Both solutions have the same 3 countries, which starts with the United.
# Downloading the GDP data.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv",
destfile = "./data/gdp.csv")
# Downloading the Educational data.
utils::download.file(url = "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv",
destfile = "./data/education.csv")
# Loading Education data.
df_education <- utils::read.csv(file = "./data/education.csv")
# First rows os Education Data.
# CASE: github_document
if(!knitr::is_html_output()) {
# Static table using Kable Package.
df_education %>%
head() %>%
kableExtra::kbl() %>%
kableExtra::kable_styling()
# CASE: hmtl_document
} else {
# Interactive table using DT package.
DT::datatable(df_education)
}
# Loading GDP data.
df_gdp <- utils::read.csv(file = "./data/gdp.csv",
skip = 3, # The first rows of this database is blank
header = TRUE) # Forcing the read.csv readh the first rows as header.
# CLEANING
df_gdp <- df_gdp %>%
mutate(US.dollars. = sub(pattern = " ", replacement = "", x = US.dollars.)) %>%
mutate(US.dollars. = gsub(pattern = ",", replacement = "", x = US.dollars.)) %>%
mutate(US.dollars. = as.numeric(US.dollars.)) %>%
mutate(Ranking = as.numeric(Ranking))
# CLEANING
df_gdp_tidy <- df_gdp %>%
select(-c(X.1, X.2, X.3, X.4, X.5, X.6)) %>%
rename(country_shortcode = X, ranking = Ranking, country = Economy, gdp = US.dollars.) %>%
mutate(gdp = as.numeric(gdp)) %>%
na.omit() %>%
filter(ranking %in% 1:190)
# CLEANING
df_education_tidy <- df_education %>%
rename(country_shortcode = CountryCode)
# Merging
df_gdp_edu <- merge(x = df_education_tidy, y = df_gdp_tidy)
In the Special.Notes column, it is possible to track
when the Fiscal Year ends. I have used the expression βJune 30β to find
all the countries with their fiscal year ending on this date.
# Printing the countries with the fiscal year ending on 30 June (end of June).
# CASE: github_document
if(!knitr::is_html_output()) {
# Static table using Kable Package.
df_gdp_edu %>%
select(country, Special.Notes) %>%
filter(grepl(pattern = "June 30", x = Special.Notes)) %>%
kableExtra::kbl() %>% kableExtra::kable_styling()
# CASE: hmtl_document
} else {
# Interactive table using DT package.
df_gdp_edu %>%
select(country, Special.Notes) %>%
filter(grepl(pattern = "June 30", x = Special.Notes)) %>%
DT::datatable()
}
Counting the number of countries.
# Filtering and counting the countries with fiscal year ending on June.
df_gdp_edu %>%
select(country, Special.Notes) %>%
filter(grepl(pattern = "June 30", x = Special.Notes)) %>%
nrow()
## [1] 13
Executing the given code:
# Loading the quantmod package
library(quantmod)
# Gathering data from Amazon.
amzn = getSymbols("AMZN",auto.assign=FALSE)
# Gathering the dates from each data.
sampleTimes = index(amzn)
I have converted it into a tibble dataframe to be able to use the tidyverse package.
# Converting the amzn object into Tibble.
data_q5 <- as_tibble(amzn)
# Adding the date column.
data_q5['date'] <- sampleTimes
# Filtering data only from 2012.
data_q5_2012 <- data_q5 %>%
filter(date >= "2012-01-01" & date <= "2012-12-31")
# Showing the data gathered.
# CASE: github_document
if(!knitr::is_html_output()) {
# Static table using Kable Package.
data_q5_2012 %>%
head() %>%
kableExtra::kbl() %>%
kableExtra::kable_styling()
# CASE: hmtl_document
} else {
# Interactive table using DT package.
DT::datatable(data_q5_2012)
}
The number of observations in 2012:
# How many values in 2012
nrow(data_q5_2012)
## [1] 250
I have used the Lubridate package to work with a date type. In addition, I have added a column to store the wee day.
# Ensuring to show the Week days in English.s
Sys.setlocale("LC_ALL","English")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
# Adding a new column to store the wee days.
data_q5_2012_wdays <- data_q5_2012 %>%
mutate(wday = lubridate::wday(x = lubridate::ymd(date), label=TRUE))
# Showing the data with the new column of week day.
# CASE: github_document
if(!knitr::is_html_output()) {
# Static table using Kable Package.
data_q5_2012_wdays %>%
head() %>%
kableExtra::kbl() %>%
kableExtra::kable_styling()
# CASE: hmtl_document
} else {
# Interactive table using DT package.
DT::datatable(data_q5_2012_wdays)
}
Finally, the number of Monday in 2012:
# Number of observations in 2012 on Mondays.
data_q5_2012_wdays%>%
filter(wday == "Mon") %>%
nrow()
## [1] 47