knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE)
library(ggplot2)
library(dplyr)
library(see)
library(data.table)
library(correlation)
library(tidycensus)
library(kableExtra)
library(reactable)
train_data <- fread('train.csv')
test_data <- fread('test.csv')

This R Markdown-produced HTML (created with the RStudio IDE) is the final requirement for the Coursera.org online course "Reproducible Templates for Analysis and Dissemination," taught by Dr. Melinda Higgins, who at the time the course material was recorded was a Research Professor/Senior Biostatistician in the Emory University School of Nursing.
The Course-Required Elements section presents the elements requested in the final assignment description.
The Kaggle section presents exploratory data analysis (EDA) and code to create predictions for the Kaggle.com Microbusiness Density competition sponsored by the website-hosting company GoDaddy.
A few notes about the Kaggle.com Microbusiness Density competition sponsored by the website-hosting company GoDaddy.
More engagement with this competition appears in the Kaggle section.
Totally unrelated to the Kaggle competition but required for this course. :)
Let’s look at the correlation between temperature and pressure in the built-in pressure dataset using the cor_test function of the correlation R package, aided by the see R package.
result <- cor_test(pressure, "temperature", "pressure")
plot(result)

kable(head(cars)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    ## latex_options applies only to PDF output; scale_down avoids an overly wide table there
    latex_options = "scale_down",
    fixed_thead = list(enabled = TRUE, background = "orange")
  )

| speed | dist |
|---|---|
| 4 | 2 |
| 4 | 10 |
| 7 | 4 |
| 7 | 22 |
| 8 | 16 |
| 9 | 10 |
This R Markdown document uses the flatly theme and the haddock highlight style.
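For reference, a minimal sketch of how those options are declared in the YAML header, assuming the standard html_document output format:

```yaml
output:
  html_document:
    theme: flatly
    highlight: haddock
```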
Some basics about the Kaggle GoDaddy competition, which involves predicting microbusiness density per United States county, appear above in the Kaggle section.
Using the most recent (i.e., last) microbusiness density in the training set as the prediction for the test dataset yields a public leaderboard score of 1.0936, which is not easy to beat.
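As a rough sketch (not the exact submission code), that last-value baseline could be built like this, reusing the column names that appear elsewhere in this document (cfips, first_day_of_month, microbusiness_density, row_id):

```r
## Baseline sketch: repeat each county's latest training value for every test month
baseline <- train_data %>%
  group_by(cfips) %>%
  ## Keep only the most recent training month per county
  slice(which.max(first_day_of_month)) %>%
  ungroup() %>%
  select(cfips, last_value = microbusiness_density)

baseline_submission <- test_data %>%
  left_join(baseline, by = "cfips") %>%
  mutate(microbusiness_density = last_value) %>%
  select(row_id, microbusiness_density)
```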
To try something novel, let's use the last microbusiness density except when the next-to-last microbusiness density (that is, two months before the first test-data month) was more than 25 percent different from the last microbusiness density. In such cases, let's make the prediction the next-to-last microbusiness density.
By the way, that cutoff is used throughout this code via a parameter set in the YAML header at the top of the document.
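A minimal sketch of that parameter in the YAML header, assuming the 25 percent cutoff described above; the value is then available in code chunks as params$cutoff:

```yaml
params:
  cutoff: 25
```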
This code uses many dplyr package functions.
Recalling that simply repeating the last microbusiness density produced a public leaderboard score of 1.0936, the tested adjustment yielded a worse score of 1.2013.
Kaggle Submissions - hard to beat the latest microbusiness density
last_train_data <- train_data %>%
group_by(county, state) %>%
arrange(first_day_of_month) %>%
## lag1 is one month prior to the current month per county
mutate(lag1 =
lag(microbusiness_density, 1L, order_by = first_day_of_month)) %>%
## now take the most recent month, so lag 1 is the next-to-last month
slice(which.max(first_day_of_month)) %>%
ungroup() %>%
rename(last_value = microbusiness_density,
next_to_last_value = lag1) %>%
select(cfips, county, state, next_to_last_value, last_value)
last_train_data_diff <- last_train_data %>%
  mutate(last_vs_next_to_last_difference_pct =
           round(100 * abs(last_value - next_to_last_value) / last_value, 1)) %>%
  select(cfips, county, state, next_to_last_value, last_value,
         last_vs_next_to_last_difference_pct)
test_data2 <-
  left_join(test_data, last_train_data_diff) %>%
  mutate(microbusiness_density = case_when(
    ## Difference above the cutoff (params$cutoff percent): use the next-to-last value
    last_vs_next_to_last_difference_pct > params$cutoff ~
      ## An average, (last_value + next_to_last_value) / 2, was also considered
      next_to_last_value,
    ## Otherwise, use the last value
    TRUE ~ last_value))
test_data2_submission <- test_data2 %>%
select(row_id, microbusiness_density)
## Write out the CSV for submission
fwrite(test_data2_submission,
       paste0(
         'last_value_except_next_to_last_when_GT_',
         params$cutoff,
         '_pct_diff.csv'))

Let's use the tidycensus R package to get the population and population density per county, and see what we observe about the counties whose prediction was something other than their most recent microbusiness density repeated.
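Note that get_acs calls the U.S. Census Bureau API, which requires a (free) API key. A minimal one-time setup sketch, with the key value as a placeholder:

```r
## One-time setup: store a Census API key for tidycensus to use
library(tidycensus)
census_api_key("YOUR_CENSUS_API_KEY", install = TRUE)
```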
acsdata_county_pop0 <-
get_acs(
geography = 'county',
## 2021 5-year American Community Survey, latest available
year = 2021,
variables = c(Total_Population = "B02001_001"),
output = "wide")
acsdata_county_geo <- get_acs(geography = 'county',
year = 2020,
geometry = TRUE,
table = 'B01003',
keep_geo_vars = TRUE,
cache_table = TRUE)
acsdata_county_pop <-
  left_join(acsdata_county_pop0, acsdata_county_geo %>%
              select(GEOID, ALAND)) %>%
  ## ALAND is land area in square meters; multiplying by 1609 * 1609
  ## (square meters per square mile, with 1609 m ~ 1 mile) converts
  ## persons per square meter to persons per square mile
  mutate(Population_Density_per_square_mile =
           round((Total_PopulationE / ALAND) * (1609 * 1609), 2)) %>%
  ## The trailing E marks the ACS estimate (M would be the margin of error)
  rename(Total_Population = Total_PopulationE,
         cfips = GEOID) %>%
  ## Only keep the population estimate (not the margin of error)
  select(NAME, cfips, Total_Population, Population_Density_per_square_mile)
fwrite(acsdata_county_pop, "acsdata_county_pop.csv")

Let's look at the correlation between the U.S. Census (2021 ACS 5-year) total population, the latest microbusiness density, and the percentage change versus the next-to-last microbusiness density. Again, we use the cor_test function of the correlation R package, aided by the see R package.
acsdata_county_pop <- fread("acsdata_county_pop.csv")
test_data2_sub <- test_data2 %>%
mutate(NAME = paste0(county, ", ", state)) %>%
distinct(NAME, .keep_all = TRUE) %>%
select(NAME, next_to_last_value, last_value,
last_vs_next_to_last_difference_pct)
pct_diff_last_vs_next_to_last_with_census_data <-
left_join(test_data2_sub,
acsdata_county_pop) %>%
select(NAME, Total_Population, Population_Density_per_square_mile,
next_to_last_value, last_value,
last_vs_next_to_last_difference_pct) %>%
arrange(NAME)
result1 <- cor_test(pct_diff_last_vs_next_to_last_with_census_data,
                    "last_value", "next_to_last_value")
plot(result1)

result2 <- cor_test(pct_diff_last_vs_next_to_last_with_census_data,
                    "last_value", "Total_Population")
plot(result2)

result3 <- cor_test(pct_diff_last_vs_next_to_last_with_census_data,
                    "last_value", "last_vs_next_to_last_difference_pct")
plot(result3)

result4 <- cor_test(pct_diff_last_vs_next_to_last_with_census_data,
                    "Total_Population", "last_vs_next_to_last_difference_pct")
plot(result4)

Table using the reactable R package.
altered_2_pct_diff_last_vs_next_to_last_with_census_data <-
pct_diff_last_vs_next_to_last_with_census_data %>%
dplyr::filter(last_vs_next_to_last_difference_pct > params$cutoff) %>%
arrange(NAME)
## reactable usage for a more attractive table
reactable(
  altered_2_pct_diff_last_vs_next_to_last_with_census_data,
  compact = TRUE,       # minimum row height
  filterable = TRUE,    # individual column filters
  striped = TRUE,       # banded rows
  resizable = TRUE,     # resizable column widths
  defaultPageSize = 50, # rows shown per page
  ## showPageSizeOptions is needed for the pageSizeOptions choices to appear
  showPageSizeOptions = TRUE,
  pageSizeOptions = c(25, 50, 100)
)

About Me

This image features some of my (Rick Pack) interests. I run in masters track meets, focusing on the 400m sprint, for the Atlanta Track Club. In one image, I took a picture with fellow developer (broadly defined) Jeff Chen, who is among the fastest masters track sprinters. The guy in yellow is William Rhoad, age 80+, who is strong in many events and often runs barefoot!
I am from Louisville, Kentucky; attended undergrad at American University and then the University of Kentucky; and hold a master's degree in applied statistics from Penn State University. I live and work in the Raleigh-Durham, North Carolina area.
I contributed to the bivariate choropleth R package biscale, hence the two-color heatmap.
The dancer is Sara (@ssstrellez on Instagram), and this video amazes me. I like improvisational dancing with break-dancing elements.
Finally, I eat a lot of parsley and drink a lot of yerba maté tea.