if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
ipumsr,
tidyverse,
data.table,
caret,
randomForest,
doParallel,
sf,
tmap,
viridis,
plotly)
Our first step in addressing demand is to look at the ages. To look at the ages we will use census data that includes the distribution and locations of ages throughout the united states.
setwd("C:/Users/sc363/OneDrive/Work Items/Workspace/CareBoard/")
age_data <- fread("./00_CENSUSAges.csv")
The next step is to take our assumptions about the amount of care for these age groups and apply them. Essentially for each age group in our data we will multiply the count of that group by the number of hours needed to provide care for this group.We will then create a new variable called DEMAND which is the total amount of Demand within the society.
One thing that we can do to help with our assumptions and make sure that they are somewhat justified is to look at the distribution of care giving within houses of different ages. For instance, let’s look at how much childcare occurs in households of diffferent child ages. To do this we need to use the Timne Use Data.
atus <- fread("./03_ATUSdata.csv") %>%
filter(nchild == 1)
childa <- atus %>%
filter(child_age == "Under Five") %>%
filter(CHILDCARE == 1)
childa$SCC_ALL_LN <- 0
childb <- atus %>%
filter(child_age == "Under Five") %>%
filter(CHILDCARE == 0 & SCC_ALL_LN > 0)
childb$DURATION <- 0
child <- rbind(childa, childb)
child$Childcare_Time <- (child$DURATION + child$SCC_ALL_LN)/60
child <- child %>%
group_by(CASEID) %>%
summarise(Childcare_Time = sum(Childcare_Time, na.rm = TRUE))
# Create a distribution plot of Childcare_Time
ggplot(child, aes(x = Childcare_Time)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Childcare Time",
x = "Childcare Time",
y = "Frequency") +
theme_minimal()
Assuming 8-12 hours of sleep per night for children in this age group,
it can be seen that it is not unreasonable to assume 24 hours of care is
needed for this group. The bimodal distribution in this is interesting
but when analyzing it comes to show differences between those who are
employed and those who stay at home with the child. For those who stay
at home with the child, essentially their entire day is spent in care
activities while those who go to work spend their entire time when at
home.
atus <- fread("./03_ATUSdata.csv") %>%
filter(nchild == 1)
childa <- atus %>%
filter(child_age == "Five_Eleven") %>%
filter(CHILDCARE == 1)
childa$SCC_ALL_LN <- 0
childb <- atus %>%
filter(child_age == "Five_Eleven") %>%
filter(CHILDCARE == 0 & SCC_ALL_LN > 0)
childb$DURATION <- 0
child <- rbind(childa, childb)
child$Childcare_Time <- (child$DURATION + child$SCC_ALL_LN)/60
child <- child %>%
group_by(CASEID) %>%
summarise(Childcare_Time = sum(Childcare_Time, na.rm = TRUE))
# Create a distribution plot of Childcare_Time
ggplot(child, aes(x = Childcare_Time)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Childcare Time",
x = "Childcare Time",
y = "Frequency") +
theme_minimal()
For those in the age catagory of five-11 we see a similar distribution as above. The main difference in this catagory compared to the other one likely is due to the amount of time children in this age catagory now spend at school, or a formal institution outside the house. Nonetheless we can see thatin days where the child is home, such as during summer break, there is an entirety of a day providing care activities to them. Thus the assumption of 24 hours of care needed likely holds true for this group as well.
This is variable is significantly harder to measure. The issue we have is that secondary activity time only is asked for children under the age of 13. as such children in their teens aren’t included as needing secondary child care. This is likely a flaweed assumption but means that we only have access to the normal child care variable.
atus <- fread("./03_ATUSdata.csv") %>%
filter(nchild == 1)
childa <- atus %>%
filter(child_age == "Twelve_Eighteen") %>%
filter(CHILDCARE == 1)
childa$SCC_ALL_LN <- 0
childb <- atus %>%
filter(child_age == "Twelve_Eighteen") %>%
filter(CHILDCARE == 0 & SCC_ALL_LN > 0)
childb$DURATION <- 0
child <- rbind(childa, childb)
child$Childcare_Time <- (child$DURATION + child$SCC_ALL_LN)/60
child <- child %>%
group_by(CASEID) %>%
summarise(Childcare_Time = sum(Childcare_Time, na.rm = TRUE))
# Create a distribution plot of Childcare_Time
ggplot(child, aes(x = Childcare_Time)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Childcare Time",
x = "Childcare Time",
y = "Frequency") +
theme_minimal()
We see a very different distribution for these children, but this is likely due to the exclusion of secondary childcare activities. This is especially problematic because in this group, the majority of childcare activities are likely secondary. Gone are the days of engaging in activities such as “changing diapers” and now are the days of watching tv while the child plays or does other activities independently in their room.
Regardless, it should be admited that there is likely some level of independence associated in this group with children able to move around outside the hose independently (and some times even drive). Thus for our assumption we’ll give them a bit lower than 24 hours of care needed, perhaps we’ll go as low as 16 which provides this group with up to 8 hours of independence a day!
median(child$Childcare_Time)
## [1] 1.416667
Now we have done some analysis of childcare, a question is what about for the care of prime age adults? This catagory will likely fall predomenatly in the household care catagory. To start let’s look at single individuals in the prime age and see how much hours they spend on household care (This likely represents a baseline for how much a single person needs in this catagory.)
atus <- fread("./03_ATUSdata.csv") %>%
filter(nchild == 0 & marst == "Single-Never-Married" & HOUSEHOLDCARE == 1)
adult <- atus %>%
group_by(CASEID) %>%
summarise(Household_Time = sum(DURATION ,na.rm = TRUE)/60)
# Create a distribution plot of Childcare_Time
ggplot(adult, aes(x = Household_Time)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Household Care for Single individuals",
x = "Household Care Time",
y = "Frequency") +
theme_minimal()
What is the median in this?
median(adult$Household_Time)
## [1] 1.333333
about 1.3 hours, so not a huge amount. Let’s see how this changes for those who are married.
atus <- fread("./03_ATUSdata.csv") %>%
filter(nchild == 0 & marst == "Married" & HOUSEHOLDCARE == 1)
adult <- atus %>%
group_by(CASEID) %>%
summarise(Household_Time = sum(DURATION ,na.rm = TRUE)/60)
# Create a distribution plot of Childcare_Time
ggplot(adult, aes(x = Household_Time)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Household Care for Single individuals",
x = "Household Care Time",
y = "Frequency") +
theme_minimal()
As can be seen, when there are two people living in a house together, the amount of household care needed does in fact go up but not by a tremendous amount with the main difference being the reduction in people doing 0 household care.
median(adult$Household_Time)
## [1] 2.366667
I think for now, it’s a conservative estimate to thus say that prime age individuals need 2 hours of care per day, so let’s include this in the analysis. I then blend the groups between 18 and 25 which is where prime age starts, to show a steady decrease in care
Finally, let’s take a look at the distribution of time spent on ElderCare activities. We need to better refine this over time because at this moment this is time spent on eldercare activities for all individuals, and not able to be distributed by the age of the oldest people in the house.
atus <- fread("./03_ATUSdata.csv") %>%
filter(ELDERCARE == 1)
adult <- atus %>%
group_by(CASEID) %>%
summarise(Eldercare_Time = sum(DURATION ,na.rm = TRUE)/60)
# Create a distribution plot of Childcare_Time
ggplot(adult, aes(x = Eldercare_Time)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Household Care for Single individuals",
x = "Elder Care Time",
y = "Frequency") +
theme_minimal()
As previously mentioned, this distribution alone isn’t very useful, but perhaps it can be refined over time. For now, let’s focus on our own assumptions of eldercare. I assume those over the age of 85 are generally in need of a lot of care, let’s say 20 hours. Let’s then blend this catagory in with the time for prime age leading to a steady and consistent increase to this time value.
The code below uses the assumptions we’ve generated wo create a value of demand for each different age group throughout the country.
age_data <- age_data %>%
mutate(
child_under_five = CN5AA2010*24,
child_five_nine = CN5AB2010*24,
child_ten_fourteen = CN5AC2010*20,
child_fifteen_seventeen = CN5AD2010*16,
adult_eighteen_nineteen = CN5AE2010*10,
adult_twenty = CN5AF2010*8,
adult_twentyone = CN5AG2010*6,
adult_twentytwo_twentyfour = CN5AH2010*4,
adult_twentyfive_twentynine = CN5AI2010*2,
adult_thirty_thirtyfour = CN5AJ2010*2,
adult_thirtyfive_thirtynine = CN5AK2010*2,
adult_forty_fortyfour = CN5AL2010*2,
adult_fortyfive_fortynine = CN5AM2010*2,
adult_fifty_fiftyfour = CN5AN2010*2,
adult_fiftyfive_fiftynine = CN5AO2010*3,
adult_sixty_sixtyone = CN5AP2010*3,
adult_sixty_two_sixtyfour = CN5AQ2010*4,
adult_sixtyfive_sixtysix = CN5AR2010*4,
adult_sixty_seven_sixtynine = CN5AS2010*5,
adult_seventy_seventyfour = CN5AT2010*6,
adult_seventyfive_seventynine = CN5AU2010*12,
adult_eighty_eightyfour = CN5AV2010*16,
adult_eightyfive_over = CN5AW2010*20,
DEMAND = child_under_five + child_five_nine + child_ten_fourteen + child_fifteen_seventeen +
adult_eighteen_nineteen + adult_twenty + adult_twentyone + adult_twentytwo_twentyfour +
adult_twentyfive_twentynine + adult_thirty_thirtyfour + adult_thirtyfive_thirtynine +
adult_forty_fortyfour + adult_fortyfive_fortynine + adult_fifty_fiftyfour +
adult_fiftyfive_fiftynine + adult_sixty_sixtyone + adult_sixty_two_sixtyfour +
adult_sixtyfive_sixtysix + adult_sixty_seven_sixtynine + adult_seventy_seventyfour +
adult_seventyfive_seventynine + adult_eighty_eightyfour + adult_eightyfive_over
)
Let’s generate a general bar chart that we can use to see how demand is distributed by age group.
# Summarize the data by calculating the sum of each age group column
age_data_sums <- age_data %>%
summarise(across(starts_with("child_") | starts_with("adult_"), sum, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(starts_with("child_") | starts_with("adult_"), sum,
## na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
# Reshape the summarized data into a long format
age_data_long_sums <- age_data_sums %>%
pivot_longer(cols = everything(),
names_to = "age_group",
values_to = "total_count")
# Ensure the age_group factor levels follow the original dataset order
age_data_long_sums$age_group <- factor(age_data_long_sums$age_group, levels = age_data_long_sums$age_group)
# Create a simple bar chart with the original order
ggplot(age_data_long_sums, aes(x = age_group, y = total_count)) +
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(title = "Total Demand by Age Group",
x = "Age Group",
y = "Total Demand") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
Lets now work on adding an intensity dimension to this. First thing we need to do is determine what we are going to consider a health characteristics that increases intensity. To do this we will use our NHIS data.
nhis <- fread("./03_NHISdata.csv")
# Ensure the health column is numeric
nhis$HEALTH <- as.numeric(nhis$HEALTH)
# Subset out observations where health is 7 or 9
nhis <- nhis %>% filter(!HEALTH %in% c(7, 9))
# Define age groups and categorize
nhis <- nhis %>%
mutate(age_group = cut(AGE,
breaks = c(-Inf, 5, 10, 15, 18, 20, 21, 22, 25, 30, 35, 40, 45, 50, 55, 60, 62, 65, 67, 70, 75, 80, 85, Inf),
labels = c("Under 5", "5-9", "10-14", "15-17", "18-19", "20", "21", "22-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54", "55-59", "60-61", "62-64", "65-66", "67-69", "70-74", "75-79", "80-84", "85+"),
right = FALSE))
# Calculate average health score for each age group
average_health_by_age <- nhis %>%
group_by(age_group) %>%
summarize(average_health = median(HEALTH, na.rm = TRUE))
health <- c(average_health_by_age$average_health)
health <- c(rep(1, 4), health)
age_data_long_sums$health <- health
ggplot(age_data_long_sums, aes(x = age_group, y = total_count, fill = as.factor(health))) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
scale_fill_manual(values = c("1" = "#FF9999", "2" = "#66B2FF", "3" = "#99FF99"),
name = "Health Level",
labels = c("1" = "No Info", "2" = "Medium", "3" = "Low")) +
labs(title = "Total Count of Demand hours by Age Group and Health Level",
x = "Age Group",
y = "Total Count of People") +
theme_minimal(base_size = 15) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top")
We do see that there is a point, around the age of 60, where people’s health starts to decrease. We should keep this in mind. One thing we might want to do though is evaluate this in a bit more detail to understand the count of hours dedicated to each of these groups. This is a bit more complicated to do but let’s take a crack at it.
age_hp <- nhis %>%
group_by(age_group, health) %>%
summarize(total_SAMPWEIGHT = sum(SAMPWEIGHT, na.rm = TRUE))
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
age_hp <- nhis %>%
group_by(age_group, health) %>%
summarize(total_SAMPWEIGHT = sum(SAMPWEIGHT, na.rm = TRUE)) %>%
mutate(
total_SAMPWEIGHT = case_when(
health == "Excellent" ~ total_SAMPWEIGHT * 1,
health == "Very Good" ~ total_SAMPWEIGHT * 1,
health == "Good" ~ total_SAMPWEIGHT * 2,
health == "Fair" ~ total_SAMPWEIGHT * 3,
health == "Poor" ~ total_SAMPWEIGHT * 5
)
)
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
age_hp$health <- factor(age_hp$health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))
Lets start with a barplot
# Create a facet grid
ggplot(age_hp, aes(x = age_group, y = total_SAMPWEIGHT)) +
geom_bar(stat = "identity", aes(fill = age_group)) +
facet_wrap(~health) +
labs(title = "Total SAMPWEIGHT by Age Group Faceted by Health Status",
x = "Age Group",
y = "Total SAMPWEIGHT") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Lets do a 3d scatterplot
# Group by age_group and health, summarize total SAMPWEIGHT, and then apply multipliers based on health category
age_hp <- nhis %>%
group_by(age_group, health) %>%
summarize(total_SAMPWEIGHT = sum(SAMPWEIGHT, na.rm = TRUE)) %>%
mutate(
total_SAMPWEIGHT = case_when(
health == "Excellent" ~ total_SAMPWEIGHT * 1,
health == "Very Good" ~ total_SAMPWEIGHT * 1,
health == "Good" ~ total_SAMPWEIGHT * 2,
health == "Fair" ~ total_SAMPWEIGHT * 3,
health == "Poor" ~ total_SAMPWEIGHT * 5
)
)
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
# Reorder health factor levels
age_hp$health <- factor(age_hp$health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))
# Create a 3D bar plot using scatter3d to simulate bars
fig_bar <- plot_ly(
age_hp,
x = ~age_group,
y = ~as.numeric(factor(health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))), # Numeric mapping for ordered levels
z = ~total_SAMPWEIGHT,
type = "scatter3d",
mode = "markers",
marker = list(size = 10, color = ~total_SAMPWEIGHT, colorscale = "Blues", showscale = TRUE)
)
# Customize layout with explicit ticktext to display proper health levels
fig_bar <- fig_bar %>%
layout(
scene = list(
xaxis = list(title = "Age Group"),
yaxis = list(title = "Health", tickvals = 1:5, ticktext = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
zaxis = list(title = "Total SAMPWEIGHT")
),
title = "3D Plot of Age Group, Health, and Total SAMPWEIGHT"
)
# Show the plot
fig_bar
Now lets do a plant chart
# Convert categorical variables to factors with numeric levels for plotting
age_hp$age_group_numeric <- as.numeric(factor(age_hp$age_group))
age_hp$health_numeric <- as.numeric(factor(age_hp$health))
# Create a 3D plane chart
fig_plane <- plot_ly(
age_hp,
x = ~age_group_numeric,
y = ~as.numeric(factor(health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))),
z = ~total_SAMPWEIGHT,
type = "mesh3d"
)
# Customize layout
fig_plane <- fig_plane %>%
layout(
scene = list(
xaxis = list(title = "Age Group (Numeric Levels)", tickvals = unique(age_hp$age_group_numeric), ticktext = unique(age_hp$age_group)),
yaxis = list(title = "Health", tickvals = 1:5, ticktext = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
zaxis = list(title = "Total SAMPWEIGHT")
),
title = "3D Plane Chart of Age Group, Health, and Total SAMPWEIGHT"
)
# Show the plot
fig_plane
Just for the hell of it let’s look at this in regards to some of our demographic control variables.
For example, how does this demand relate to sex?
age_hp <- nhis %>%
group_by(age_group, health, sex) %>%
summarize(total_SAMPWEIGHT = sum(SAMPWEIGHT, na.rm = TRUE)) %>%
mutate(
total_SAMPWEIGHT = case_when(
health == "Excellent" ~ total_SAMPWEIGHT * 1,
health == "Very Good" ~ total_SAMPWEIGHT * 1,
health == "Good" ~ total_SAMPWEIGHT * 2,
health == "Fair" ~ total_SAMPWEIGHT * 3,
health == "Poor" ~ total_SAMPWEIGHT * 5
)
)
## `summarise()` has grouped output by 'age_group', 'health'. You can override
## using the `.groups` argument.
age_hp$health <- factor(age_hp$health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))
# Create a 3D scatter plot, with color representing different 'Role' categories
fig_role <- plot_ly(
age_hp,
x = ~age_group,
y = ~as.numeric(factor(health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))),
z = ~total_SAMPWEIGHT,
color = ~sex, # Color by 'Role'
colors = "Set1", # Use a distinct color palette for Role
type = "scatter3d",
mode = "markers",
marker = list(size = 10, showscale = FALSE) # Adjust marker size
)
# Customize layout
fig_role <- fig_role %>%
layout(
scene = list(
xaxis = list(title = "Age Group"),
yaxis = list(title = "Health", tickvals = 1:5, ticktext = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
zaxis = list(title = "Total SAMPWEIGHT")
),
title = "3D Scatter Plot of Age Group, Health, Total SAMPWEIGHT by Role"
)
# Show the plot
fig_role
what about the care status?
age_hp <- nhis %>%
group_by(age_group, health, Role) %>%
summarize(total_SAMPWEIGHT = sum(SAMPWEIGHT, na.rm = TRUE)) %>%
mutate(
total_SAMPWEIGHT = case_when(
health == "Excellent" ~ total_SAMPWEIGHT * 1,
health == "Very Good" ~ total_SAMPWEIGHT * 1,
health == "Good" ~ total_SAMPWEIGHT * 2,
health == "Fair" ~ total_SAMPWEIGHT * 3,
health == "Poor" ~ total_SAMPWEIGHT * 5
)
)
## `summarise()` has grouped output by 'age_group', 'health'. You can override
## using the `.groups` argument.
age_hp$health <- factor(age_hp$health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))
# Create a 3D scatter plot, with color representing different 'Role' categories
fig_role <- plot_ly(
age_hp,
x = ~age_group,
y = ~as.numeric(factor(health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))),
z = ~total_SAMPWEIGHT,
color = ~Role, # Color by 'Role'
colors = "Set1", # Use a distinct color palette for Role
type = "scatter3d",
mode = "markers",
marker = list(size = 10, showscale = FALSE) # Adjust marker size
)
# Customize layout
fig_role <- fig_role %>%
layout(
scene = list(
xaxis = list(title = "Age Group"),
yaxis = list(title = "Health", tickvals = 1:5, ticktext = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
zaxis = list(title = "Total SAMPWEIGHT")
),
title = "3D Scatter Plot of Age Group, Health, Total SAMPWEIGHT by Role"
)
# Show the plot
fig_role
What about the employment status?
age_hp <- nhis %>%
group_by(age_group, health, empstat) %>%
summarize(total_SAMPWEIGHT = sum(SAMPWEIGHT, na.rm = TRUE)) %>%
mutate(
total_SAMPWEIGHT = case_when(
health == "Excellent" ~ total_SAMPWEIGHT * 1,
health == "Very Good" ~ total_SAMPWEIGHT * 1,
health == "Good" ~ total_SAMPWEIGHT * 2,
health == "Fair" ~ total_SAMPWEIGHT * 3,
health == "Poor" ~ total_SAMPWEIGHT * 5
)
)
## `summarise()` has grouped output by 'age_group', 'health'. You can override
## using the `.groups` argument.
age_hp$health <- factor(age_hp$health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))
age_hp <- age_hp %>%
mutate(empstat = ifelse(empstat == "", "NILF", empstat))
# Create a 3D scatter plot, with color representing different 'Role' categories
fig_role <- plot_ly(
age_hp,
x = ~age_group,
y = ~as.numeric(factor(health, levels = c("Excellent", "Very Good", "Good", "Fair", "Poor"))),
z = ~total_SAMPWEIGHT,
color = ~empstat, # Color by 'Role'
colors = "Set1", # Use a distinct color palette for Role
type = "scatter3d",
mode = "markers",
marker = list(size = 10, showscale = FALSE) # Adjust marker size
)
# Customize layout
fig_role <- fig_role %>%
layout(
scene = list(
xaxis = list(title = "Age Group"),
yaxis = list(title = "Health", tickvals = 1:5, ticktext = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
zaxis = list(title = "Total SAMPWEIGHT")
),
title = "3D Scatter Plot of Age Group, Health, Total SAMPWEIGHT by Role"
)
# Show the plot
fig_role