#LOAD LIBRARIES
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.3.3
library(tidyr)
library(readr)
# Load data ----
# Read the fertility data set from a CSV file in the working directory
Fertility <- read.csv(file = "fertility2.csv")
# Preview the first six rows to check that the import looks sensible
head(Fertility, n = 6L)
## rownames morekids gender1 gender2 age afam hispanic other work
## 1 1 no female female 35 no no no 40
## 2 2 no female male 33 no no no 52
## 3 3 no female female 28 no no no 12
## 4 4 no male male 28 no no no 0
## 5 5 no female male 31 no no no 0
## 6 6 no female male 33 no no no 0
# Clean data ----
# One pass over the raw data: keep only the analysis variables under
# descriptive names (select() renames while it selects, so the leftover
# `rownames` index column is dropped), encode the categorical columns as
# factors, and discard any row that still has a missing value.
Fertility <- Fertility %>%
  select(
    MoreThanTwoChildren = morekids,
    GenderFirstChild = gender1,
    GenderSecondChild = gender2,
    Age = age,
    AfricanAmerican = afam,
    Hispanic = hispanic,
    OtherEthnicities = other,
    WeeksWorked = work
  ) %>%
  mutate(
    across(
      c(
        MoreThanTwoChildren, GenderFirstChild, GenderSecondChild,
        AfricanAmerican, Hispanic, OtherEthnicities
      ),
      factor
    )
  ) %>%
  drop_na()
# Check the cleaned data: level counts for factors, quantiles for numerics
summary(Fertility)
## MoreThanTwoChildren GenderFirstChild GenderSecondChild Age
## no :18672 female:14549 female:14818 Min. :21.00
## yes:11328 male :15451 male :15182 1st Qu.:28.00
## Median :31.00
## Mean :30.35
## 3rd Qu.:33.00
## Max. :35.00
## AfricanAmerican Hispanic OtherEthnicities WeeksWorked
## no :28402 no :27768 no :28295 Min. : 0.00
## yes: 1598 yes: 2232 yes: 1705 1st Qu.: 0.00
## Median : 6.00
## Mean :19.21
## 3rd Qu.:45.00
## Max. :52.00
# Descriptive statistics ----
# Summary table covering every variable in the cleaned data
datasummary_skim(data = Fertility)
| Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram | |
|---|---|---|---|---|---|---|---|---|
| Age | 15 | 0 | 30.4 | 3.4 | 21.0 | 31.0 | 35.0 | |
| WeeksWorked | 53 | 0 | 19.2 | 21.9 | 0.0 | 6.0 | 52.0 | |
| N | % | |||||||
| MoreThanTwoChildren | no | 18672 | 62.2 | |||||
| yes | 11328 | 37.8 | ||||||
| GenderFirstChild | female | 14549 | 48.5 | |||||
| male | 15451 | 51.5 | ||||||
| GenderSecondChild | female | 14818 | 49.4 | |||||
| male | 15182 | 50.6 | ||||||
| AfricanAmerican | no | 28402 | 94.7 | |||||
| yes | 1598 | 5.3 | ||||||
| Hispanic | no | 27768 | 92.6 | |||||
| yes | 2232 | 7.4 | ||||||
| OtherEthnicities | no | 28295 | 94.3 | |||||
| yes | 1705 | 5.7 |
# Descriptive visualization ----
# Overlaid, semi-transparent densities of weeks worked, one per family-size
# group, so the two distributions can be compared directly.
ggplot(
  data = Fertility,
  mapping = aes(x = WeeksWorked, fill = MoreThanTwoChildren)
) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Conditional Density of Weeks Worked by Having More Than Two Children",
    x = "Weeks Worked",
    y = "Density",
    fill = "More Than Two Children"
  )
# Empirical CDF of weeks worked for the same two groups, drawn as step lines
ggplot(
  data = Fertility,
  mapping = aes(x = WeeksWorked, color = MoreThanTwoChildren)
) +
  stat_ecdf(geom = "step") +
  theme_minimal() +
  labs(
    title = "Cumulative Distribution of Weeks Worked by Having More Than Two Children",
    x = "Weeks Worked",
    y = "Cumulative Probability",
    color = "More Than Two Children"
  )
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(modelsummary)
library(interactions)
## Warning: package 'interactions' was built under R version 4.3.3
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.3.3
# Regression ----
# Linear model of weeks worked on the more-than-two-children indicator.
# That indicator is interacted with African-American status, Hispanic
# status, and age (`*` adds both main effects and the interaction); the
# gender of the first and second child enter as additive controls.
regression_model <- lm(
  formula = WeeksWorked ~
    MoreThanTwoChildren * AfricanAmerican +
    MoreThanTwoChildren * Hispanic +
    MoreThanTwoChildren * Age +
    GenderFirstChild +
    GenderSecondChild,
  data = Fertility
)
# Present the fitted coefficients in a formatted table with significance stars
modelsummary(regression_model, stars = TRUE)
| (1) | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| (Intercept) | -3.893** |
| (1.396) | |
| MoreThanTwoChildrenyes | -8.223*** |
| (2.390) | |
| AfricanAmericanyes | 11.842*** |
| (0.743) | |
| Hispanicyes | 1.128+ |
| (0.669) | |
| Age | 0.824*** |
| (0.046) | |
| GenderFirstChildmale | 0.189 |
| (0.248) | |
| GenderSecondChildmale | -0.249 |
| (0.247) | |
| MoreThanTwoChildrenyes × AfricanAmericanyes | -0.974 |
| (1.111) | |
| MoreThanTwoChildrenyes × Hispanicyes | -0.308 |
| (0.949) | |
| MoreThanTwoChildrenyes × Age | 0.046 |
| (0.077) | |
| Num.Obs. | 30000 |
| R2 | 0.047 |
| R2 Adj. | 0.046 |
| AIC | 269022.6 |
| BIC | 269114.0 |
| Log.Lik. | -134500.277 |
| F | 163.408 |
| RMSE | 21.42 |
#Interaction Plot
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Mean weeks worked for every combination of age, family size, and
# African-American status; this is the data behind the line plot.
# `.groups = "drop"` returns an ungrouped result, so later dplyr verbs are
# not silently applied per group and summarise() no longer emits its
# "has grouped output by ..." message.
summary_data <- Fertility %>%
  group_by(Age, MoreThanTwoChildren, AfricanAmerican) %>%
  summarize(
    MeanWeeksWorked = mean(WeeksWorked, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'Age', 'MoreThanTwoChildren'. You can
## override using the `.groups` argument.
# Line plot of mean weeks worked against age: colour separates the
# family-size groups and line type separates African-American status.
# Line thickness is set with `linewidth`; using `size` for lines was
# deprecated in ggplot2 3.4.0 and triggers a lifecycle warning.
line_plot <- ggplot(
  summary_data,
  aes(
    x = Age,
    y = MeanWeeksWorked,
    color = MoreThanTwoChildren,
    linetype = AfricanAmerican
  )
) +
  geom_line(linewidth = 1) +
  labs(
    title = "Average Weeks Worked by Age, Family Size, and Ethnicity",
    x = "Age of Mother",
    y = "Average Weeks Worked"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the plot. The ggplot object was assigned to `line_plot` above
# rather than auto-printed, so it has to be printed explicitly to render.
print(line_plot)
#END