#LOAD LIBRARIES

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.3.3
library(tidyr)
library(readr)

# Load data ----

# Read fertility2.csv (path is relative to the working directory)
Fertility <- read.csv(file = "fertility2.csv")

# Preview the first six rows of the raw data
head(Fertility, n = 6L)
##   rownames morekids gender1 gender2 age afam hispanic other work
## 1        1       no  female  female  35   no       no    no   40
## 2        2       no  female    male  33   no       no    no   52
## 3        3       no  female  female  28   no       no    no   12
## 4        4       no    male    male  28   no       no    no    0
## 5        5       no  female    male  31   no       no    no    0
## 6        6       no  female    male  33   no       no    no    0

# Clean data ----

# One pipeline, three steps:
#   1. keep the eight analysis columns, giving them descriptive names;
#   2. convert the yes/no and gender columns to factors;
#   3. drop every row that still contains a missing value.
Fertility <- Fertility %>%
  select(
    MoreThanTwoChildren = morekids,
    GenderFirstChild = gender1,
    GenderSecondChild = gender2,
    Age = age,
    AfricanAmerican = afam,
    Hispanic = hispanic,
    OtherEthnicities = other,
    WeeksWorked = work
  ) %>%
  mutate(
    across(
      c(
        MoreThanTwoChildren,
        GenderFirstChild,
        GenderSecondChild,
        AfricanAmerican,
        Hispanic,
        OtherEthnicities
      ),
      factor
    )
  ) %>%
  drop_na()

# Sanity-check the cleaned data: level counts for factors, quantiles for
# the numeric columns
summary(Fertility)
##  MoreThanTwoChildren GenderFirstChild GenderSecondChild      Age       
##  no :18672           female:14549     female:14818      Min.   :21.00  
##  yes:11328           male  :15451     male  :15182      1st Qu.:28.00  
##                                                         Median :31.00  
##                                                         Mean   :30.35  
##                                                         3rd Qu.:33.00  
##                                                         Max.   :35.00  
##  AfricanAmerican Hispanic    OtherEthnicities  WeeksWorked   
##  no :28402       no :27768   no :28295        Min.   : 0.00  
##  yes: 1598       yes: 2232   yes: 1705        1st Qu.: 0.00  
##                                               Median : 6.00  
##                                               Mean   :19.21  
##                                               3rd Qu.:45.00  
##                                               Max.   :52.00

# Descriptive statistics ----

# Skim-style overview of every column in the cleaned data set
datasummary_skim(data = Fertility)
## tinytable_pgzy5bc1zmafvp2600qj
## Unique Missing Pct. Mean SD Min Median Max Histogram
## Age 15 0 30.4 3.4 21.0 31.0 35.0
## WeeksWorked 53 0 19.2 21.9 0.0 6.0 52.0
## N %
## MoreThanTwoChildren no 18672 62.2
## yes 11328 37.8
## GenderFirstChild female 14549 48.5
## male 15451 51.5
## GenderSecondChild female 14818 49.4
## male 15182 50.6
## AfricanAmerican no 28402 94.7
## yes 1598 5.3
## Hispanic no 27768 92.6
## yes 2232 7.4
## OtherEthnicities no 28295 94.3
## yes 1705 5.7

# Descriptive visualization ----

# Overlaid, semi-transparent density curves of weeks worked, one curve per
# level of MoreThanTwoChildren
ggplot(
  data = Fertility,
  mapping = aes(x = WeeksWorked, fill = MoreThanTwoChildren)
) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(
    fill = "More Than Two Children",
    x = "Weeks Worked",
    y = "Density",
    title = "Conditional Density of Weeks Worked by Having More Than Two Children"
  )

# Empirical CDF of weeks worked, drawn as a step function, one line per
# level of MoreThanTwoChildren
ggplot(
  data = Fertility,
  mapping = aes(x = WeeksWorked, color = MoreThanTwoChildren)
) +
  stat_ecdf(geom = "step") +
  theme_minimal() +
  labs(
    color = "More Than Two Children",
    x = "Weeks Worked",
    y = "Cumulative Probability",
    title = "Cumulative Distribution of Weeks Worked by Having More Than Two Children"
  )

# Load necessary libraries
library(dplyr)
library(ggplot2)
library(modelsummary)
library(interactions)
## Warning: package 'interactions' was built under R version 4.3.3
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.3.3

# Regression ----

# Linear model of weeks worked. MoreThanTwoChildren enters on its own and
# interacted with AfricanAmerican, Hispanic and Age; the genders of the first
# two children enter as additive terms only.
regression_model <- lm(
  formula = WeeksWorked ~
    MoreThanTwoChildren * AfricanAmerican +
    MoreThanTwoChildren * Hispanic +
    MoreThanTwoChildren * Age +
    GenderFirstChild +
    GenderSecondChild,
  data = Fertility
)

# Coefficient table with significance stars
modelsummary(regression_model, stars = TRUE)
## tinytable_pc8jct3fxnwyfywpjhur
## (1)
## + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
## (Intercept) -3.893**
## (1.396)
## MoreThanTwoChildrenyes -8.223***
## (2.390)
## AfricanAmericanyes 11.842***
## (0.743)
## Hispanicyes 1.128+
## (0.669)
## Age 0.824***
## (0.046)
## GenderFirstChildmale 0.189
## (0.248)
## GenderSecondChildmale -0.249
## (0.247)
## MoreThanTwoChildrenyes × AfricanAmericanyes -0.974
## (1.111)
## MoreThanTwoChildrenyes × Hispanicyes -0.308
## (0.949)
## MoreThanTwoChildrenyes × Age 0.046
## (0.077)
## Num.Obs. 30000
## R2 0.047
## R2 Adj. 0.046
## AIC 269022.6
## BIC 269114.0
## Log.Lik. -134500.277
## F 163.408
## RMSE 21.42

#Interaction Plot

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Aggregate to the mean of WeeksWorked within each combination of Age,
# MoreThanTwoChildren and AfricanAmerican.
# `.groups = "drop"` returns an ungrouped result and avoids the
# "summarise() has grouped output by ..." message that is emitted when the
# grouping of the result is left implicit.
summary_data <- Fertility %>%
  group_by(Age, MoreThanTwoChildren, AfricanAmerican) %>%
  summarize(
    MeanWeeksWorked = mean(WeeksWorked, na.rm = TRUE),
    .groups = "drop"
  )

# Line plot of the cell means against Age: colour distinguishes
# MoreThanTwoChildren, line type distinguishes AfricanAmerican.
# Line thickness is set with `linewidth`; using `size` for lines has been
# deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
line_plot <- ggplot(
  summary_data,
  aes(
    x = Age,
    y = MeanWeeksWorked,
    color = MoreThanTwoChildren,
    linetype = AfricanAmerican
  )
) +
  geom_line(linewidth = 1) +
  labs(
    title = "Average Weeks Worked by Age, Family Size, and Ethnicity",
    x = "Age of Mother",
    y = "Average Weeks Worked",
    # Readable legend titles, matching the earlier plots
    color = "More Than Two Children",
    linetype = "African American"
  ) +
  theme_minimal()

# Display the plot
print(line_plot)

#END