# Visualization 1: Boxplot of number of children by education level
boxplot_education <- ggplot(selected_data, aes(x = factor(education), y = kids)) +
  geom_boxplot() +
  labs(title = "Distribution of Number of Children by Education Level", x = "Education Level (years)", y = "Number of Children")
print(boxplot_education)

# Visualization 2: Heatmap of education level vs. number of children
heatmap_education_kids <- ggplot(selected_data, aes(x = education, y = kids)) +
  geom_bin2d() +
  labs(title = "Heatmap of Education Level vs. Number of Children", x = "Education Level (years)", y = "Number of Children") +
  scale_fill_gradient(low = "white", high = "blue")
print(heatmap_education_kids)

# Initial regression analysis: Simple regression
model1 <- lm(kids ~ education, data = selected_data)
summary_model1 <- summary(model1)

# Initial regression analysis: Multiple regression
model2 <- lm(kids ~ education + age + agefirstbirth + ethnicity + immigrant + city16, data = selected_data)
summary_model2 <- summary(model2)

# Print regression summaries
print(summary_model1)
## 
## Call:
## lm(formula = kids ~ education, data = selected_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8844 -0.9079 -0.2976  0.5804  6.1907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.128541   0.114268   36.13   <2e-16 ***
## education   -0.122064   0.008658  -14.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.398 on 3310 degrees of freedom
## Multiple R-squared:  0.05665,    Adjusted R-squared:  0.05637 
## F-statistic: 198.8 on 1 and 3310 DF,  p-value: < 2.2e-16
print(summary_model2)
## 
## Call:
## lm(formula = kids ~ education + age + agefirstbirth + ethnicity + 
##     immigrant + city16, data = selected_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6909 -0.8334 -0.1666  0.6328  5.5822 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.430103   0.152093  22.553  < 2e-16 ***
## education      -0.034677   0.008802  -3.940 8.33e-05 ***
## age             0.030171   0.001380  21.864  < 2e-16 ***
## agefirstbirth  -0.085818   0.005072 -16.918  < 2e-16 ***
## ethnicityother  0.166511   0.055772   2.986  0.00285 ** 
## immigrantyes   -0.004762   0.070380  -0.068  0.94605    
## city16yes       0.002037   0.045907   0.044  0.96461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.274 on 3305 degrees of freedom
## Multiple R-squared:  0.2175, Adjusted R-squared:  0.2161 
## F-statistic: 153.1 on 6 and 3305 DF,  p-value: < 2.2e-16
# Create regression table
modelsummary(list("Simple Model" = model1, "Multiple Model" = model2))
tinytable_p6lzcloslpwi1y0ljh5f
Simple Model Multiple Model
(Intercept) 4.129 3.430
(0.114) (0.152)
education -0.122 -0.035
(0.009) (0.009)
age 0.030
(0.001)
agefirstbirth -0.086
(0.005)
ethnicityother 0.167
(0.056)
immigrantyes -0.005
(0.070)
city16yes 0.002
(0.046)
Num.Obs. 3312 3312
R2 0.057 0.218
R2 Adj. 0.056 0.216
AIC 11623.7 11014.4
BIC 11642.0 11063.2
Log.Lik. -5808.826 -5499.198
RMSE 1.40 1.27