# Visualization 1: Boxplot of number of children by education level
# Scatter plot of education vs. number of children
ggplot(selected_data, aes(x = education, y = kids)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Relationship between Education Level and Number of Children",
       x = "Years of Education",
       y = "Number of Children")
## `geom_smooth()` using formula = 'y ~ x'

# Visualization 2: Heatmap of education level vs. number of children
# Histogram of education level
ggplot(selected_data, aes(x = education)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  labs(title = "Distribution of Education Level",
       x = "Years of Education",
       y = "Frequency")

# Initial regression analysis: Simple regression
model1 <- lm(kids ~ education, data = selected_data)
summary_model1 <- summary(model1)

# Initial regression analysis: Multiple regression
model2 <- lm(kids ~ education + age + agefirstbirth + ethnicity + immigrant + city16, data = selected_data)
summary_model2 <- summary(model2)

# Print regression summaries
print(summary_model1)
## 
## Call:
## lm(formula = kids ~ education, data = selected_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8844 -0.9079 -0.2976  0.5804  6.1907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.128541   0.114268   36.13   <2e-16 ***
## education   -0.122064   0.008658  -14.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.398 on 3310 degrees of freedom
## Multiple R-squared:  0.05665,    Adjusted R-squared:  0.05637 
## F-statistic: 198.8 on 1 and 3310 DF,  p-value: < 2.2e-16
print(summary_model2)
## 
## Call:
## lm(formula = kids ~ education + age + agefirstbirth + ethnicity + 
##     immigrant + city16, data = selected_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6909 -0.8334 -0.1666  0.6328  5.5822 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.430103   0.152093  22.553  < 2e-16 ***
## education      -0.034677   0.008802  -3.940 8.33e-05 ***
## age             0.030171   0.001380  21.864  < 2e-16 ***
## agefirstbirth  -0.085818   0.005072 -16.918  < 2e-16 ***
## ethnicityother  0.166511   0.055772   2.986  0.00285 ** 
## immigrantyes   -0.004762   0.070380  -0.068  0.94605    
## city16yes       0.002037   0.045907   0.044  0.96461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.274 on 3305 degrees of freedom
## Multiple R-squared:  0.2175, Adjusted R-squared:  0.2161 
## F-statistic: 153.1 on 6 and 3305 DF,  p-value: < 2.2e-16
# Create regression table
modelsummary(list("Simple Model" = model1, "Multiple Model" = model2))
tinytable_rteu71gvfr2sqbvgjup4
Simple Model Multiple Model
(Intercept) 4.129 3.430
(0.114) (0.152)
education -0.122 -0.035
(0.009) (0.009)
age 0.030
(0.001)
agefirstbirth -0.086
(0.005)
ethnicityother 0.167
(0.056)
immigrantyes -0.005
(0.070)
city16yes 0.002
(0.046)
Num.Obs. 3312 3312
R2 0.057 0.218
R2 Adj. 0.056 0.216
AIC 11623.7 11014.4
BIC 11642.0 11063.2
Log.Lik. -5808.826 -5499.198
F 198.774 153.129
RMSE 1.40 1.27
# Install the AICcmodavg package
install.packages("AICcmodavg")
## 'C:/Users/kevin/AppData/Local/R/win-library/4.4'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)
## 'TMB', 'unmarked', 'VGAM'(들)을 또한 설치합니다.
## 패키지 'TMB'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 패키지 'unmarked'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 패키지 'VGAM'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 패키지 'AICcmodavg'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
##  C:\Users\kevin\AppData\Local\Temp\RtmpCw8L3T\downloaded_packages
# Load the AICcmodavg package
library(AICcmodavg)


aic_simple <- AIC(model1)
aic_multiple <- AIC(model2)

bic_simple <- BIC(model1)
bic_multiple <- BIC(model2)

list(AIC_simple = aic_simple, AIC_multiple = aic_multiple,
     BIC_simple = bic_simple, BIC_multiple = bic_multiple)
## $AIC_simple
## [1] 11623.65
## 
## $AIC_multiple
## [1] 11014.4
## 
## $BIC_simple
## [1] 11641.97
## 
## $BIC_multiple
## [1] 11063.24
# Interaction plot between education and agefirstbirth
interaction.plot <- ggplot(selected_data, aes(x = education, y = kids, color = agefirstbirth)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Interaction Effect of Education and Age at First Birth on Number of Children",
       x = "Years of Education",
       y = "Number of Children",
       color = "Age at First Birth")

print(interaction.plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?