# I'm using "vryrel" as my quantitative variable, which is the percentage of individuals in each state who claim that religion is very important to them. I'm hypothesizing that a higher percentage corresponds to a lower age at first marriage. 

#I'm using "perUrb" as my categorical variable, which is the percentage of the state's population that lives in an urban area. To make it categorical, I'm splitting all states into either "rural" (less than the mean of the distribution that is living in urban area) or "urban" (more than the mean of the distirbution living in urban area).

mean(final$perUrb)
## [1] 73.5818
median(final$perUrb)
## [1] 73.735
final <- final %>%
  mutate(urbanrural = ifelse(perUrb >= 73, "urban", "rural"))
summary(final$t_ageFM)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.45   26.65   27.32   27.31   28.10   29.75
hist(final$t_ageFM)

# Median and mean are about the same (27.3) and the minimum is only 24.4.Overall it's pretty normally distributed with most observed values hovering around the median/mean.

summary(final$vryrel)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.80   33.17   39.05   39.62   45.70   56.60
hist(final$vryrel)

# median and mean are again about the same (39%) with the max being 56 and the lowest being 23. It's a broader distribution with less heaping around the mean so could be interesting to see how it corresponds to the age at first marriage variable.

table(final$urbanrural)
## 
## rural urban 
##    22    28
# just shows that splitting it into two categories based on the mean/median was somewhat effective at equally distributing the states. I originally split it based on 50% and only 4 would've been categorized as rural if I had gone that route. 


ggplot(final, aes(x = t_ageFM, y = vryrel)) +
  geom_point(color = "steelblue") +
  labs(title = "Scatterplot of Age at First Marriage and Religiosity by State",
       x = "Age at First Marriage",
       y = "Percentage Identifying as Very Relgious") +
  theme_minimal() + geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'

#at first glance, it doesn't look like there's much relationship but the trend line does indicate that states that are less religious have a higher age at first marriage


boxplot(t_ageFM ~ urbanrural, data = final,
        main = "Age at First Marriage by Urban/Rural Divice",
        xlab = "Urban/Rural State", ylab = "Age at First Marriage",
        col = c("skyblue", "orange"))

#nice way to show that the rural states may indeed have statistically significant lower ages at first marriage
cor.test(final$vryrel, final$t_ageFM, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  final$vryrel and final$t_ageFM
## t = -2.0714, df = 48, p-value = 0.04372
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.523092649 -0.008807485
## sample estimates:
##       cor 
## -0.286453
#confirms there's a slightly negative correlation with religiosity and age at first marriage; statistically significant per the p value; coefficient shows it's a pretty weak relationship though

final$area_binary <- ifelse(final$urbanrural == "urban", 1, 0)
cor.test(final$area_binary, final$t_ageFM, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  final$area_binary and final$t_ageFM
## t = 3.527, df = 48, p-value = 0.0009367
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2006615 0.6499441
## sample estimates:
##       cor 
## 0.4536701
#confirms somewhat of a positive correlation, where age at first marriage increases with urban-ness; p value shows a high degree of statistical significance; I would say it's a moderate relationship


religionmodel <- lm(t_ageFM ~ vryrel, data = final)
summary(religionmodel)
## 
## Call:
## lm(formula = t_ageFM ~ vryrel, data = final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.55270 -0.60830 -0.00936  0.77351  2.36536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.78609    0.72872  39.502   <2e-16 ***
## vryrel      -0.03723    0.01797  -2.071   0.0437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.094 on 48 degrees of freedom
## Multiple R-squared:  0.08206,    Adjusted R-squared:  0.06293 
## F-statistic: 4.291 on 1 and 48 DF,  p-value: 0.04372
# so when religion is 0, the average age at first marriage would be about 29. For every percentage point that religion goes up, age at first marriage drops by .03.


urbanmodel <- lm(t_ageFM ~ area_binary, data = final)
summary(urbanmodel)
## 
## Call:
## lm(formula = t_ageFM ~ area_binary, data = final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3107 -0.4817 -0.0497  0.6198  1.9893 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  26.7386     0.2169 123.300  < 2e-16 ***
## area_binary   1.0221     0.2898   3.527 0.000937 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.017 on 48 degrees of freedom
## Multiple R-squared:  0.2058, Adjusted R-squared:  0.1893 
## F-statistic: 12.44 on 1 and 48 DF,  p-value: 0.0009367
# for rural states, the average age at first marriage is around 26. For urban states, it's about one year higher on average. 

bothmodel <- lm(t_ageFM ~ vryrel + area_binary, data = final)
summary(bothmodel)
## 
## Call:
## lm(formula = t_ageFM ~ vryrel + area_binary, data = final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.83667 -0.55967 -0.06823  0.67978  1.89696 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.90640    0.71715  38.913  < 2e-16 ***
## vryrel      -0.02834    0.01662  -1.705  0.09479 .  
## area_binary  0.94167    0.28809   3.269  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9975 on 47 degrees of freedom
## Multiple R-squared:  0.2521, Adjusted R-squared:  0.2203 
## F-statistic:  7.92 on 2 and 47 DF,  p-value: 0.001085
#so when religion is 0 and the state is rural, the average age at first marriage is around 27.9; to me this kind of complicates things because one relationship is negative and the other is positive so I'm going to flip around the urban/rural variable

final$ruralbinary <- ifelse(final$urbanrural == "rural", 1, 0)
bothmodel2 <- lm(t_ageFM ~ vryrel + ruralbinary, data = final)
summary(bothmodel2)
## 
## Call:
## lm(formula = t_ageFM ~ vryrel + ruralbinary, data = final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.83667 -0.55967 -0.06823  0.67978  1.89696 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.84807    0.66501  43.380  < 2e-16 ***
## vryrel      -0.02834    0.01662  -1.705  0.09479 .  
## ruralbinary -0.94167    0.28809  -3.269  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9975 on 47 degrees of freedom
## Multiple R-squared:  0.2521, Adjusted R-squared:  0.2203 
## F-statistic:  7.92 on 2 and 47 DF,  p-value: 0.001085
# so this shows that urban states with 0 religion would have a slightly higher age at first marriage compared to the previous model; also the p value is pretty good on this one

# Extract coefficients
coefs <- coef(bothmodel2)

# Build LaTeX-style equation string
eq_latex <- paste0(
  "$\\hat{t\\_ageFM} = ",
  round(coefs[1], 3), " + ",
  round(coefs[2], 3), "\\cdot vryrel + ",
  round(coefs[3], 3), "\\cdot ruralbinary$"
)

# Print equation
eq_latex
## [1] "$\\hat{t\\_ageFM} = 28.848 + -0.028\\cdot vryrel + -0.942\\cdot ruralbinary$"
summary(bothmodel2)$coefficients["vryrel", ]
##    Estimate  Std. Error     t value    Pr(>|t|) 
## -0.02833754  0.01661997 -1.70502930  0.09479130
#shows test statistics for the religion variable.

n <- nrow(final)
k <- length(coef(bothmodel2)) - 1  
df_resid <- n - k - 1
df_resid
## [1] 47
#calculating degrees of freedom

final$residuals <- resid(bothmodel2)
final$fitted <- fitted(bothmodel2)

ggplot(final, aes(x = fitted, y = residuals)) +
  geom_point(color = "steelblue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residual Plot",
       x = "Fitted values",
       y = "Residuals") +
  theme_minimal()

#residual plot looks like it should because the residuals are randomly scattered around zero
#I'd need to know more about the relationship between urbanism and religious affinity in order to determine if the independent variables are potentially homoscedastic

#I would also say that the last model is the one with the best fit because it has the highest r-square and adjusted r-square
grViz("
digraph mediation_model {
  graph [layout = dot, rankdir = LR]

  # Nodes
  vryrel     [label = 'urbanrural (Independent Variable)', shape = box, style = filled, fillcolor = lightblue]
  urbanrural [label = 'vryrel (Mediating Variable)', shape = box, style = filled, fillcolor = white]
  t_ageFM    [label = 't_ageFM (Dependent Variable)', shape = box, style = filled, fillcolor = lightpink]

  # Edges
  vryrel -> urbanrural
  urbanrural -> t_ageFM
  vryrel -> t_ageFM
}
")
grViz("
digraph moderation_diagram {
  graph [layout = dot, rankdir = LR]

  # Nodes
  vryrel     [label = 'urbanrural (Independent Variable)', shape = box, style = filled, fillcolor = lightblue]
  t_ageFM    [label = 't_ageFM (Dependent Variable)', shape = box, style = filled, fillcolor = lightpink]
  urbanrural [label = 'vryrel (Moderator)', shape = box, style = filled, fillcolor = white]
  relation   [label = '', shape = point, width = 0.1]

  # Edges
  vryrel -> relation [arrowhead = none]
  relation -> t_ageFM
  urbanrural -> relation
}
")
final$ruralbinary <- factor(final$ruralbinary)

# Fit regression model with interaction
model <- lm(t_ageFM ~ vryrel * ruralbinary, data = final)

# Sequence of vryrel values
vryrel_seq <- seq(min(final$vryrel), max(final$vryrel), length.out = 100)

# New data for predictions
newdata <- expand.grid(
  vryrel = vryrel_seq,
  ruralbinary = levels(final$ruralbinary)
)

# Predicted values
newdata$pred <- predict(model, newdata)

ggplot(newdata, aes(x = vryrel, y = pred, color = ruralbinary)) +
  geom_line(size = 1.2) +
  labs(
    x = "vryrel (Quantitative IV)",
    y = "Predicted t_ageFM (DV)",
    color = "Rural Binary",
    title = "Predicted Values of t_ageFM by vryrel and ruralbinary"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.