# Load libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the `midwest` dataset into the workspace; the analysis below reads its
# state, inmetro, percollege, perchsd, percprof and percbelowpoverty columns.
data("midwest")

Null hypothesis (H0) for the ANOVA test: the mean college education rate (percollege) is the same across the different states in the Midwest, i.e. there is no significant difference among states.

# One-way ANOVA of percollege across the state groups in midwest.
model <- aov(formula = percollege ~ state, data = midwest)
# Show the ANOVA table (Df, sums of squares, F value and p-value).
summary(model)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## state         4    774  193.52   5.122 0.000485 ***
## Residuals   432  16322   37.78                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

In this case, the p-value is very small (0.000485), which is strong evidence against the null hypothesis. Therefore, we reject the null hypothesis: there is a significant difference in mean college education rates among the states in the Midwest. Note that the ANOVA only tells us that at least one state differs from the others; it does not tell us which states differ.

I want to examine the relationship between “percbelowpoverty” (percent of people below the poverty line; the response) and “percollege” (percent college educated; the predictor) in urban and rural counties using linear regression.

# Split the counties by the inmetro flag: 0 is kept as rural, 1 as urban.
rural <- dplyr::filter(midwest, inmetro == 0)
urban <- dplyr::filter(midwest, inmetro == 1)

# Simple linear regression on the urban subset:
# percbelowpoverty (response) explained by percollege (predictor).
model_urban <- lm(formula = percbelowpoverty ~ percollege, data = urban)
summary(model_urban)
## 
## Call:
## lm(formula = percbelowpoverty ~ percollege, data = urban)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9760 -3.5336 -0.1495  2.7718 12.3581 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.23876    1.05600  11.590   <2e-16 ***
## percollege  -0.08658    0.04465  -1.939   0.0544 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.048 on 148 degrees of freedom
## Multiple R-squared:  0.02478,    Adjusted R-squared:  0.01819 
## F-statistic:  3.76 on 1 and 148 DF,  p-value: 0.0544
# Same simple linear regression, fitted on the rural subset:
# percbelowpoverty (response) explained by percollege (predictor).
model_rural <- lm(formula = percbelowpoverty ~ percollege, data = rural)
summary(model_rural)
## 
## Call:
## lm(formula = percbelowpoverty ~ percollege, data = rural)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.999 -3.480 -1.024  2.255 33.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17.38276    1.23664   14.06  < 2e-16 ***
## percollege  -0.23092    0.07449   -3.10  0.00213 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.197 on 285 degrees of freedom
## Multiple R-squared:  0.03262,    Adjusted R-squared:  0.02923 
## F-statistic:  9.61 on 1 and 285 DF,  p-value: 0.002129
# Scatter plot for the rural (inmetro == 0) counties with the least-squares
# line of percbelowpoverty on percollege, i.e. the relationship that
# model_rural estimates.
#
# The response is placed on the y axis because geom_smooth() fits y against x.
# Its `formula` has to be written with the aesthetic names, response first
# (`y ~ x`); the earlier `x ~ y` made stat_smooth() fail with
# "object 'y' not found", so no fitted line was drawn.
ggplot(rural, aes(x = percollege, y = percbelowpoverty)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(x = "College Education Rates", y = "percbelowpoverty") +
  # The plotted data is the rural subset, so the title must not say "Inmetro".
  ggtitle("Scatter Plot of percbelowpoverty vs. College Education Rates (Rural Counties)")
## Warning: Computation failed in `stat_smooth()`
## Caused by error:
## ! object 'y' not found

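We know from the previous output that, for the rural model, the minimum residual is approximately -7.999 and the maximum is about 33.002. The maximum is far larger than the quartiles (1Q = -3.480, 3Q = 2.255), which suggests that a few counties have much higher poverty rates than the model predicts. Together with the low R-squared (0.03262), this indicates that the model does not fit the data well.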

Next, I fit a multiple linear regression model predicting “percbelowpoverty” (the percentage of the population below the poverty line) from the variables “percollege,” “perchsd,” “percprof,” and all of their interaction terms, using only the urban counties (those with “inmetro” equal to 1).

# Multiple regression on the urban subset. The `*` operator expands to the
# three main effects, every two-way interaction and the three-way interaction.
model_lm2 <- lm(
  formula = percbelowpoverty ~ percollege * perchsd * percprof,
  data = urban
)
summary(model_lm2)
## 
## Call:
## lm(formula = percbelowpoverty ~ percollege * perchsd * percprof, 
##     data = urban)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3053 -2.2490 -0.0197  2.0597  7.7872 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                 65.006842  21.057738   3.087  0.00243 **
## percollege                   1.624188   1.428227   1.137  0.25737   
## perchsd                     -0.799990   0.276251  -2.896  0.00438 **
## percprof                    -7.727712   6.275649  -1.231  0.22022   
## percollege:perchsd          -0.018744   0.016787  -1.117  0.26606   
## percollege:percprof          0.120905   0.172466   0.701  0.48443   
## perchsd:percprof             0.108015   0.080185   1.347  0.18010   
## percollege:perchsd:percprof -0.001565   0.001979  -0.791  0.43023   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.065 on 142 degrees of freedom
## Multiple R-squared:  0.4636, Adjusted R-squared:  0.4371 
## F-statistic: 17.53 on 7 and 142 DF,  p-value: < 2.2e-16
# Urban counties: percbelowpoverty against percollege, overlaid with the
# simple least-squares line (y ~ x) and no confidence band.
ggplot(urban, mapping = aes(x = percollege, y = percbelowpoverty)) +
  labs(title = "Scatterplot with Fitted Line", x = "percollege", y = "percbelowpoverty") +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", color = "lightblue", se = FALSE)

# Diagnostic plots for the fitted multiple regression model.
plot(x = model_lm2)