###1.

library(ggplot2)
data("midwest")
# Derive two helper columns from perchsd (presumably the percentage holding a
# high school diploma -- confirm against the dataset documentation):
#   educated     - head count obtained by applying perchsd percent to poptotal
#   perceducated - that head count expressed again as a percent of poptotal
# Algebraically perceducated equals perchsd (up to floating-point rounding),
# because the multiplication by poptotal is undone by the division.
#
# An earlier variant, kept for reference, also added percollege and percprof:
#midwest$educated <- (midwest$percollege*midwest$poptotal + midwest$perchsd*midwest$poptotal +midwest$percprof*midwest$poptotal)/100

midwest$educated <- with(midwest, perchsd * poptotal / 100)

midwest$perceducated <- with(midwest, educated / poptotal * 100)

# Preview the first five rows, now including the two new columns.
head(midwest, n = 5)
## # A tibble: 5 × 30
##     PID county   state  area poptotal popdensity popwhite popblack popamerindian
##   <int> <chr>    <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
## 1   561 ADAMS    IL    0.052    66090      1271.    63917     1702            98
## 2   562 ALEXAND… IL    0.014    10626       759      7054     3496            19
## 3   563 BOND     IL    0.022    14991       681.    14477      429            35
## 4   564 BOONE    IL    0.017    30806      1812.    29344      127            46
## 5   565 BROWN    IL    0.018     5836       324.     5264      547            14
## # ℹ 21 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## #   percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## #   popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## #   poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## #   percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## #   percelderlypoverty <dbl>, inmetro <int>, category <chr>, educated <dbl>,
## #   perceducated <dbl>
# Simple linear regression: does the percentage of educated residents
# (perceducated) explain the percentage living below the poverty line
# (percbelowpoverty)?
model1 <- lm(
  percbelowpoverty ~ perceducated,
  data = midwest
)
summary(model1)
## 
## Call:
## lm(formula = percbelowpoverty ~ perceducated, data = midwest)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9617 -2.8131 -0.6258  2.1152 30.3485 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  50.78508    2.53822   20.01   <2e-16 ***
## perceducated -0.51746    0.03421  -15.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.174 on 435 degrees of freedom
## Multiple R-squared:  0.3447, Adjusted R-squared:  0.3432 
## F-statistic: 228.8 on 1 and 435 DF,  p-value: < 2.2e-16

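The extremely low p-value (< 2.2e-16) shows that the model as a whole is significant: there is a statistically significant negative relationship between the percentage of educated individuals and the percentage of people below the poverty line in the Midwest counties. Each additional percentage point of educated residents is associated with roughly 0.52 percentage points less poverty. However, the model explains only about 34% of the variance (R-squared = 0.3447), so other factors not included in the model are likely influencing poverty rates too.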

# Draw the default plot() output for the fitted lm object.
plot(model1)

# Correlate the observed poverty percentages with the values model1 predicts
# for the same rows, then print the coefficient.
correlation1 <- with(midwest, cor(percbelowpoverty, predict(model1)))
correlation1
## [1] 0.5870967

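This is the correlation between the observed poverty percentages and the model's fitted values; its square equals the model's R-squared (0.5871^2 ≈ 0.3447). A value close to 0 would indicate a weak or no linear relationship between poverty and education, and a value close to 1 a strong one; the observed value of 0.587 indicates a moderate linear relationship.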

# Fitted values of model1 with lower/upper confidence bounds for each
# observation. level = 0.95 is the predict.lm default, spelled out here.
confidence_interval1 <- predict(
  model1,
  interval = "confidence",
  level = 0.95
)
# Show the first six rows (head's default count).
head(confidence_interval1, n = 6L)
##        fit      lwr      upr
## 1 11.91964 11.51977 12.31951
## 2 19.87879 18.84408 20.91350
## 3 14.90666 14.40572 15.40759
## 4 11.73087 11.32558 12.13617
## 5 15.15166 14.63034 15.67298
## 6 11.13205 10.70067 11.56342

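These 95% confidence intervals give, for each county, a range within which we can be reasonably confident that the true mean poverty percentage lies for counties with that percentage of educated residents. They describe the uncertainty in the fitted mean, not in an individual county's value; for the latter, a prediction interval (interval = "prediction") would be needed.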

###2

# Model 2: multiple linear regression with poppovertyknown as the response
# and perchsd, percollege, percwhite and percblack as predictors.
# NOTE(review): this section was originally labelled a median-income
# prediction, but the response used here is poppovertyknown (an integer
# column), not an income variable -- confirm the intended response.
model2 <- lm(
  poppovertyknown ~ perchsd + percollege + percwhite + percblack,
  data = midwest
)
summary(model2)
## 
## Call:
## lm(formula = poppovertyknown ~ perchsd + percollege + percwhite + 
##     percblack, data = midwest)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -923486  -40305   14027   41246 4133762 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 388391.1   310050.0   1.253  0.21100    
## perchsd       -877.3     3250.9  -0.270  0.78739    
## percollege    8709.2     3126.8   2.785  0.00558 ** 
## percwhite    -4743.3     2495.1  -1.901  0.05796 .  
## percblack    24012.4     3498.6   6.863 2.35e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 239500 on 432 degrees of freedom
## Multiple R-squared:  0.3391, Adjusted R-squared:  0.333 
## F-statistic: 55.41 on 4 and 432 DF,  p-value: < 2.2e-16

From the above output: the percollege and percblack variables are statistically significant predictors of poppovertyknown, as shown by their low p-values (0.0056 and 2.35e-11), whereas perchsd (p = 0.787) and percwhite (p = 0.058) are not statistically significant predictors at the 0.05 significance level.

# Columns taking part in model2: the response first, then the predictors.
model2_columns <- c(
  "poppovertyknown", "perchsd", "percollege", "percwhite", "percblack"
)

# Scatterplot matrix of the response against each explanatory variable.
pairs(midwest[model2_columns])

# Pairwise correlation coefficients for the same set of columns.
correlation_matrix2 <- cor(midwest[model2_columns])
print(correlation_matrix2)
##                 poppovertyknown     perchsd percollege   percwhite   percblack
## poppovertyknown       1.0000000  0.13491014  0.2968054 -0.47370346  0.55152672
## perchsd               0.1349101  1.00000000  0.7769244 -0.01860919  0.01370922
## percollege            0.2968054  0.77692441  1.0000000 -0.21473889  0.23727678
## percwhite            -0.4737035 -0.01860919 -0.2147389  1.00000000 -0.75958466
## percblack             0.5515267  0.01370922  0.2372768 -0.75958466  1.00000000

The matrix above shows the pairwise correlations between the variables.

->poppovertyknown has a moderately strong positive correlation with percblack (0.55)-> as the percentage of the black population increases, the number of people whose poverty status is known also tends to increase. ->There is a moderate negative correlation with percwhite (-0.47)-> as the percentage of the white population increases, the number of people whose poverty status is known tends to decrease. Note that poppovertyknown is a population count, not a percentage of people in poverty.

->Correlation does not imply causation, and I have not considered other factors in this analysis.

# Fitted values of model2 with lower/upper confidence bounds for each
# observation. level = 0.95 is the predict.lm default, spelled out here.
confidence_interval2 <- predict(
  model2,
  interval = "confidence",
  level = 0.95
)
# Show the first six rows (head's default count).
head(confidence_interval2, n = 6L)
##         fit         lwr        upr
## 1  96575.13  72846.9789  120303.28
## 2 909048.65 761035.4937 1057061.82
## 3  86562.49  53119.6823  120005.30
## 4  30743.28    254.4239   61232.15
## 5 251276.13 207657.8903  294894.37
## 6  21897.82  -5850.8634   49646.51