###1.
library(ggplot2)
data("midwest")
# Add two derived columns to midwest:
#   educated     - head count obtained by applying perchsd (a percentage)
#                  to poptotal
#   perceducated - that head count expressed as a percentage of poptotal
# NOTE: perceducated is algebraically the same as perchsd, because the
# multiplication and the division by poptotal cancel each other out.
# Earlier variant that also added the percollege and percprof shares:
# midwest$educated <- (midwest$percollege*midwest$poptotal + midwest$perchsd*midwest$poptotal +midwest$percprof*midwest$poptotal)/100
midwest[["educated"]] <- with(midwest, (perchsd * poptotal) / 100)
midwest[["perceducated"]] <- with(midwest, (educated / poptotal) * 100)
head(midwest, n = 5)
## # A tibble: 5 × 30
## PID county state area poptotal popdensity popwhite popblack popamerindian
## <int> <chr> <chr> <dbl> <int> <dbl> <int> <int> <int>
## 1 561 ADAMS IL 0.052 66090 1271. 63917 1702 98
## 2 562 ALEXAND… IL 0.014 10626 759 7054 3496 19
## 3 563 BOND IL 0.022 14991 681. 14477 429 35
## 4 564 BOONE IL 0.017 30806 1812. 29344 127 46
## 5 565 BROWN IL 0.018 5836 324. 5264 547 14
## # ℹ 21 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## # percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## # popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## # poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## # percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## # percelderlypoverty <dbl>, inmetro <int>, category <chr>, educated <dbl>,
## # perceducated <dbl>
# Simple linear regression: fit percbelowpoverty as a linear function of
# perceducated to see whether the two percentages are related, then print
# the coefficient table and fit statistics.
model1 <- lm(formula = percbelowpoverty ~ perceducated, data = midwest)
summary(object = model1)
##
## Call:
## lm(formula = percbelowpoverty ~ perceducated, data = midwest)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9617 -2.8131 -0.6258 2.1152 30.3485
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.78508 2.53822 20.01 <2e-16 ***
## perceducated -0.51746 0.03421 -15.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.174 on 435 degrees of freedom
## Multiple R-squared: 0.3447, Adjusted R-squared: 0.3432
## F-statistic: 228.8 on 1 and 435 DF, p-value: < 2.2e-16
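The extremely low p-value of the F-statistic (< 2.2e-16) shows that the model as a whole is significant. There is a statistically significant negative relationship between the percentage of educated individuals and the percentage of people below the poverty line in the Midwest counties: each additional percentage point of educated residents is associated with about 0.52 percentage points less poverty. However, the R-squared of 0.34 means the model explains only about a third of the variation in poverty, so other factors not included in the model are likely influencing poverty rates too.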
# Draw the diagnostic plots for the fitted model.
plot(x = model1)
# Correlation between the observed poverty percentages and the values
# predicted by model1 (not between the two raw variables).
correlation1 <- cor(
  x = midwest[["percbelowpoverty"]],
  y = predict(object = model1)
)
correlation1
## [1] 0.5870967
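This is the correlation between the observed poverty percentages and the model's fitted values; for a simple linear regression it equals the square root of R-squared (sqrt(0.3447) ≈ 0.587) and is therefore never negative. A value close to 0 would indicate a weak or no linear relationship, whereas 0.587 indicates a moderate linear relationship between poverty and educated (negative in direction, as the slope of -0.517 shows).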
# Fitted values with lower/upper confidence bounds for every row used to
# fit model1; 0.95 is predict()'s default level, spelled out here.
confidence_interval1 <- predict(
  object = model1,
  interval = "confidence",
  level = 0.95
)
head(confidence_interval1, n = 6L)
## fit lwr upr
## 1 11.91964 11.51977 12.31951
## 2 19.87879 18.84408 20.91350
## 3 14.90666 14.40572 15.40759
## 4 11.73087 11.32558 12.13617
## 5 15.15166 14.63034 15.67298
## 6 11.13205 10.70067 11.56342
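These are 95% confidence intervals (the default level of predict()) for the mean poverty percentage at each county's education level: a range of values within which we can be reasonably confident that the true regression line lies. They are not ranges for an individual county's poverty percentage; that would require a prediction interval (interval = "prediction"), which is wider.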
###2
# Poverty-Known Population Prediction
# Response Variable: poppovertyknown (an integer population count; the midwest data has no median income column)
# linear regression to predict poppovertyknown
# Multiple linear regression of poppovertyknown on two education
# percentages and two race percentages, followed by the fit summary.
model2 <- lm(
  formula = poppovertyknown ~ perchsd + percollege + percwhite + percblack,
  data = midwest
)
summary(object = model2)
##
## Call:
## lm(formula = poppovertyknown ~ perchsd + percollege + percwhite +
## percblack, data = midwest)
##
## Residuals:
## Min 1Q Median 3Q Max
## -923486 -40305 14027 41246 4133762
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 388391.1 310050.0 1.253 0.21100
## perchsd -877.3 3250.9 -0.270 0.78739
## percollege 8709.2 3126.8 2.785 0.00558 **
## percwhite -4743.3 2495.1 -1.901 0.05796 .
## percblack 24012.4 3498.6 6.863 2.35e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 239500 on 432 degrees of freedom
## Multiple R-squared: 0.3391, Adjusted R-squared: 0.333
## F-statistic: 55.41 on 4 and 432 DF, p-value: < 2.2e-16
From the above output: percollege (p = 0.006) and percblack (p < 0.001) are statistically significant predictors of poppovertyknown, as shown by their low p-values, whereas perchsd (p = 0.787) and percwhite (p = 0.058, only marginal) are not statistically significant predictors at the 0.05 significance level.
# Scatterplot matrix of the response and the four explanatory variables
pairs(
  x = midwest[, c("poppovertyknown", "perchsd", "percollege", "percwhite", "percblack")]
)
# Pairwise correlation coefficients among the same five columns
correlation_matrix2 <- cor(
  x = midwest[, c("poppovertyknown", "perchsd", "percollege", "percwhite", "percblack")]
)
print(correlation_matrix2)
## poppovertyknown perchsd percollege percwhite percblack
## poppovertyknown 1.0000000 0.13491014 0.2968054 -0.47370346 0.55152672
## perchsd 0.1349101 1.00000000 0.7769244 -0.01860919 0.01370922
## percollege 0.2968054 0.77692441 1.0000000 -0.21473889 0.23727678
## percwhite -0.4737035 -0.01860919 -0.2147389 1.00000000 -0.75958466
## percblack 0.5515267 0.01370922 0.2372768 -0.75958466 1.00000000
This above matrix shows the pairwise correlations between different variables.
->poppovertyknown has a moderate positive correlation with percblack (0.55) -> as the percentage of the black population increases, the population with known poverty status also tends to increase. Note that poppovertyknown is a head count, not a poverty rate, so this mostly says that counties with a larger black share tend to be more populous. ->There is a moderate negative correlation with percwhite (-0.47) -> as the percentage of the white population increases, the population with known poverty status tends to decrease.
->correlation does not imply causation and I’ve not considered other factors in this analysis.
# Fitted values of model2 with lower/upper confidence bounds for the mean
# response; 0.95 is predict()'s default level, spelled out here.
confidence_interval2 <- predict(
  object = model2,
  interval = "confidence",
  level = 0.95
)
head(confidence_interval2, n = 6L)
## fit lwr upr
## 1 96575.13 72846.9789 120303.28
## 2 909048.65 761035.4937 1057061.82
## 3 86562.49 53119.6823 120005.30
## 4 30743.28 254.4239 61232.15
## 5 251276.13 207657.8903 294894.37
## 6 21897.82 -5850.8634 49646.51