Data 605 Discussion Post 12

Load in the Education dataset
Build the Multiple Regression Model
Conclusion

Load in the Education dataset

library(ggplot2)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.3

## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v tibble  2.0.1     v purrr   0.2.5
## v tidyr   0.8.2     v dplyr   0.7.8
## v readr   1.3.1     v stringr 1.3.1
## v tibble  2.0.1     v forcats 0.3.0

## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the state-level education data set into `df`.
# The file is addressed through an explicit directory + file name instead of
# calling setwd(), so the session's working directory is left untouched.
# Change `data_dir` to wherever the CSV lives on your machine.
data_dir <- "C:/Users/arnou/Documents/Education Data"
df <- read.csv(
  file = file.path(data_dir, "states_education_data.csv"),
  header = TRUE,
  sep = ","
)

Build the Multiple Regression Model

# Spending per enrolled child: TOTAL_EXPENDITURE scaled by 1000, divided by
# GRADES_ALL_G.
# NOTE(review): the factor of 1000 presumably converts the expenditure units
# (e.g. thousands of dollars -> dollars) -- confirm against the data dictionary.
df[["Expenditure_per_child"]] <-
  df[["TOTAL_EXPENDITURE"]] * 1000 / df[["GRADES_ALL_G"]]

# Unweighted mean of the four average-score columns (math and reading, grades
# 4 and 8). Any missing component makes the row's result NA.
df[["Total_Scores"]] <- with(
  df,
  (AVG_MATH_4_SCORE + AVG_MATH_8_SCORE +
     AVG_READING_4_SCORE + AVG_READING_8_SCORE) / 4
)


# Scatter plot of AVG_MATH_8_SCORE against AVG_MATH_4_SCORE with a fitted
# straight line (method = "lm", no confidence band); x axis limited to 150-300.
ggplot(df, aes(x = AVG_MATH_4_SCORE, y = AVG_MATH_8_SCORE)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  xlim(150, 300)

## Warning: Removed 960 rows containing non-finite values (stat_smooth).

## Warning: Removed 960 rows containing missing values (geom_point).

# Scatter plot of AVG_MATH_8_SCORE against Expenditure_per_child with a fitted
# straight line (method = "lm", no confidence band); x axis limited to 0-30000.
ggplot(df, aes(x = Expenditure_per_child, y = AVG_MATH_8_SCORE)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  xlim(0, 30000)

## Warning: Removed 1016 rows containing non-finite values (stat_smooth).

## Warning: Removed 1016 rows containing missing values (geom_point).

# Scatter plot of AVG_MATH_8_SCORE against AVG_READING_8_SCORE with a fitted
# straight line (method = "lm", no confidence band); x axis limited to 225-300.
ggplot(df, aes(x = AVG_READING_8_SCORE, y = AVG_MATH_8_SCORE)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  xlim(225, 300)

## Warning: Removed 1014 rows containing non-finite values (stat_smooth).

## Warning: Removed 1014 rows containing missing values (geom_point).

# Scatter plot of AVG_MATH_8_SCORE against AVG_READING_4_SCORE with a fitted
# straight line (method = "lm", no confidence band); x axis limited to 150-300.
ggplot(df, aes(x = AVG_READING_4_SCORE, y = AVG_MATH_8_SCORE)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  xlim(150, 300)

## Warning: Removed 970 rows containing non-finite values (stat_smooth).

## Warning: Removed 970 rows containing missing values (geom_point).

# Fit a multiple linear regression of AVG_MATH_8_SCORE on four predictors:
# AVG_MATH_4_SCORE, Expenditure_per_child, AVG_READING_8_SCORE and
# AVG_READING_4_SCORE. Rows with a missing value in any model variable are
# dropped by lm() (see the "deleted due to missingness" line in the summary).
linear_model <- lm(
  formula = AVG_MATH_8_SCORE ~ AVG_MATH_4_SCORE + Expenditure_per_child +
    AVG_READING_8_SCORE + AVG_READING_4_SCORE,
  data = df
)

# Draw the standard lm diagnostic plots for the fitted model.
plot(linear_model)

# Print the call and the estimated coefficients.
linear_model

## 
## Call:
## lm(formula = AVG_MATH_8_SCORE ~ AVG_MATH_4_SCORE + Expenditure_per_child + 
##     AVG_READING_8_SCORE + AVG_READING_4_SCORE, data = df)
## 
## Coefficients:
##           (Intercept)       AVG_MATH_4_SCORE  Expenditure_per_child  
##             1.669e+01              6.781e-01             -8.839e-05  
##   AVG_READING_8_SCORE    AVG_READING_4_SCORE  
##             3.123e-02              4.344e-01

# Print the full regression summary for the fitted model: residual quantiles,
# coefficient estimates with standard errors, t values and p-values, the
# residual standard error, R-squared / adjusted R-squared and the F-statistic.
summary(linear_model)

## 
## Call:
## lm(formula = AVG_MATH_8_SCORE ~ AVG_MATH_4_SCORE + Expenditure_per_child + 
##     AVG_READING_8_SCORE + AVG_READING_4_SCORE, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.400  -2.507  -0.048   2.283   9.431 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.669e+01  8.133e+00   2.052   0.0408 *  
## AVG_MATH_4_SCORE       6.781e-01  3.420e-02  19.830   <2e-16 ***
## Expenditure_per_child -8.839e-05  5.012e-05  -1.764   0.0785 .  
## AVG_READING_8_SCORE    3.123e-02  2.522e-02   1.238   0.2164    
## AVG_READING_4_SCORE    4.344e-01  3.831e-02  11.339   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.531 on 415 degrees of freedom
##   (1072 observations deleted due to missingness)
## Multiple R-squared:  0.8755, Adjusted R-squared:  0.8743 
## F-statistic: 729.6 on 4 and 415 DF,  p-value: < 2.2e-16

Conclusion

I am using the model above to run a multiple regression that tries to predict a state's average 8th grade math score. The variables that I am using to predict the 8th grade math score are the 8th grade reading score, the 4th grade reading and math scores, and the total expenditure per student for the state.

The linear model built with all of these variables has an adjusted R-squared of 0.87, which means that approximately 87% of the variance in 8th grade math scores can be explained by these predictor variables.