library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
# Load the raw capstone data from the Excel workbook.
capstone_data <- read_excel(path = "Data for Capstone.xlsx")
# Check how many NAs
sum(is.na(capstone_data$`Scores (4th Grade)`))
## [1] 0
# The score column is read in as character, so is.nan() and is.infinite()
# are always FALSE on it and cannot detect bad entries. Coerce a copy to
# numeric and count everything that is not a finite number instead
# (is.finite() is FALSE for NA, NaN, Inf and -Inf). The coercion warning is
# suppressed here because unparseable entries are exactly what is counted.
score_values <- suppressWarnings(
  as.numeric(capstone_data$`Scores (4th Grade)`)
)
sum(!is.finite(score_values))
## [1] 2
# Work on a copy so the raw import stays available for inspection.
capstone_data_clean <- capstone_data
# Coerce the score column to numeric; entries that are not numbers become NA.
capstone_data_clean$`Scores (4th Grade)` <-
  as.numeric(capstone_data_clean$`Scores (4th Grade)`)
## Warning: NAs introduced by coercion
# Keep only rows with a usable score. The mask is built from the converted
# column of capstone_data_clean itself (not the raw character column of
# capstone_data); is.finite() is FALSE for NA, NaN, Inf and -Inf, so one
# test covers all three conditions.
keep_rows <- is.finite(capstone_data_clean$`Scores (4th Grade)`)
capstone_data_clean <- capstone_data_clean[keep_rows, ]
# Linear regression of 4th grade scores on Year and Treated.
model1 <- lm(
  formula = `Scores (4th Grade)` ~ Year + Treated,
  data = capstone_data_clean
)
# Coefficient table, residual summary and fit statistics.
summary(model1)
##
## Call:
## lm(formula = `Scores (4th Grade)` ~ Year + Treated, data = capstone_data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.950 -5.156 1.266 5.368 11.059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.375e+03 1.581e+02 -8.696 7.58e-14 ***
## Year 7.994e-01 7.871e-02 10.157 < 2e-16 ***
## Treated 4.106e+00 1.359e+00 3.020 0.00322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.862 on 99 degrees of freedom
## Multiple R-squared: 0.5348, Adjusted R-squared: 0.5254
## F-statistic: 56.91 on 2 and 99 DF, p-value: < 2.2e-16
# model1 was already fitted above with this exact formula and data, and
# capstone_data_clean has not changed since, so the model is not refitted;
# summary() on the existing object reproduces the same output.
summary(model1)
##
## Call:
## lm(formula = `Scores (4th Grade)` ~ Year + Treated, data = capstone_data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.950 -5.156 1.266 5.368 11.059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.375e+03 1.581e+02 -8.696 7.58e-14 ***
## Year 7.994e-01 7.871e-02 10.157 < 2e-16 ***
## Treated 4.106e+00 1.359e+00 3.020 0.00322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.862 on 99 degrees of freedom
## Multiple R-squared: 0.5348, Adjusted R-squared: 0.5254
## F-statistic: 56.91 on 2 and 99 DF, p-value: < 2.2e-16
# dplyr is already attached by library(tidyverse) at the top of the file, so
# this call is redundant.
library(dplyr)
# List the raw score entries that cannot be parsed as numbers: as.numeric()
# returns NA for them (with a coercion warning), so is.na() selects exactly
# those rows. Only the score column is kept for display.
capstone_data %>%
filter(is.na(as.numeric(`Scores (4th Grade)`))) %>%
select(`Scores (4th Grade)`)
## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `is.na(as.numeric(`Scores (4th Grade)`))`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 2 × 1
## `Scores (4th Grade)`
## <chr>
## 1 —
## 2 —
# See which entries are not numeric. This has to happen BEFORE the column is
# overwritten: once converted, the original text is gone and the offending
# entries can only show up as NA.
raw_scores <- capstone_data$`Scores (4th Grade)`
# The coercion warning is suppressed because unparseable entries are exactly
# what is being looked for here.
numeric_scores <- suppressWarnings(as.numeric(raw_scores))
non_numeric <- raw_scores[is.na(numeric_scores)]
non_numeric
## [1] "—" "—"
# Convert to numeric, turning invalid entries into NA (done once; repeating
# the conversion on an already-numeric column changes nothing).
capstone_data$`Scores (4th Grade)` <- numeric_scores
# Remove rows with NA in the dependent variable
capstone_data_clean <- capstone_data[!is.na(capstone_data$`Scores (4th Grade)`), ]
# Fit the regression on the freshly rebuilt capstone_data_clean.
model1 <- lm(
  formula = `Scores (4th Grade)` ~ Year + Treated,
  data = capstone_data_clean
)
# Report coefficients and overall fit.
summary(model1)
##
## Call:
## lm(formula = `Scores (4th Grade)` ~ Year + Treated, data = capstone_data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.950 -5.156 1.266 5.368 11.059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.375e+03 1.581e+02 -8.696 7.58e-14 ***
## Year 7.994e-01 7.871e-02 10.157 < 2e-16 ***
## Treated 4.106e+00 1.359e+00 3.020 0.00322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.862 on 99 degrees of freedom
## Multiple R-squared: 0.5348, Adjusted R-squared: 0.5254
## F-statistic: 56.91 on 2 and 99 DF, p-value: < 2.2e-16
# Inspect the dimensions and column types of the cleaned data
str(object = capstone_data_clean)
## tibble [102 × 5] (S3: tbl_df/tbl/data.frame)
## $ State : chr [1:102] "Missouri" "Indiana" "Ohio" "Kentucky" ...
## $ Scores (4th Grade): num [1:102] 222 221 219 215 214 211 208 204 229 225 ...
## $ Year : num [1:102] 1992 1992 1992 1992 1992 ...
## $ Subject : chr [1:102] "Math" "Math" "Math" "Math" ...
## $ Treated : num [1:102] 0 1 1 0 1 0 0 1 1 0 ...
# capstone_data_clean has not been modified since model1 was last fitted
# (str() only inspects it), so the existing fit is summarised rather than
# refitting the identical model.
summary(model1)
##
## Call:
## lm(formula = `Scores (4th Grade)` ~ Year + Treated, data = capstone_data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.950 -5.156 1.266 5.368 11.059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.375e+03 1.581e+02 -8.696 7.58e-14 ***
## Year 7.994e-01 7.871e-02 10.157 < 2e-16 ***
## Treated 4.106e+00 1.359e+00 3.020 0.00322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.862 on 99 degrees of freedom
## Multiple R-squared: 0.5348, Adjusted R-squared: 0.5254
## F-statistic: 56.91 on 2 and 99 DF, p-value: < 2.2e-16
The linear model has a Multiple R-squared of 0.5348, which means that approximately 53.5% of the variation in 4th grade scores is explained by the independent variables Year and Treated (whether or not a state had vouchers). The overall F-test is highly significant (F = 56.91, p < 2.2e-16), indicating that the model explains a significant portion of the variation in scores. Year is highly significant (p < 2e-16), and Treated is also significant (p = 0.00322). Since both predictors have p-values less than 0.05, they significantly contribute to explaining 4th grade scores. There are no insignificant variables in this model.
The significant independent variables in the model are Year and Treated. Year has an estimate of approximately 0.7994, meaning that for each additional year, 4th grade scores increase by about 0.8 points, assuming the treatment status stays the same. This shows a steady improvement in scores over time. Treated (states that had vouchers) has an estimate of approximately 4.106, meaning that being in the treated group is associated with an increase of about 4.1 points in 4th grade scores, holding the year constant. This suggests a positive effect of the treatment on scores. Individually, both variables are positively related to the dependent variable, with Year reflecting gradual progress over time and Treated indicating higher scores in states with vouchers.
# Diagnostic plot 1 of plot.lm(): residuals against fitted values.
plot(x = model1, which = 1)
The model appears to violate the linearity assumption: the residuals vs.
fitted values plot shows a curved pattern rather than a random scatter around zero.