#Loading required packages (might not end up using all of the loaded ones)
library(prettydoc) #For the theme used in this document
library(ggplot2)
library(tidyverse)
library(dplyr)
#install.packages("caret")
#install.packages("skimr")
#install.packages("rpart")
#install.packages("randomForest")
#install.packages("MatchIt")
library(MatchIt)
library(stargazer)
library(caret)
library(skimr)
library(rpart)
library(rpart.plot)
library(randomForest)
library(rattle)
library(neuralnet)
library(nnet)##Loading and cleaning the data and Identifying the problem
#Setting up the directory
setwd("D:/UGA Coursework/Second Year/AAEC 8610/HWs/HW11Lost")
getwd()## [1] "D:/UGA Coursework/Second Year/AAEC 8610/HWs/HW11Lost"
df <- read.csv2("ecls.csv", sep = ",",
stringsAsFactors = TRUE)
df1 <- df %>%
select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)
df1 <- as.data.frame(df1)
df1$c5r2mtsc_std <- as.numeric(df1$c5r2mtsc_std)
df1$w3momed_hsb <- as.numeric(df1$w3momed_hsb)
df1$w3income_1k <- as.numeric(df1$w3income_1k)
#Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?
t.test(c5r2mtsc_std ~ catholic, data = df1)##
## Welch Two Sample t-test
##
## data: c5r2mtsc_std by catholic
## t = 0.53061, df = 1341.3, p-value = 0.5958
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -73.56233 128.11145
## sample estimates:
## mean in group 0 mean in group 1
## 2483.336 2456.061
#Can you conclude that going to a catholic school leads to higher math scores? Why or why not?
naiiveregression <- lm(c5r2mtsc_std ~ catholic, data = df1)
summary(naiiveregression)##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2482.34 -1244.34 23.66 1214.66 2510.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2483.34 21.28 116.70 <2e-16 ***
## catholic -27.27 51.41 -0.53 0.596
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1427 on 5427 degrees of freedom
## Multiple R-squared: 5.185e-05, Adjusted R-squared: -0.0001324
## F-statistic: 0.2814 on 1 and 5427 DF, p-value: 0.5958
stargazer(naiiveregression, title = "Naiive Regression", type = "text",
dep.var.caption = "Dependent variable: Tests Scores",
dep.var.labels.include = FALSE,
out = "regression_results.html")##
## Naiive Regression
## ====================================================
## Dependent variable: Tests Scores
## --------------------------------
## ----------------------------------------------------
## catholic -27.275
## (51.415)
##
## Constant 2,483.336***
## (21.280)
##
## ----------------------------------------------------
## Observations 5,429
## R2 0.0001
## Adjusted R2 -0.0001
## Residual Std. Error 1,427.338 (df = 5427)
## F Statistic 0.281 (df = 1; 5427)
## ====================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Do the means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic
#schools?
#t.test(w3income_1k, ~ catholic, data = df1)
#t.test(w3momed_hsb, ~ catholic, data = df1)##Estimate the propensity to go to catholic school, by hand
logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = df1)
#Make a prediction for who is likely to go to catholic school, using this model
df1$predict <- predict(logit)
#Check that there is common support.
plot1 <- ggplot(df1, aes(predict, fill = as.factor(catholic))) +
geom_density()
plot1#Plot the income variable against the logit prediction (by catholic), and add the lowess smooth
#densities to the plot. They should look similar, but not perfectly aligned.
plot2 <- ggplot(df1, aes(predict, y = w3income_1k, fill = as.factor(catholic))) +
geom_point() +
geom_smooth(method="loess")
plot2##Run a Matching algorithm to get a balanced dataset
# Run a MatchIt command with the same equation as your logit.
Matchit <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, method = "nearest", data = df)
propscore <- match.data(Matchit)
propscore$c5r2mtsc_std <- as.numeric(propscore$c5r2mtsc_std)
#Sort the data by distance, and show the 10 first observations.
head(propscore[order(propscore$distance),], 10)## childid catholic race race_white
## 4247 1108020C 0 HISPANIC, RACE SPECIFIED 0
## 4274 1117012C 1 BLACK OR AFRICAN AMERICAN, NON-HISPANIC 0
## 4212 1099009C 0 HISPANIC, RACE NOT SPECIFIED 0
## 4271 1116018C 1 HISPANIC, RACE NOT SPECIFIED 0
## 482 0119017C 1 ASIAN 0
## 845 0214024C 0 WHITE, NON-HISPANIC 1
## 181 0040042C 0 WHITE, NON-HISPANIC 1
## 2083 0485014C 1 WHITE, NON-HISPANIC 1
## 571 0142016C 1 WHITE, NON-HISPANIC 1
## 1042 0270010C 0 WHITE, NON-HISPANIC 1
## race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## 4247 0 1 0 2 34 35
## 4274 1 0 0 2 33 39
## 4212 0 1 0 1 28 33
## 4271 0 1 0 1 36 44
## 482 0 0 1 2 36 46
## 845 0 0 0 2 30 32
## 181 0 0 0 2 34 44
## 2083 0 0 0 2 43 43
## 571 0 0 0 1 42 42
## 1042 0 0 0 1 39 41
## w3daded w3momed w3daded_hsb
## 4247 8TH GRADE OR BELOW VOC/TECH PROGRAM 1
## 4274 HIGH SCHOOL DIPLOMA/EQUIVALENT HIGH SCHOOL DIPLOMA/EQUIVALENT 1
## 4212 8TH GRADE OR BELOW 8TH GRADE OR BELOW 1
## 4271 8TH GRADE OR BELOW HIGH SCHOOL DIPLOMA/EQUIVALENT 1
## 482 SOME COLLEGE SOME COLLEGE 0
## 845 9TH - 12TH GRADE SOME COLLEGE 1
## 181 HIGH SCHOOL DIPLOMA/EQUIVALENT HIGH SCHOOL DIPLOMA/EQUIVALENT 1
## 2083 9TH - 12TH GRADE VOC/TECH PROGRAM 1
## 571 HIGH SCHOOL DIPLOMA/EQUIVALENT HIGH SCHOOL DIPLOMA/EQUIVALENT 1
## 1042 SOME COLLEGE HIGH SCHOOL DIPLOMA/EQUIVALENT 0
## w3momed_hsb w3momscr w3dadscr w3inccat w3income w3povrty
## 4247 1 34.95 39.2 $15,001 TO $20,000 17500.5 1
## 4274 1 38.18 39.18 $15,001 TO $20,000 17500.5 0
## 4212 1 34.95 39.2 $15,001 TO $20,000 17500.5 1
## 4271 1 34.95 53.5 $15,001 TO $20,000 17500.5 1
## 482 0 34.95 34.95 $15,001 TO $20,000 17500.5 1
## 845 0 34.95 39.18 $15,001 TO $20,000 17500.5 1
## 181 1 34.95 29.6 $20,001 TO $25,000 22500.5 0
## 2083 1 38.18 39.18 $20,001 TO $25,000 22500.5 1
## 571 1 35.78 64.89 $10,001 TO $15,000 12500.5 1
## 1042 1 38.18 35.63 $10,001 TO $15,000 12500.5 1
## p5fstamp c5r2mtsc c5r2mtsc_std w3income_1k distance weights subclass
## 4247 0 31.548 1945 17.5005 0.01823061 1 598
## 4274 0 25.325 2022 17.5005 0.01823061 1 598
## 4212 0 40.079 1480 17.5005 0.02245746 1 595
## 4271 0 53.528 2715 17.5005 0.02245746 1 595
## 482 0 36.418 1743 17.5005 0.02900985 1 705
## 845 0 42.728 1246 17.5005 0.02900985 1 705
## 181 0 52.464 2431 22.5005 0.03094559 1 222
## 2083 0 55.627 3147 22.5005 0.03094559 1 222
## 571 0 59.059 3790 12.5005 0.03368196 1 802
## 1042 0 48.419 395 12.5005 0.03368196 1 802
#Compare the means
#t.test(c5r2mtsc_std, ~ catholic, data = propscore)##Compute the average treatment effect in your new sample
newdata1 <- lm(c5r2mtsc_std ~ catholic, data = propscore)
stargazer(newdata1, title = "Naiive Regression", type = "text",
dep.var.caption = "Dependent variable: Tests Scores",
dep.var.labels.include = FALSE,
out = "regression_results.html")##
## Naiive Regression
## ====================================================
## Dependent variable: Tests Scores
## --------------------------------
## ----------------------------------------------------
## catholic -179.890***
## (66.676)
##
## Constant 2,635.952***
## (47.147)
##
## ----------------------------------------------------
## Observations 1,860
## R2 0.004
## Adjusted R2 0.003
## Residual Std. Error 1,437.801 (df = 1858)
## F Statistic 7.279*** (df = 1; 1858)
## ====================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
newdata2 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, data = propscore)
stargazer(newdata2, title = "Regression", type = "text",
dep.var.caption = "Dependent variable: Tests Scores",
dep.var.labels.include = FALSE,
out = "regression_results.html")##
## Regression
## ====================================================
## Dependent variable: Tests Scores
## --------------------------------
## ----------------------------------------------------
## catholic -179.890***
## (64.955)
##
## w3income_1k150.0005 1,108.063**
## (503.376)
##
## w3income_1k17.5005 87.445
## (703.019)
##
## w3income_1k200.001 1,339.521**
## (527.573)
##
## w3income_1k22.5005 757.413
## (595.525)
##
## w3income_1k27.5005 333.257
## (554.316)
##
## w3income_1k32.5005 254.180
## (529.948)
##
## w3income_1k37.5005 399.418
## (518.690)
##
## w3income_1k45.0005 528.122
## (507.930)
##
## w3income_1k62.5005 707.855
## (500.938)
##
## w3income_1k7.5005 -55.085
## (857.993)
##
## w3income_1k87.5005 881.825*
## (501.613)
##
## p5numpla -3.311
## (116.873)
##
## w3momed_hsb -439.341***
## (83.333)
##
## Constant 1,931.012***
## (514.208)
##
## ----------------------------------------------------
## Observations 1,860
## R2 0.061
## Adjusted R2 0.054
## Residual Std. Error 1,400.684 (df = 1845)
## F Statistic 8.603*** (df = 14; 1845)
## ====================================================
## Note: *p<0.1; **p<0.05; ***p<0.01