HW11 (Lost)

#Loading required packages (might not end up using all of the loaded ones)
library(prettydoc) #For the theme used in this document
library(ggplot2)
library(tidyverse)
library(dplyr)
#install.packages("caret")
#install.packages("skimr")
#install.packages("rpart")
#install.packages("randomForest")
#install.packages("MatchIt")
library(MatchIt)
library(stargazer)
library(caret)
library(skimr)
library(rpart)
library(rpart.plot)
library(randomForest)
library(rattle)
library(neuralnet)
library(nnet)

##Loading and cleaning the data and Identifying the problem

#Setting up the directory
setwd("D:/UGA Coursework/Second Year/AAEC 8610/HWs/HW11Lost")
getwd()

## [1] "D:/UGA Coursework/Second Year/AAEC 8610/HWs/HW11Lost"

df <- read.csv2("ecls.csv", sep = ",",
                       stringsAsFactors = TRUE)
df1 <- df %>%
  select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)

df1 <- as.data.frame(df1)

df1$c5r2mtsc_std <- as.numeric(df1$c5r2mtsc_std)
df1$w3momed_hsb <- as.numeric(df1$w3momed_hsb)
df1$w3income_1k <- as.numeric(df1$w3income_1k)


#Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?
t.test(c5r2mtsc_std ~ catholic, data = df1)

## 
##  Welch Two Sample t-test
## 
## data:  c5r2mtsc_std by catholic
## t = 0.53061, df = 1341.3, p-value = 0.5958
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -73.56233 128.11145
## sample estimates:
## mean in group 0 mean in group 1 
##        2483.336        2456.061

#Can you conclude that going to a catholic school leads to higher math scores? Why or why not?
naiiveregression <- lm(c5r2mtsc_std ~ catholic, data = df1)
summary(naiiveregression)

## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2482.34 -1244.34    23.66  1214.66  2510.94 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2483.34      21.28  116.70   <2e-16 ***
## catholic      -27.27      51.41   -0.53    0.596    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1427 on 5427 degrees of freedom
## Multiple R-squared:  5.185e-05,  Adjusted R-squared:  -0.0001324 
## F-statistic: 0.2814 on 1 and 5427 DF,  p-value: 0.5958

stargazer(naiiveregression, title = "Naiive Regression", type = "text",
          dep.var.caption = "Dependent variable: Tests Scores",
          dep.var.labels.include = FALSE,
          out = "regression_results.html")

## 
## Naiive Regression
## ====================================================
##                     Dependent variable: Tests Scores
##                     --------------------------------
## ----------------------------------------------------
## catholic                        -27.275             
##                                 (51.415)            
##                                                     
## Constant                      2,483.336***          
##                                 (21.280)            
##                                                     
## ----------------------------------------------------
## Observations                     5,429              
## R2                               0.0001             
## Adjusted R2                     -0.0001             
## Residual Std. Error      1,427.338 (df = 5427)      
## F Statistic               0.281 (df = 1; 5427)      
## ====================================================
## Note:                    *p<0.1; **p<0.05; ***p<0.01

#Do the means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic
#schools?

#t.test(w3income_1k, ~ catholic, data = df1)
#t.test(w3momed_hsb, ~ catholic, data = df1)

##Estimate the propensity to go to catholic school, by hand

logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = df1)

#Make a prediction for who is likely to go to catholic school, using this model
df1$predict <- predict(logit)

#Check that there is common support.
plot1 <- ggplot(df1, aes(predict, fill = as.factor(catholic))) +
  geom_density() 
plot1

#Plot the income variable against the logit prediction (by catholic), and add the lowess smooth
#densities to the plot. They should look similar, but not perfectly aligned.
plot2 <- ggplot(df1, aes(predict, y = w3income_1k, fill = as.factor(catholic))) +
  geom_point() +
  geom_smooth(method="loess")
plot2

##Run a Matching algorithm to get a balanced dataset

# Run a MatchIt command with the same equation as your logit.
Matchit <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, method = "nearest", data = df)
propscore <- match.data(Matchit)
propscore$c5r2mtsc_std <- as.numeric(propscore$c5r2mtsc_std)

#Sort the data by distance, and show the 10 first observations.
head(propscore[order(propscore$distance),], 10)

##       childid catholic                                    race race_white
## 4247 1108020C        0                HISPANIC, RACE SPECIFIED          0
## 4274 1117012C        1 BLACK OR AFRICAN AMERICAN, NON-HISPANIC          0
## 4212 1099009C        0            HISPANIC, RACE NOT SPECIFIED          0
## 4271 1116018C        1            HISPANIC, RACE NOT SPECIFIED          0
## 482  0119017C        1                                   ASIAN          0
## 845  0214024C        0                     WHITE, NON-HISPANIC          1
## 181  0040042C        0                     WHITE, NON-HISPANIC          1
## 2083 0485014C        1                     WHITE, NON-HISPANIC          1
## 571  0142016C        1                     WHITE, NON-HISPANIC          1
## 1042 0270010C        0                     WHITE, NON-HISPANIC          1
##      race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## 4247          0             1          0        2      34      35
## 4274          1             0          0        2      33      39
## 4212          0             1          0        1      28      33
## 4271          0             1          0        1      36      44
## 482           0             0          1        2      36      46
## 845           0             0          0        2      30      32
## 181           0             0          0        2      34      44
## 2083          0             0          0        2      43      43
## 571           0             0          0        1      42      42
## 1042          0             0          0        1      39      41
##                             w3daded                        w3momed w3daded_hsb
## 4247             8TH GRADE OR BELOW               VOC/TECH PROGRAM           1
## 4274 HIGH SCHOOL DIPLOMA/EQUIVALENT HIGH SCHOOL DIPLOMA/EQUIVALENT           1
## 4212             8TH GRADE OR BELOW             8TH GRADE OR BELOW           1
## 4271             8TH GRADE OR BELOW HIGH SCHOOL DIPLOMA/EQUIVALENT           1
## 482                    SOME COLLEGE                   SOME COLLEGE           0
## 845                9TH - 12TH GRADE                   SOME COLLEGE           1
## 181  HIGH SCHOOL DIPLOMA/EQUIVALENT HIGH SCHOOL DIPLOMA/EQUIVALENT           1
## 2083               9TH - 12TH GRADE               VOC/TECH PROGRAM           1
## 571  HIGH SCHOOL DIPLOMA/EQUIVALENT HIGH SCHOOL DIPLOMA/EQUIVALENT           1
## 1042                   SOME COLLEGE HIGH SCHOOL DIPLOMA/EQUIVALENT           0
##      w3momed_hsb w3momscr w3dadscr           w3inccat w3income w3povrty
## 4247           1    34.95     39.2 $15,001 TO $20,000  17500.5        1
## 4274           1    38.18    39.18 $15,001 TO $20,000  17500.5        0
## 4212           1    34.95     39.2 $15,001 TO $20,000  17500.5        1
## 4271           1    34.95     53.5 $15,001 TO $20,000  17500.5        1
## 482            0    34.95    34.95 $15,001 TO $20,000  17500.5        1
## 845            0    34.95    39.18 $15,001 TO $20,000  17500.5        1
## 181            1    34.95     29.6 $20,001 TO $25,000  22500.5        0
## 2083           1    38.18    39.18 $20,001 TO $25,000  22500.5        1
## 571            1    35.78    64.89 $10,001 TO $15,000  12500.5        1
## 1042           1    38.18    35.63 $10,001 TO $15,000  12500.5        1
##      p5fstamp c5r2mtsc c5r2mtsc_std w3income_1k   distance weights subclass
## 4247        0   31.548         1945     17.5005 0.01823061       1      598
## 4274        0   25.325         2022     17.5005 0.01823061       1      598
## 4212        0   40.079         1480     17.5005 0.02245746       1      595
## 4271        0   53.528         2715     17.5005 0.02245746       1      595
## 482         0   36.418         1743     17.5005 0.02900985       1      705
## 845         0   42.728         1246     17.5005 0.02900985       1      705
## 181         0   52.464         2431     22.5005 0.03094559       1      222
## 2083        0   55.627         3147     22.5005 0.03094559       1      222
## 571         0   59.059         3790     12.5005 0.03368196       1      802
## 1042        0   48.419          395     12.5005 0.03368196       1      802

#Compare the means
#t.test(c5r2mtsc_std, ~ catholic, data = propscore)

##Compute the average treatment effect in your new sample

newdata1 <- lm(c5r2mtsc_std ~ catholic, data = propscore)

stargazer(newdata1, title = "Naiive Regression", type = "text",
          dep.var.caption = "Dependent variable: Tests Scores",
          dep.var.labels.include = FALSE,
          out = "regression_results.html")

## 
## Naiive Regression
## ====================================================
##                     Dependent variable: Tests Scores
##                     --------------------------------
## ----------------------------------------------------
## catholic                      -179.890***           
##                                 (66.676)            
##                                                     
## Constant                      2,635.952***          
##                                 (47.147)            
##                                                     
## ----------------------------------------------------
## Observations                     1,860              
## R2                               0.004              
## Adjusted R2                      0.003              
## Residual Std. Error      1,437.801 (df = 1858)      
## F Statistic             7.279*** (df = 1; 1858)     
## ====================================================
## Note:                    *p<0.1; **p<0.05; ***p<0.01

newdata2 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, data = propscore)
stargazer(newdata2, title = "Regression", type = "text",
          dep.var.caption = "Dependent variable: Tests Scores",
          dep.var.labels.include = FALSE,
          out = "regression_results.html")

## 
## Regression
## ====================================================
##                     Dependent variable: Tests Scores
##                     --------------------------------
## ----------------------------------------------------
## catholic                      -179.890***           
##                                 (64.955)            
##                                                     
## w3income_1k150.0005           1,108.063**           
##                                (503.376)            
##                                                     
## w3income_1k17.5005               87.445             
##                                (703.019)            
##                                                     
## w3income_1k200.001            1,339.521**           
##                                (527.573)            
##                                                     
## w3income_1k22.5005              757.413             
##                                (595.525)            
##                                                     
## w3income_1k27.5005              333.257             
##                                (554.316)            
##                                                     
## w3income_1k32.5005              254.180             
##                                (529.948)            
##                                                     
## w3income_1k37.5005              399.418             
##                                (518.690)            
##                                                     
## w3income_1k45.0005              528.122             
##                                (507.930)            
##                                                     
## w3income_1k62.5005              707.855             
##                                (500.938)            
##                                                     
## w3income_1k7.5005               -55.085             
##                                (857.993)            
##                                                     
## w3income_1k87.5005              881.825*            
##                                (501.613)            
##                                                     
## p5numpla                         -3.311             
##                                (116.873)            
##                                                     
## w3momed_hsb                   -439.341***           
##                                 (83.333)            
##                                                     
## Constant                      1,931.012***          
##                                (514.208)            
##                                                     
## ----------------------------------------------------
## Observations                     1,860              
## R2                               0.061              
## Adjusted R2                      0.054              
## Residual Std. Error      1,400.684 (df = 1845)      
## F Statistic             8.603*** (df = 14; 1845)    
## ====================================================
## Note:                    *p<0.1; **p<0.05; ***p<0.01

HW11 (Lost)

Duflo’s Plumber Economist

2023-05-03