Causal_Inference

This code is part of the second assignment of Causal Inference. The analysis includes a sample of 332 institutions who have received the Hispanic Serving Designation as of the year 2017. 137 of them are recipients of an HSI related grant, while 195 have not received any type grant yet, remained eligible.

Introduction

Description of project

Uploading data downloaded from IPEDS

ipeds1<-read.csv("C:/Users/PCMcC/Documents/Causal Inference/Final Project/Data/other.csv")
ipeds2<-read.csv("C:/Users/PCMcC/Documents/Causal Inference/Final Project/Data/staff.csv")
hsi<-read.csv("C:/Users/PCMcC/Documents/Causal Inference/Final Project/Data/ipedsdata2017.csv")

#merging files with various information
ipeds3<-merge(ipeds1,ipeds2, by = "unitid")
ipeds4<-merge(ipeds3,hsi, by = "unitid")
ipeds4<-ipeds4%>%
  filter(complete.cases(.))
#removing unused databases
rm(ipeds1, ipeds2)

#names(ipeds4)

Descriptive Statististics

There are 137 institutions who have received an HSI grant included in this sample and 195 who are eligible but have not received a grant.

#Summary Statistics for HSI Institutions with Grant (treatment group)
ipedsHSIGrant<-filter(ipeds4, HSI_GRANT==1)
attach(ipedsHSIGrant)
vars<-cbind(HSI_GRANT,PELL_PERC, FT_UG, HISP_UG_TOTAL_FT, STUDENTFACULTYRATIO, TOTAL_ENROLLMENT, HISPGRADRATE, HISPSTAFF, DISABILITIES,WHITEGRADRATE,  TYPE) 
library(stargazer)
df <- data.frame(vars)
cols <- c('HSI_GRANT', 'PELL_PERC', "FT_UG", "HISP_UG_TOTAL_FT", "STUDENTFACULTYRATIO", "TOTAL_ENROLLMENT",  "HISPGRADRATE", "HISPSTAFF", "DISABILITIES", "WHITEGRADRATE",  "TYPE")
stargazer(df[, cols], type = "text", summary.stat = c("N","min", "p25", "median", "p75", "max", "mean", "sd")
)

## 
## =================================================================================
## Statistic            N  Min Pctl(25) Median Pctl(75)  Max      Mean     St. Dev. 
## ---------------------------------------------------------------------------------
## HSI_GRANT           137  1     1       1       1       1      1.000      0.000   
## PELL_PERC           137  9     30      37      47      86     39.416     14.614  
## FT_UG               137  7    122     184     282     382    198.489    103.391  
## HISP_UG_TOTAL_FT    137  5     83     191     279     377    186.226    110.490  
## STUDENTFACULTYRATIO 137  6     17      22      26      35     21.380     5.920   
## TOTAL_ENROLLMENT    137 389  3,995   9,652   19,000  71,551 13,515.280 12,178.210
## HISPGRADRATE        137  9     20      27      41      77     31.693     15.233  
## HISPSTAFF           137 11     99     170     370    5,191   358.686    588.807  
## DISABILITIES        137  2     2       2       3       3      2.482      0.502   
## WHITEGRADRATE       137  0     24      34      45      84     35.708     16.561  
## TYPE                137  2     3       3       4       4      3.204      0.655   
## ---------------------------------------------------------------------------------

#Summary Statistics for HSI Institutions with NO Grant (control group)
ipedsHSIGrant2<-filter(ipeds4, HSI_GRANT==0)
attach(ipedsHSIGrant2)
vars2<-cbind(HSI_GRANT,PELL_PERC, FT_UG, HISP_UG_TOTAL_FT, STUDENTFACULTYRATIO, TOTAL_ENROLLMENT, HISPGRADRATE, HISPSTAFF, DISABILITIES,WHITEGRADRATE,  TYPE) 
library(stargazer)
df <- data.frame(vars2)
cols <- c('HSI_GRANT', 'PELL_PERC', "FT_UG", "HISP_UG_TOTAL_FT", "STUDENTFACULTYRATIO", "TOTAL_ENROLLMENT",  "HISPGRADRATE", "HISPSTAFF", "DISABILITIES", "WHITEGRADRATE",  "TYPE")
stargazer(df[, cols], type = "text", summary.stat = c("N","min", "p25", "median", "p75", "max", "mean", "sd")
)

## 
## ===============================================================================
## Statistic            N  Min Pctl(25) Median Pctl(75)  Max     Mean    St. Dev. 
## -------------------------------------------------------------------------------
## HSI_GRANT           195  0     0       0       0       0      0.000     0.000  
## PELL_PERC           195  5    32.5     43      53      93    43.497    16.240  
## FT_UG               195  1    80.5    198     298     387    190.585   118.406 
## HISP_UG_TOTAL_FT    195  2    112     195    291.5    375    196.236   107.291 
## STUDENTFACULTYRATIO 195  6     14      18     23.5     38    19.241     6.363  
## TOTAL_ENROLLMENT    195 44   2,004   6,925   12,571  57,032 9,245.405 9,829.791
## HISPGRADRATE        195  0    20.5     30     46.5     88    34.138    19.216  
## HISPSTAFF           195  0    46.5    120    228.5   2,887   216.210   346.315 
## DISABILITIES        195  2     2       2       3       3      2.390     0.489  
## WHITEGRADRATE       195  0    22.5     35      53     100    38.549    20.571  
## TYPE                195  1     2       3       3       4      2.908     0.747  
## -------------------------------------------------------------------------------

Continuation of Descriptive Statistics

Below is our independent variable in a histogram

#Histogram of Hispanic Graduation Rate 
ggplot(ipeds4, aes(x=HISPGRADRATE))+
  geom_histogram(color="black", fill="purple", bins =  50) + labs(title="Graduation Rate of Hispanics in HSI Institutions for the Year 2017",x="Graduation Rate", y = "Number of Institutions in 2017")

#Histogram of Total Enrollment 
ggplot(ipeds4, aes(x=TOTAL_ENROLLMENT))+ 
  geom_histogram(color="black", fill="blue", alpha = .7, bins = 50) + labs(title="Total Student Enrollment of HSI Institutions in 2017",x="Total Student Enrollment", y = "Number of Institutions")

#Histogram of student Faculty Ratio 
ggplot(ipeds4, aes(x=STUDENTFACULTYRATIO))+ 
  geom_histogram(color="black", fill="blue", alpha = .7, bins = 50) + labs(title="Student Faculty Ratio of HSI Institutions in 2017",x="Student Faculty Ratio", y = "Number of Institutions")

#Histogram of Hispanic Graduation Rate by Type of All HSI Institutions
 i <- ipeds4
 levels(i$TYPE) <- c("2 Year Private", "4 Year Private", "2 Year Public", "4 Year Public")

 ggplot (i, aes(x = HISPGRADRATE)) +
geom_histogram(color="black", fill="orange", alpha = .7) + facet_wrap( TYPE ~.) + labs(title= "Hispanic Graduation Rate by Type of HSI Institutions", y="Number of Institutions in 2017", x="Graduation Rate")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Hispanic Graduation Rate by Institutions Recipients of HSI Grant vs. Eligible Institutions
 i <- ipeds4
levels(i$HSI_GRANT) <- c("No Award", "Awardee")

ggplot (i, aes(x = HISPGRADRATE)) +
  geom_histogram(color="black", fill="green", alpha = .7, bins = 50) + facet_wrap(HSI_GRANT) + labs(title= "Hispanic Graduation Rate by Awardees of HSI Grant vs HSI Grant Eligible", y="Number of Institutions in 2017", x="Graduation Rate") + facet_grid (HSI_GRANT ~. ,margins = TRUE)

Recoding of Variables

#Percentage of Disabilities in Institution more than 3 percent is coded as 1, less than 3 percent is coded as 0
ipeds4$disabilities<-ifelse (ipeds4$DISABILITIES== 'More than 3 percent', 1, 0 )

#Hispanic Graduation Rate, codes as 1 for universities with higher than 46% graduation rate for Hispanics and 0 anything lower than that. Using quantiles, I was able to establish that anything  higher than 45 or 75% as "high"
ipeds4$highhispgradrate<-ifelse (ipeds4$HISPGRADRATE>= 46, 1, 0 )

#WhiteGraduationRate, codes as 1 for universities with higher than 50% graduation rate for Whites and 0 anything lower than that. Using quantiles, I was able to establish that anything  higher than 50.2 or 75% as "high"
ipeds4$highwhitegradrate<-ifelse (ipeds4$WHITEGRADRATE>= 50.2, 1, 0 )

#HSI Grant already coded prior to upload, 1=grant awarded, 0=eligible without grant awarded

#Hispanic Staff (using quantiles, I selected universities with a Hispanic staff higher than 75%, coded as 1=high)
ipeds4$hispstaffrate<-ipeds4$HISPSTAFF/ipeds4$TOTALSTAFF*100
ipeds4$hispstaffratehigh<-ifelse(ipeds4$hispstaffrate>=27, 1, 0)
ipeds4$hispstaffratelow<-ifelse(ipeds4$hispstaffrate<=26, 1, 0)

#Type of Institution coded by 0,1 depending on the type of college Private vs. Public and 4 year vs. 2 year
ipeds4$pubfouryear<-ifelse(ipeds4$TYPE=='Pub 4yr', 1,0)
ipeds4$privfouryear<-ifelse(ipeds4$TYPE=='Pri 4yr', 1,0)
ipeds4$privtwoyear<-ifelse(ipeds4$TYPE=='Pri 2yr', 1,0)
ipeds4$pubtwoyear<-ifelse(ipeds4$TYPE=='Pub 2yr', 1,0)

#Pell Percentage of Recipients, coded as 1 for universities with higher than 50% pell recipient percentage  and 0 anything lower than that. Using quantiles, I was able to establish that anything  higher than 50.25 or 75% as "high"
ipeds4$highPellPerc<-ifelse (ipeds4$PELL_PERC>= 50.25, 1, 0 )

#Number of Students per Faculty Member (student faculty ratio) coded as 1 for universities with higher than 25 student faculty ratio and 0 anything lower than that. Using quantiles, I was able to establish that anything  higher than 25 or 75% as "high"
ipeds4$highsfratio<-ifelse (ipeds4$STUDENTFACULTYRATIO>=25, 1, 0)

#High Percentage of Hispanic undergraduate students. Coded as 1 for universities with higher than 56.65% of Hispanic undergraduate students and 0 for anything lower than that. Using quantiles, I was able to establish that anything  higher than 56.6% or 75% as "high"
ipeds4$highhispftstudentsperc<- ifelse (ipeds4$HISP_UG_TOTAL_PERC_FT>=56.6, 1, 0)

Step 1. Basic T Test of Independence

The following test shows that there is a significant difference betweenn institutiosn who received a grant vs. those that have not. On each of the covariates we observe a signficant p-value: hispanic graduation rate which is our outcome, type of institution, high student staff ratio, a high pell recipient percentage, total enrollment and high percentage of whites graduating from HSI institutions

#Institution Recipent of HSI Grant and Hispanic Graduation Rate
t.test(table(ipeds4$HSI_GRANT, ipeds4$HISPGRADRATE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$HISPGRADRATE)
## t = 13.654, df = 147, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.918562 2.567925
## sample estimates:
## mean of x 
##  2.243243

#Institution Recipent of HSI Grant and Type of School
t.test(table(ipeds4$HSI_GRANT, ipeds4$TYPE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$TYPE)
## t = 3.5368, df = 7, p-value = 0.00951
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  13.75432 69.24568
## sample estimates:
## mean of x 
##      41.5

#Institution Recipent of HSI Grant and Spefici Type of School
t.test(table(ipeds4$HSI_GRANT, ipeds4$privfouryear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$privfouryear)
## t = 2.9415, df = 3, p-value = 0.06043
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   -6.797259 172.797259
## sample estimates:
## mean of x 
##        83

t.test(table(ipeds4$HSI_GRANT, ipeds4$privtwoyear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$privtwoyear)
## t = 1.7156, df = 3, p-value = 0.1847
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -70.96266 236.96266
## sample estimates:
## mean of x 
##        83

t.test(table(ipeds4$HSI_GRANT, ipeds4$pubfouryear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$pubfouryear)
## t = 3.2568, df = 3, p-value = 0.04725
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##    1.894433 164.105567
## sample estimates:
## mean of x 
##        83

t.test(table(ipeds4$HSI_GRANT, ipeds4$pubtwoyear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$pubtwoyear)
## t = 9.5522, df = 3, p-value = 0.002434
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   55.34749 110.65251
## sample estimates:
## mean of x 
##        83

#Institution Recipent of HSI Grant and Student Faculty Ratio
t.test(table(ipeds4$HSI_GRANT, ipeds4$STUDENTFACULTYRATIO))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$STUDENTFACULTYRATIO)
## t = 10.476, df = 61, p-value = 2.924e-15
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  4.332703 6.376974
## sample estimates:
## mean of x 
##  5.354839

#Institution Recipent of HSI Grant and a High Percentage Hispanic Staff
t.test(table(ipeds4$HSI_GRANT, ipeds4$HISPGRADRATE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$HISPGRADRATE)
## t = 13.654, df = 147, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.918562 2.567925
## sample estimates:
## mean of x 
##  2.243243

#Institution Recipent of HSI Grant and Student Pell Recipient Percentage
t.test(table(ipeds4$HSI_GRANT, ipeds4$PELL_PERC))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$PELL_PERC)
## t = 13.9, df = 129, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  2.190324 2.917368
## sample estimates:
## mean of x 
##  2.553846

#Institution Recipent of HSI Grant and Student White Graduation Rate
t.test(table(ipeds4$HSI_GRANT, ipeds4$WHITEGRADRATE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$WHITEGRADRATE)
## t = 15.037, df = 157, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.825257 2.377275
## sample estimates:
## mean of x 
##  2.101266

#Institution Recipent of HSI Grant and Percentage of Black Students Enrolled
t.test(table(ipeds4$HSI_GRANT, ipeds4$BLACK_PERC))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$BLACK_PERC)
## t = 19.492, df = 347, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.8577602 1.0502858
## sample estimates:
## mean of x 
##  0.954023

#Institution Recipent of HSI Grant and Percentage of Total Minorities
t.test(table(ipeds4$HSI_GRANT, ipeds4$TOTAL_MIN_NOASIAN_PERC))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$TOTAL_MIN_NOASIAN_PERC)
## t = 23.052, df = 507, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.5978427 0.7092439
## sample estimates:
## mean of x 
## 0.6535433

#Institution Recipent of HSI Grant and Total Enrollment
t.test(table(ipeds4$HSI_GRANT, ipeds4$TOTAL_ENROLLMENT))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$TOTAL_ENROLLMENT)
## t = 25.52, df = 659, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.4643253 0.5417353
## sample estimates:
## mean of x 
## 0.5030303

#Institution Recipent of HSI Grant and high number of students with disabilities
t.test(table(ipeds4$HSI_GRANT, ipeds4$DISABILITIES))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$DISABILITIES)
## t = 2.3275, df = 9, p-value = 0.04493
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   0.9325485 65.4674515
## sample estimates:
## mean of x 
##      33.2

#Institution Recipent of HSI Grant and percentage of hispanic undergraduate full time students enrolled
t.test(table(ipeds4$HSI_GRANT, ipeds4$HISP_UG_TOTAL_PERC_FT))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$HISP_UG_TOTAL_PERC_FT)
## t = 21.876, df = 509, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.5925171 0.7094437
## sample estimates:
## mean of x 
## 0.6509804

#Institution Recipent of HSI Grant and percentage of hispanic staff
t.test(table(ipeds4$HSI_GRANT, ipeds4$HISPSTAFF))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HSI_GRANT, ipeds4$HISPSTAFF)
## t = 21.016, df = 489, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.6142052 0.7408968
## sample estimates:
## mean of x 
##  0.677551

The following includes a T-Test of significance test and the outcome varialbe which in this case is Hispanic Graduation Rate

#Institution Recipent of HSI Grant and Hispanic Graduation Rate
t.test(table(ipeds4$HISPGRADRATE, ipeds4$HISPGRADRATE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$HISPGRADRATE)
## t = 6.9894, df = 5475, p-value = 3.09e-12
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.04362316 0.07763323
## sample estimates:
## mean of x 
## 0.0606282

#Institution Recipent of HSI Grant and Type of School
t.test(table(ipeds4$HISPGRADRATE, ipeds4$TYPE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$TYPE)
## t = 10.078, df = 295, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.9025862 1.3406570
## sample estimates:
## mean of x 
##  1.121622

#Institution Recipent of HSI Grant and Spefici Type of School
t.test(table(ipeds4$HISPGRADRATE, ipeds4$privfouryear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$privfouryear)
## t = 9.693, df = 147, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.785884 2.700602
## sample estimates:
## mean of x 
##  2.243243

t.test(table(ipeds4$HISPGRADRATE, ipeds4$privtwoyear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$privtwoyear)
## t = 8.4889, df = 147, p-value = 2.086e-14
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.721013 2.765474
## sample estimates:
## mean of x 
##  2.243243

t.test(table(ipeds4$HISPGRADRATE, ipeds4$pubfouryear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$pubfouryear)
## t = 11.096, df = 147, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.843704 2.642782
## sample estimates:
## mean of x 
##  2.243243

t.test(table(ipeds4$HISPGRADRATE, ipeds4$pubtwoyear))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$pubtwoyear)
## t = 10.985, df = 147, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1.839681 2.646806
## sample estimates:
## mean of x 
##  2.243243

#Institution Recipent of HSI Grant and Student Faculty Ratio
t.test(table(ipeds4$HISPGRADRATE, ipeds4$STUDENTFACULTYRATIO))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$STUDENTFACULTYRATIO)
## t = 17.304, df = 2293, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.1283242 0.1611265
## sample estimates:
## mean of x 
## 0.1447254

#Institution Recipent of HSI Grant and  Percentage Hispanic Staff
t.test(table(ipeds4$HISPGRADRATE, ipeds4$HISPSTAFF))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$HISPSTAFF)
## t = 18.114, df = 18129, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.01633062 0.02029376
## sample estimates:
##  mean of x 
## 0.01831219

#Institution Recipent of HSI Grant and Student Pell Recipient Percentage
t.test(table(ipeds4$HISPGRADRATE, ipeds4$PELL_PERC))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$PELL_PERC)
## t = 17.922, df = 4809, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.06147268 0.07657306
## sample estimates:
##  mean of x 
## 0.06902287

#Institution Recipent of HSI Grant and Student White Graduation Rate
t.test(table(ipeds4$HISPGRADRATE, ipeds4$WHITEGRADRATE))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$WHITEGRADRATE)
## t = 16.743, df = 5845, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.05014141 0.06344053
## sample estimates:
##  mean of x 
## 0.05679097

#Institution Recipent of HSI Grant and Percentage of Black Students Enrolled
t.test(table(ipeds4$HISPGRADRATE, ipeds4$BLACK_PERC))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$BLACK_PERC)
## t = 18.181, df = 12875, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.02300448 0.02856433
## sample estimates:
##  mean of x 
## 0.02578441

#Institution Recipent of HSI Grant and Percentage of Total Minorities
t.test(table(ipeds4$HISPGRADRATE, ipeds4$TOTAL_MIN_NOASIAN_PERC))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$TOTAL_MIN_NOASIAN_PERC)
## t = 18.162, df = 18795, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.01575707 0.01956960
## sample estimates:
##  mean of x 
## 0.01766333

#Institution Recipent of HSI Grant and Total Enrollment
t.test(table(ipeds4$HISPGRADRATE, ipeds4$TOTAL_ENROLLMENT))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$TOTAL_ENROLLMENT)
## t = 18.346, df = 24419, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.01214287 0.01504796
## sample estimates:
##  mean of x 
## 0.01359541

#Institution Recipent of HSI Grant and number of students with disabilities
t.test(table(ipeds4$HISPGRADRATE, ipeds4$DISABILITIES))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$DISABILITIES)
## t = 10.3, df = 369, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.7259958 1.0685988
## sample estimates:
## mean of x 
## 0.8972973

#Institution Recipent of HSI Grant and percentage of hispanic undergraduate full time students enrolled
t.test(table(ipeds4$HISPGRADRATE, ipeds4$HISP_UG_TOTAL_PERC_FT))

## 
##  One Sample t-test
## 
## data:  table(ipeds4$HISPGRADRATE, ipeds4$HISP_UG_TOTAL_PERC_FT)
## t = 18.161, df = 18869, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.01569521 0.01949292
## sample estimates:
##  mean of x 
## 0.01759406

Step 2. Propensity score estimation

hsi_ps <- glm(HSI_GRANT ~ highhispgradrate + highwhitegradrate + highPellPerc + highhispftstudentsperc + pubtwoyear+ pubfouryear + disabilities + highsfratio + hispstaffratehigh, family = binomial(), data = ipeds4)
summary(hsi_ps)

## 
## Call:
## glm(formula = HSI_GRANT ~ highhispgradrate + highwhitegradrate + 
##     highPellPerc + highhispftstudentsperc + pubtwoyear + pubfouryear + 
##     disabilities + highsfratio + hispstaffratehigh, family = binomial(), 
##     data = ipeds4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6069  -1.0156  -0.6609   1.1874   2.0469  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)   
## (Intercept)            -0.85974    0.36619  -2.348  0.01889 * 
## highhispgradrate        0.11922    0.36868   0.323  0.74642   
## highwhitegradrate      -0.67245    0.38042  -1.768  0.07712 . 
## highPellPerc           -0.55068    0.31368  -1.756  0.07916 . 
## highhispftstudentsperc  0.58994    0.39581   1.490  0.13610   
## pubtwoyear              0.24754    0.38016   0.651  0.51495   
## pubfouryear             0.99796    0.35901   2.780  0.00544 **
## disabilities            0.32674    0.24093   1.356  0.17506   
## highsfratio             0.21901    0.28596   0.766  0.44375   
## hispstaffratehigh       0.02242    0.37415   0.060  0.95222   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.07  on 331  degrees of freedom
## Residual deviance: 420.77  on 322  degrees of freedom
## AIC: 440.77
## 
## Number of Fisher Scoring iterations: 4

#Calculation of the likelihood of receiving treatment or in this care receiving HSI grant.
prs_df <- data.frame(pr_score = predict(hsi_ps, type = "response"),
                     grant_recipient= hsi_ps$model$HSI_GRANT)
head(prs_df)

##    pr_score grant_recipient
## 1 0.3657449               0
## 2 0.4291149               0
## 3 0.3515578               1
## 4 0.3515578               1
## 5 0.3515578               1
## 6 0.4889933               1

2.1 Examining the region of common support

labs <- paste("HSI Grant Status:", c("Recipient", "Eligible"))
prs_df %>%
  mutate(Recipient = ifelse(grant_recipient == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white", bins=50) +
  facet_wrap(~Recipient) +
  xlab("Probability of Receiving HSI Grant") +
  theme_bw()

## Executing a matching algorithm

mod_match <- matchit(HSI_GRANT ~ highhispgradrate + highwhitegradrate + highPellPerc + highhispftstudentsperc + pubtwoyear+ pubfouryear + disabilities + highsfratio + hispstaffratehigh, method = "nearest",
                     data = ipeds4)

#tested optimal method but was unable to produce output
#mod_match2 <- matchit(HSI_GRANT ~ highhispgradrate + highwhitegradrate + highPellPerc + highhispftstudentsperc + pubtwoyear+ pubfouryear + disabilities + highsfratio + hispstaffratehigh, data = ipeds4,  method = "optimal", ratio = 2)

Checking the Match

#I create a dataframe containing the matched observations. The summary of it shows that there are 137 pairs of treated and control observations.
dta_m <- match.data(mod_match)
dim(dta_m)

## [1] 274  52

library("cobalt")

## Warning: package 'cobalt' was built under R version 3.5.3

## 
## Attaching package: 'cobalt'

## The following object is masked from 'package:MatchIt':
## 
##     lalonde

library("MatchIt")
library("ggplot2")
# Checking balance before and after matching:
bal.tab(mod_match, m.threshold = 0.1, un = TRUE)

## Call
##  matchit(formula = HSI_GRANT ~ highhispgradrate + highwhitegradrate + 
##     highPellPerc + highhispftstudentsperc + pubtwoyear + pubfouryear + 
##     disabilities + highsfratio + hispstaffratehigh, data = ipeds4, 
##     method = "nearest")
## 
## Balance Measures
##                            Type Diff.Un Diff.Adj    M.Threshold
## distance               Distance  0.6616   0.1519               
## highhispgradrate         Binary -0.0593   0.0146 Balanced, <0.1
## highwhitegradrate        Binary -0.1274   0.0000 Balanced, <0.1
## highPellPerc             Binary -0.0901   0.0511 Balanced, <0.1
## highhispftstudentsperc   Binary  0.1212   0.0803 Balanced, <0.1
## pubtwoyear               Binary  0.0508  -0.0584 Balanced, <0.1
## pubfouryear              Binary  0.1153   0.0584 Balanced, <0.1
## disabilities             Binary  0.0920   0.0073 Balanced, <0.1
## highsfratio              Binary  0.1233   0.0438 Balanced, <0.1
## hispstaffratehigh        Binary  0.0736   0.0584 Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1         9
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##                Variable Diff.Adj    M.Threshold
##  highhispftstudentsperc   0.0803 Balanced, <0.1
## 
## Sample sizes
##           Control Treated
## All           195     137
## Matched       137     137
## Unmatched      58       0

bal.plot(mod_match, var.name = "distance")

bal.plot(mod_match, var.name = "distance", mirror = TRUE, type = "histogram")

##Testing Matching The following test shows that none of the covariates are significant which is essentially what I am looking for otherwise the matching failed.

ecls_cov <- c('highhispgradrate', 'highwhitegradrate' , 'highPellPerc', 'highhispftstudentsperc', 'pubtwoyear', 'pubfouryear', 'disabilities' , 'highsfratio' , 'hispstaffratehigh')

lapply(ecls_cov, function(v) {
    t.test(dta_m[, v] ~ dta_m$HSI_GRANT)
})

## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -0.30705, df = 271.76, p-value = 0.759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1082004  0.0790033
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1824818       0.1970803 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = 0, df = 272, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09075167  0.09075167
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1751825       0.1751825 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -1.1203, df = 268.22, p-value = 0.2636
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.14089345  0.03870367
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1459854       0.1970803 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -1.4789, df = 269.92, p-value = 0.1403
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1871812  0.0265973
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2408759       0.3211679 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = 0.97225, df = 271.94, p-value = 0.3318
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0598495  0.1766378
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5912409       0.5328467 
## 
## 
## [[6]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -1.0465, df = 271.23, p-value = 0.2963
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.16825154  0.05146322
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2773723       0.3357664 
## 
## 
## [[7]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -0.1205, df = 272, p-value = 0.9042
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1265530  0.1119544
## sample estimates:
## mean in group 0 mean in group 1 
##       0.4744526       0.4817518 
## 
## 
## [[8]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -0.78417, df = 271.57, p-value = 0.4336
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.15374910  0.06615786
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2846715       0.3284672 
## 
## 
## [[9]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$HSI_GRANT
## t = -1.0869, df = 270.73, p-value = 0.2781
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.16416810  0.04737978
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2408759       0.2992701

Estimate the Treatment Effects

glm_treat <- glm(highhispgradrate ~ HSI_GRANT + highhispgradrate + highwhitegradrate + highPellPerc + highhispftstudentsperc + pubtwoyear+ pubfouryear + disabilities + highsfratio + hispstaffratehigh, data = dta_m)

## Warning in model.matrix.default(mt, mf, contrasts): the response appeared
## on the right-hand side and was dropped

## Warning in model.matrix.default(mt, mf, contrasts): problem with term 2 in
## model.matrix: no columns are assigned

summary(glm_treat)

## 
## Call:
## glm(formula = highhispgradrate ~ HSI_GRANT + highhispgradrate + 
##     highwhitegradrate + highPellPerc + highhispftstudentsperc + 
##     pubtwoyear + pubfouryear + disabilities + highsfratio + hispstaffratehigh, 
##     data = dta_m)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.77480  -0.14820  -0.03895   0.03295   0.98518  
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.155301   0.068718   2.260   0.0246 *  
## HSI_GRANT               0.006455   0.038634   0.167   0.8674    
## highwhitegradrate       0.457773   0.058761   7.790 1.53e-13 ***
## highPellPerc            0.063380   0.058468   1.084   0.2793    
## highhispftstudentsperc -0.045953   0.064970  -0.707   0.4800    
## pubtwoyear             -0.140480   0.067974  -2.067   0.0397 *  
## pubfouryear             0.020804   0.064378   0.323   0.7468    
## disabilities            0.077538   0.039515   1.962   0.0508 .  
## highsfratio            -0.047771   0.045358  -1.053   0.2932    
## hispstaffratehigh       0.012044   0.061703   0.195   0.8454    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1006892)
## 
##     Null deviance: 42.131  on 273  degrees of freedom
## Residual deviance: 26.582  on 264  degrees of freedom
## AIC: 160.36
## 
## Number of Fisher Scoring iterations: 2

# Estimating treatment effects
with(dta_m, t.test(highhispgradrate~ HSI_GRANT))

## 
##  Welch Two Sample t-test
## 
## data:  highhispgradrate by HSI_GRANT
## t = -0.30705, df = 271.76, p-value = 0.759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1082004  0.0790033
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1824818       0.1970803

lm_treat1 <- lm(highhispgradrate~ HSI_GRANT, data = dta_m)
summary(lm_treat1)

## 
## Call:
## lm(formula = highhispgradrate ~ HSI_GRANT, data = dta_m)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1971 -0.1971 -0.1825 -0.1825  0.8175 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.18248    0.03362   5.428 1.26e-07 ***
## HSI_GRANT    0.01460    0.04754   0.307    0.759    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3935 on 272 degrees of freedom
## Multiple R-squared:  0.0003465,  Adjusted R-squared:  -0.003329 
## F-statistic: 0.09428 on 1 and 272 DF,  p-value: 0.759

This analysis shows the marginal effects of all HSI institutions as observed in the logistic regression model. We see that the treatment of having an HSI grant as an institution does not impact the graduation rates of Hispanics.

Causal_Inference_HW2

Paulina Cano

April 1, 2019