#install.packages(c("MatchIt","dplyr","ggplot2"))
#install.packages("skimr")
library(MatchIt)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(skimr)
setwd("C:/Users/charl/OneDrive/Desktop/Data Analysis For Social Science in R")
library(readr)
Labdata <- read_csv("Labdata.csv")
## New names:
## Rows: 614 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, race dbl (8): treat, age, educ, married, nodegree, re74, re75, re78
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
View(Labdata)
dim(Labdata)
## [1] 614 10
names(Labdata)
## [1] "...1" "treat" "age" "educ" "race" "married"
## [7] "nodegree" "re74" "re75" "re78"
The initial dataset used for this labwork consist of 10 columns and 614 observations.There are 2 character type variables and 8 numeric variables in the dataset.The mean age reported from the initial dataset is 27.36 and a standard devaition of 9.8811872.As shown below, there are no missing cases within the initial dataset. Below is a detailed summary generated using the R statistical software.
skim(Labdata)
| Name | Labdata |
| Number of rows | 614 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| …1 | 0 | 1 | 4 | 7 | 0 | 614 | 0 |
| race | 0 | 1 | 5 | 6 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| treat | 0 | 1 | 0.30 | 0.46 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| age | 0 | 1 | 27.36 | 9.88 | 16 | 20.00 | 25.00 | 32.00 | 55.00 | ▇▅▂▂▁ |
| educ | 0 | 1 | 10.27 | 2.63 | 0 | 9.00 | 11.00 | 12.00 | 18.00 | ▁▂▆▇▁ |
| married | 0 | 1 | 0.42 | 0.49 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| nodegree | 0 | 1 | 0.63 | 0.48 | 0 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| re74 | 0 | 1 | 4557.55 | 6477.96 | 0 | 0.00 | 1042.33 | 7888.50 | 35040.07 | ▇▂▁▁▁ |
| re75 | 0 | 1 | 2184.94 | 3295.68 | 0 | 0.00 | 601.55 | 3248.99 | 25142.24 | ▇▁▁▁▁ |
| re78 | 0 | 1 | 6792.83 | 7470.73 | 0 | 238.28 | 4759.02 | 10893.59 | 60307.93 | ▇▂▁▁▁ |
Visualization of the the variables such as treat, age ,married, nodegree and other variables are conducted to see how they are distributed.
##treat and race
ggplot(Labdata, aes(x=treat, fill= race )) + geom_bar()
##age
ggplot(Labdata, aes(x=age )) + geom_bar()
##married
ggplot(Labdata, aes(x=married )) + geom_bar()
## nodegree
ggplot(Labdata, aes(x=nodegree, fill= race )) + geom_bar()
### re74
ggplot(Labdata, aes(x=re74)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### re75
ggplot(Labdata, aes(x=re75)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### re78
ggplot(Labdata, aes(x=re78)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## race
Labdata |> ggplot(aes(race)) + geom_bar()
## Change the raw data into a wide data frame based on "race"
Labdata$white <- ifelse(Labdata$race == 'white', 1, 0)
Labdata$black <- ifelse(Labdata$race == 'black', 1, 0)
Labdata$hispan <-ifelse(Labdata$race == 'hispan', 1, 0)
Labdata <- subset(Labdata,select=-c(race))
colnames(Labdata)[1]= "X"
head(Labdata)
## # A tibble: 6 × 12
## X treat age educ married nodegree re74 re75 re78 white black hispan
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NSW1 1 37 11 1 1 0 0 9930. 0 1 0
## 2 NSW2 1 22 9 0 1 0 0 3596. 0 0 1
## 3 NSW3 1 30 12 0 0 0 0 24909. 0 1 0
## 4 NSW4 1 27 11 0 1 0 0 7506. 0 1 0
## 5 NSW5 1 33 8 0 1 0 0 290. 0 1 0
## 6 NSW6 1 22 9 0 1 0 0 4056. 0 1 0
skim(Labdata)
| Name | Labdata |
| Number of rows | 614 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| X | 0 | 1 | 4 | 7 | 0 | 614 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| treat | 0 | 1 | 0.30 | 0.46 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| age | 0 | 1 | 27.36 | 9.88 | 16 | 20.00 | 25.00 | 32.00 | 55.00 | ▇▅▂▂▁ |
| educ | 0 | 1 | 10.27 | 2.63 | 0 | 9.00 | 11.00 | 12.00 | 18.00 | ▁▂▆▇▁ |
| married | 0 | 1 | 0.42 | 0.49 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| nodegree | 0 | 1 | 0.63 | 0.48 | 0 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| re74 | 0 | 1 | 4557.55 | 6477.96 | 0 | 0.00 | 1042.33 | 7888.50 | 35040.07 | ▇▂▁▁▁ |
| re75 | 0 | 1 | 2184.94 | 3295.68 | 0 | 0.00 | 601.55 | 3248.99 | 25142.24 | ▇▁▁▁▁ |
| re78 | 0 | 1 | 6792.83 | 7470.73 | 0 | 238.28 | 4759.02 | 10893.59 | 60307.93 | ▇▂▁▁▁ |
| white | 0 | 1 | 0.49 | 0.50 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| black | 0 | 1 | 0.40 | 0.49 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| hispan | 0 | 1 | 0.12 | 0.32 | 0 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
## Standardizing re78 (outcome variable)
Labdata$re78_std <- scale(Labdata$re78)
head(Labdata)
## # A tibble: 6 × 13
## X treat age educ married nodegree re74 re75 re78 white black hispan
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NSW1 1 37 11 1 1 0 0 9930. 0 1 0
## 2 NSW2 1 22 9 0 1 0 0 3596. 0 0 1
## 3 NSW3 1 30 12 0 0 0 0 24909. 0 1 0
## 4 NSW4 1 27 11 0 1 0 0 7506. 0 1 0
## 5 NSW5 1 33 8 0 1 0 0 290. 0 1 0
## 6 NSW6 1 22 9 0 1 0 0 4056. 0 1 0
## # … with 1 more variable: re78_std <dbl[,1]>
boxplot(re78_std ~ treat, data= Labdata,
xlab= "treat",
ylab= "Standardized Real Earning in 78")
Labdata %>%
group_by(treat) %>%
summarise(treatment_status= n(),
mean_re78 = mean(re78_std),
std_error = sd(re78_std)/ sqrt(treatment_status))
## # A tibble: 2 × 4
## treat treatment_status mean_re78 std_error
## <dbl> <int> <dbl> <dbl>
## 1 0 429 0.0256 0.0471
## 2 1 185 -0.0594 0.0774
Labdata %>%
mutate(test= (re78 - mean(re78)) / sd(re78)) %>%
group_by(treat) %>%
summarise(mean_re78 = mean(test))
## # A tibble: 2 × 2
## treat mean_re78
## <dbl> <dbl>
## 1 0 0.0256
## 2 1 -0.0594
#Use t.test to check the difference-in-means: pre-treatment #covariates (confounding variables) with treatment (variable)
with(Labdata, t.test(re78_std ~ treat))
##
## Welch Two Sample t-test
##
## data: re78_std by treat
## t = 0.93773, df = 326.41, p-value = 0.3491
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.09332313 0.26332690
## sample estimates:
## mean in group 0 mean in group 1
## 0.02561132 -0.05939057
Labdata_cov <- c('age','educ','black','hispan','married','nodegree')
##Difference in mean pre-treatment covariates
library(dplyr)
Labdata %>%
group_by(treat) %>%
select(one_of(Labdata_cov)) %>%
summarise_all(funs(mean(., na.rm = T)))
## Adding missing grouping variables: `treat`
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## # A tibble: 2 × 7
## treat age educ black hispan married nodegree
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 28.0 10.2 0.203 0.142 0.513 0.597
## 2 1 25.8 10.3 0.843 0.0595 0.189 0.708
#outcome variable with treatment (variable)
with(Labdata, t.test(re78_std ~ treat))
##
## Welch Two Sample t-test
##
## data: re78_std by treat
## t = 0.93773, df = 326.41, p-value = 0.3491
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.09332313 0.26332690
## sample estimates:
## mean in group 0 mean in group 1
## 0.02561132 -0.05939057
From the above statistical output, it can be suggested that the p-values is greater than 0.05 and not close to zero , hence making the null hypothesis not to be rejected. This means that the mean of standard real earning is not significantly different.
#pre-treatment covariates (confounding variables) with treatment variable
with(Labdata, t.test(age~ treat))
##
## Welch Two Sample t-test
##
## data: age by treat
## t = 2.9911, df = 510.57, p-value = 0.002914
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.7598127 3.6683610
## sample estimates:
## mean in group 0 mean in group 1
## 28.03030 25.81622
with(Labdata, t.test(educ~ treat))
##
## Welch Two Sample t-test
##
## data: educ by treat
## t = -0.54676, df = 485.37, p-value = 0.5848
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.5076687 0.2866393
## sample estimates:
## mean in group 0 mean in group 1
## 10.23543 10.34595
with(Labdata, t.test(black~ treat))
##
## Welch Two Sample t-test
##
## data: black by treat
## t = -19.344, df = 382.86, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.7055419 -0.5753502
## sample estimates:
## mean in group 0 mean in group 1
## 0.2027972 0.8432432
with(Labdata, t.test(hispan~ treat))
##
## Welch Two Sample t-test
##
## data: hispan by treat
## t = 3.4091, df = 501.34, p-value = 0.0007042
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.03505288 0.13041048
## sample estimates:
## mean in group 0 mean in group 1
## 0.14219114 0.05945946
with(Labdata, t.test(nodegree~ treat))
##
## Welch Two Sample t-test
##
## data: nodegree by treat
## t = -2.7127, df = 374.01, p-value = 0.006982
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.19210040 -0.03064263
## sample estimates:
## mean in group 0 mean in group 1
## 0.5967366 0.7081081
with(Labdata, t.test(married~ treat))
##
## Welch Two Sample t-test
##
## data: married by treat
## t = 8.5961, df = 439.29, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.2496379 0.3976248
## sample estimates:
## mean in group 0 mean in group 1
## 0.5128205 0.1891892
##Use glm() to estimate propensity score (probability)
m_ps <- glm(treat ~ age + educ + black + hispan + nodegree + married, family=binomial(), data= Labdata)
summary(m_ps)
##
## Call:
## glm(formula = treat ~ age + educ + black + hispan + nodegree +
## married, family = binomial(), data = Labdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7709 -0.4606 -0.2963 0.7766 2.6384
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.67874 1.02120 -4.582 4.61e-06 ***
## age 0.01030 0.01329 0.775 0.43857
## educ 0.15161 0.06568 2.308 0.02098 *
## black 3.12657 0.28514 10.965 < 2e-16 ***
## hispan 0.99947 0.42191 2.369 0.01784 *
## nodegree 0.78719 0.33507 2.349 0.01881 *
## married -0.92969 0.27128 -3.427 0.00061 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 751.49 on 613 degrees of freedom
## Residual deviance: 494.70 on 607 degrees of freedom
## AIC: 508.7
##
## Number of Fisher Scoring iterations: 5
prs_df <- data.frame(pr_score = predict(m_ps, type= "response"),
treat = m_ps$model$treat)
prs_df### propensity score and the treatment status
## pr_score treat
## 1 0.58760225 1
## 2 0.21399311 1
## 3 0.64016828 1
## 4 0.76508840 1
## 5 0.68734337 1
## 6 0.69552346 1
## 7 0.62340207 1
## 8 0.77421538 1
## 9 0.75029124 1
## 10 0.03079057 1
## 11 0.68894224 1
## 12 0.65362946 1
## 13 0.65323034 1
## 14 0.52484866 1
## 15 0.61570543 1
## 16 0.72046942 1
## 17 0.66748129 1
## 18 0.72868884 1
## 19 0.66353055 1
## 20 0.63062650 1
## 21 0.75760489 1
## 22 0.10583814 1
## 23 0.10567147 1
## 24 0.75949086 1
## 25 0.71839092 1
## 26 0.56750619 1
## 27 0.76136678 1
## 28 0.25012930 1
## 29 0.71630304 1
## 30 0.75949086 1
## 31 0.71630304 1
## 32 0.58315483 1
## 33 0.55736985 1
## 34 0.61612236 1
## 35 0.62822468 1
## 36 0.73162583 1
## 37 0.56230268 1
## 38 0.39515851 1
## 39 0.49797505 1
## 40 0.73071983 1
## 41 0.61855490 1
## 42 0.08855667 1
## 43 0.65089413 1
## 44 0.08139694 1
## 45 0.75188687 1
## 46 0.55736985 1
## 47 0.65089413 1
## 48 0.68451172 1
## 49 0.56230268 1
## 50 0.62340207 1
## 51 0.67617353 1
## 52 0.57255379 1
## 53 0.74802499 1
## 54 0.62822468 1
## 55 0.57003181 1
## 56 0.71630304 1
## 57 0.72721338 1
## 58 0.33938497 1
## 59 0.51829034 1
## 60 0.11948986 1
## 61 0.65089413 1
## 62 0.67842415 1
## 63 0.75241758 1
## 64 0.76508840 1
## 65 0.53496517 1
## 66 0.70844720 1
## 67 0.76508840 1
## 68 0.06956293 1
## 69 0.38385095 1
## 70 0.67203638 1
## 71 0.70631574 1
## 72 0.70939834 1
## 73 0.58177011 1
## 74 0.62822468 1
## 75 0.73308619 1
## 76 0.12277845 1
## 77 0.62098150 1
## 78 0.69333841 1
## 79 0.78828971 1
## 80 0.75570889 1
## 81 0.62822468 1
## 82 0.61123991 1
## 83 0.07812550 1
## 84 0.66748129 1
## 85 0.67391473 1
## 86 0.78483229 1
## 87 0.19118372 1
## 88 0.76323263 1
## 89 0.06641267 1
## 90 0.66937289 1
## 91 0.57255379 1
## 92 0.71630304 1
## 93 0.76136678 1
## 94 0.61855490 1
## 95 0.79508211 1
## 96 0.06901255 1
## 97 0.68673124 1
## 98 0.66811285 1
## 99 0.73274119 1
## 100 0.22999382 1
## 101 0.73071983 1
## 102 0.73475288 1
## 103 0.76136678 1
## 104 0.74802499 1
## 105 0.74996090 1
## 106 0.73929621 1
## 107 0.66748129 1
## 108 0.68451172 1
## 109 0.77059500 1
## 110 0.52228006 1
## 111 0.69114465 1
## 112 0.20546062 1
## 113 0.61612236 1
## 114 0.74802499 1
## 115 0.40504435 1
## 116 0.06641267 1
## 117 0.63302188 1
## 118 0.61612236 1
## 119 0.72046942 1
## 120 0.62340207 1
## 121 0.70454221 1
## 122 0.71839092 1
## 123 0.68894224 1
## 124 0.03358729 1
## 125 0.11522373 1
## 126 0.48696898 1
## 127 0.62098150 1
## 128 0.51456773 1
## 129 0.16478070 1
## 130 0.75188687 1
## 131 0.68451172 1
## 132 0.76693406 1
## 133 0.55990876 1
## 134 0.75188687 1
## 135 0.55482794 1
## 136 0.71478659 1
## 137 0.03319803 1
## 138 0.71839092 1
## 139 0.63779287 1
## 140 0.12389182 1
## 141 0.69986714 1
## 142 0.62822468 1
## 143 0.73071983 1
## 144 0.71536662 1
## 145 0.07308941 1
## 146 0.74996090 1
## 147 0.65555919 1
## 148 0.76508840 1
## 149 0.55990876 1
## 150 0.72253851 1
## 151 0.73874713 1
## 152 0.62581650 1
## 153 0.65555919 1
## 154 0.62340207 1
## 155 0.52555857 1
## 156 0.66289466 1
## 157 0.68673124 1
## 158 0.61855490 1
## 159 0.73675488 1
## 160 0.66019446 1
## 161 0.69552346 1
## 162 0.54008477 1
## 163 0.51713946 1
## 164 0.74072962 1
## 165 0.63779287 1
## 166 0.10158152 1
## 167 0.12364229 1
## 168 0.71478659 1
## 169 0.51199523 1
## 170 0.69333841 1
## 171 0.71630304 1
## 172 0.40256532 1
## 173 0.21054951 1
## 174 0.72046942 1
## 175 0.73475288 1
## 176 0.76693406 1
## 177 0.07155338 1
## 178 0.77601032 1
## 179 0.06705399 1
## 180 0.25401196 1
## 181 0.42001629 1
## 182 0.47455702 1
## 183 0.50756003 1
## 184 0.46969468 1
## 185 0.57758632 1
## 186 0.02988193 0
## 187 0.02871074 0
## 188 0.05092178 0
## 189 0.06174451 0
## 190 0.48182539 0
## 191 0.51270652 0
## 192 0.03048476 0
## 193 0.61612236 0
## 194 0.11244981 0
## 195 0.05197976 0
## 196 0.02555614 0
## 197 0.04044053 0
## 198 0.03703217 0
## 199 0.02958486 0
## 200 0.46686071 0
## 201 0.03263214 0
## 202 0.21923502 0
## 203 0.04037220 0
## 204 0.03821442 0
## 205 0.04325918 0
## 206 0.11633379 0
## 207 0.06697793 0
## 208 0.03172595 0
## 209 0.50054936 0
## 210 0.06056214 0
## 211 0.02983088 0
## 212 0.51013354 0
## 213 0.07512548 0
## 214 0.03767030 0
## 215 0.02958486 0
## 216 0.18516118 0
## 217 0.04531759 0
## 218 0.02899929 0
## 219 0.04129022 0
## 220 0.02871074 0
## 221 0.02899929 0
## 222 0.03666671 0
## 223 0.02842497 0
## 224 0.04295187 0
## 225 0.04949892 0
## 226 0.39270005 0
## 227 0.02175863 0
## 228 0.02842497 0
## 229 0.02730928 0
## 230 0.10741283 0
## 231 0.38779952 0
## 232 0.04936658 0
## 233 0.40009126 0
## 234 0.02958486 0
## 235 0.02703707 0
## 236 0.06827646 0
## 237 0.03292519 0
## 238 0.05156078 0
## 239 0.05197976 0
## 240 0.03079057 0
## 241 0.03523926 0
## 242 0.05034209 0
## 243 0.03018188 0
## 244 0.42755967 0
## 245 0.02924059 0
## 246 0.03230959 0
## 247 0.03804537 0
## 248 0.03919108 0
## 249 0.05105954 0
## 250 0.02018487 0
## 251 0.19600623 0
## 252 0.03464083 0
## 253 0.09000014 0
## 254 0.79152742 0
## 255 0.04667302 0
## 256 0.06417421 0
## 257 0.07102397 0
## 258 0.07505017 0
## 259 0.04531759 0
## 260 0.03852935 0
## 261 0.16907643 0
## 262 0.09084705 0
## 263 0.09818370 0
## 264 0.13418061 0
## 265 0.03319803 0
## 266 0.02786170 0
## 267 0.07363299 0
## 268 0.04407389 0
## 269 0.30083235 0
## 270 0.03301595 0
## 271 0.10158152 0
## 272 0.03676740 0
## 273 0.06514711 0
## 274 0.08609474 0
## 275 0.04713334 0
## 276 0.12520676 0
## 277 0.05083667 0
## 278 0.10443570 0
## 279 0.57576753 0
## 280 0.07363299 0
## 281 0.77471264 0
## 282 0.39762229 0
## 283 0.38779952 0
## 284 0.63541070 0
## 285 0.14933355 0
## 286 0.01330297 0
## 287 0.06452278 0
## 288 0.06641267 0
## 289 0.06479540 0
## 290 0.06641267 0
## 291 0.05714097 0
## 292 0.02676751 0
## 293 0.03975742 0
## 294 0.04237076 0
## 295 0.61612236 0
## 296 0.15464186 0
## 297 0.08170128 0
## 298 0.10364014 0
## 299 0.08943579 0
## 300 0.04985207 0
## 301 0.02958486 0
## 302 0.02786170 0
## 303 0.75188687 0
## 304 0.03464083 0
## 305 0.03401656 0
## 306 0.08093203 0
## 307 0.05034064 0
## 308 0.02842497 0
## 309 0.06452278 0
## 310 0.05399191 0
## 311 0.02899929 0
## 312 0.11522373 0
## 313 0.02894970 0
## 314 0.02786170 0
## 315 0.03646633 0
## 316 0.06514711 0
## 317 0.04443488 0
## 318 0.03682987 0
## 319 0.74996090 0
## 320 0.04314150 0
## 321 0.26534086 0
## 322 0.06641267 0
## 323 0.05197976 0
## 324 0.05936878 0
## 325 0.68894224 0
## 326 0.07730027 0
## 327 0.06758988 0
## 328 0.02786170 0
## 329 0.03141114 0
## 330 0.06452278 0
## 331 0.04048263 0
## 332 0.04290731 0
## 333 0.04487416 0
## 334 0.08832762 0
## 335 0.26937482 0
## 336 0.06514711 0
## 337 0.03604389 0
## 338 0.06641267 0
## 339 0.06835388 0
## 340 0.03740114 0
## 341 0.07572989 0
## 342 0.61570543 0
## 343 0.71839092 0
## 344 0.63541070 0
## 345 0.09718052 0
## 346 0.19030572 0
## 347 0.02080498 0
## 348 0.04253056 0
## 349 0.16033560 0
## 350 0.02988193 0
## 351 0.03073802 0
## 352 0.05083667 0
## 353 0.02899929 0
## 354 0.03919222 0
## 355 0.08787020 0
## 356 0.09971737 0
## 357 0.08101273 0
## 358 0.68673124 0
## 359 0.02842497 0
## 360 0.03222070 0
## 361 0.07501228 0
## 362 0.11627767 0
## 363 0.06705399 0
## 364 0.74802499 0
## 365 0.10347652 0
## 366 0.02814197 0
## 367 0.11522373 0
## 368 0.09971737 0
## 369 0.09971737 0
## 370 0.10064561 0
## 371 0.30360421 0
## 372 0.74802499 0
## 373 0.23131591 0
## 374 0.63062650 0
## 375 0.02504825 0
## 376 0.61123991 0
## 377 0.07792070 0
## 378 0.07632946 0
## 379 0.04667302 0
## 380 0.02012864 0
## 381 0.74802499 0
## 382 0.05248953 0
## 383 0.08787020 0
## 384 0.25401196 0
## 385 0.06521319 0
## 386 0.03604495 0
## 387 0.09523658 0
## 388 0.07525015 0
## 389 0.03469982 0
## 390 0.09879674 0
## 391 0.06452278 0
## 392 0.19030572 0
## 393 0.08609474 0
## 394 0.09879674 0
## 395 0.05866913 0
## 396 0.06577707 0
## 397 0.03190115 0
## 398 0.07489008 0
## 399 0.04985207 0
## 400 0.04998710 0
## 401 0.07026804 0
## 402 0.76908274 0
## 403 0.68451172 0
## 404 0.06503986 0
## 405 0.11154668 0
## 406 0.06514711 0
## 407 0.03301595 0
## 408 0.03750377 0
## 409 0.13916818 0
## 410 0.18334855 0
## 411 0.76001051 0
## 412 0.25936276 0
## 413 0.54193484 0
## 414 0.06514711 0
## 415 0.10654530 0
## 416 0.74802499 0
## 417 0.03626679 0
## 418 0.02703707 0
## 419 0.64895225 0
## 420 0.03353013 0
## 421 0.02899929 0
## 422 0.08855667 0
## 423 0.51341775 0
## 424 0.26134570 0
## 425 0.22949003 0
## 426 0.15781646 0
## 427 0.08347734 0
## 428 0.08787020 0
## 429 0.03777364 0
## 430 0.07645381 0
## 431 0.03429813 0
## 432 0.06630348 0
## 433 0.10664857 0
## 434 0.01520582 0
## 435 0.03529923 0
## 436 0.08609474 0
## 437 0.06503986 0
## 438 0.69333841 0
## 439 0.02758415 0
## 440 0.12817496 0
## 441 0.65089413 0
## 442 0.03254335 0
## 443 0.10064561 0
## 444 0.02814197 0
## 445 0.39762229 0
## 446 0.08609474 0
## 447 0.03222070 0
## 448 0.09568108 0
## 449 0.18181174 0
## 450 0.18489542 0
## 451 0.68451172 0
## 452 0.08787020 0
## 453 0.68228375 0
## 454 0.65827716 0
## 455 0.08609474 0
## 456 0.25936276 0
## 457 0.08654379 0
## 458 0.04854032 0
## 459 0.11522373 0
## 460 0.07224049 0
## 461 0.04229931 0
## 462 0.72046942 0
## 463 0.61326608 0
## 464 0.02953430 0
## 465 0.09879674 0
## 466 0.09971737 0
## 467 0.09971737 0
## 468 0.03319803 0
## 469 0.02958486 0
## 470 0.09971737 0
## 471 0.04295187 0
## 472 0.04314150 0
## 473 0.05892590 0
## 474 0.10158152 0
## 475 0.11522373 0
## 476 0.57675089 0
## 477 0.03286911 0
## 478 0.41500782 0
## 479 0.04985207 0
## 480 0.25936276 0
## 481 0.03646633 0
## 482 0.04827830 0
## 483 0.04867197 0
## 484 0.14137427 0
## 485 0.43653545 0
## 486 0.02899929 0
## 487 0.04759797 0
## 488 0.03719690 0
## 489 0.03401656 0
## 490 0.10163136 0
## 491 0.08727488 0
## 492 0.03469982 0
## 493 0.54973563 0
## 494 0.02758415 0
## 495 0.05133584 0
## 496 0.03940719 0
## 497 0.09208529 0
## 498 0.07521005 0
## 499 0.11948986 0
## 500 0.07308941 0
## 501 0.06514711 0
## 502 0.04487416 0
## 503 0.03172595 0
## 504 0.03729878 0
## 505 0.03319803 0
## 506 0.05769828 0
## 507 0.06770104 0
## 508 0.11137213 0
## 509 0.03880630 0
## 510 0.10064561 0
## 511 0.71326527 0
## 512 0.18361259 0
## 513 0.03426223 0
## 514 0.10080525 0
## 515 0.75029124 0
## 516 0.39762229 0
## 517 0.08704837 0
## 518 0.58427345 0
## 519 0.20884301 0
## 520 0.61368399 0
## 521 0.07572989 0
## 522 0.61368399 0
## 523 0.02197889 0
## 524 0.22100271 0
## 525 0.06770104 0
## 526 0.21054951 0
## 527 0.10252515 0
## 528 0.05826069 0
## 529 0.03268878 0
## 530 0.68451172 0
## 531 0.05363842 0
## 532 0.03489085 0
## 533 0.15918990 0
## 534 0.03295973 0
## 535 0.03584764 0
## 536 0.09971737 0
## 537 0.62581650 0
## 538 0.11627767 0
## 539 0.39609852 0
## 540 0.68894224 0
## 541 0.43962921 0
## 542 0.11522373 0
## 543 0.15895414 0
## 544 0.05034209 0
## 545 0.03405221 0
## 546 0.07738708 0
## 547 0.07433847 0
## 548 0.03794153 0
## 549 0.03646633 0
## 550 0.07489008 0
## 551 0.20378472 0
## 552 0.71839092 0
## 553 0.59506735 0
## 554 0.07489008 0
## 555 0.68228375 0
## 556 0.09208529 0
## 557 0.69114465 0
## 558 0.61612236 0
## 559 0.74802499 0
## 560 0.60986329 0
## 561 0.65089413 0
## 562 0.08609474 0
## 563 0.04759797 0
## 564 0.07945325 0
## 565 0.42252678 0
## 566 0.65362946 0
## 567 0.05442951 0
## 568 0.06514711 0
## 569 0.12277517 0
## 570 0.15918990 0
## 571 0.61612236 0
## 572 0.60056478 0
## 573 0.74996090 0
## 574 0.74802499 0
## 575 0.04136004 0
## 576 0.09174614 0
## 577 0.71630304 0
## 578 0.03236474 0
## 579 0.03703217 0
## 580 0.09000014 0
## 581 0.04780736 0
## 582 0.08609474 0
## 583 0.65089413 0
## 584 0.70023721 0
## 585 0.69612601 0
## 586 0.71630304 0
## 587 0.07489008 0
## 588 0.03574957 0
## 589 0.09415642 0
## 590 0.10080525 0
## 591 0.71420585 0
## 592 0.06770104 0
## 593 0.60986329 0
## 594 0.09971737 0
## 595 0.13937946 0
## 596 0.71839092 0
## 597 0.69273401 0
## 598 0.04188412 0
## 599 0.05695681 0
## 600 0.03021364 0
## 601 0.64960054 0
## 602 0.18028497 0
## 603 0.02899929 0
## 604 0.59064436 0
## 605 0.27094058 0
## 606 0.04556456 0
## 607 0.08609474 0
## 608 0.73675488 0
## 609 0.09122799 0
## 610 0.11522373 0
## 611 0.03158373 0
## 612 0.15014405 0
## 613 0.35272116 0
## 614 0.08609474 0
labs <- paste("Actual Treatment Status:", c("treat1", "treat2"))
test <- prs_df %>%
mutate(treat= ifelse(treat == 1, labs[1], labs[2]))
head(test,2)
## pr_score treat
## 1 0.5876022 Actual Treatment Status: treat1
## 2 0.2139931 Actual Treatment Status: treat1
library(ggplot2)
labs <- paste("Actual Treat Status:", c("treat1", "treat2"))
prs_df %>%
mutate(treat= ifelse(treat == 1, labs[1], labs[2])) %>%
ggplot(aes(x= pr_score)) +
geom_histogram(color= "white") +
facet_wrap(~treat) +
xlab("Probability of getting treatment status")+
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Use matchit() to execute a matching algorithm
Labdata_nomiss <- Labdata %>%
select(re78_std, treat, one_of(Labdata_cov)) %>%
na.omit()
mod_match <- matchit(treat ~ age + educ + black + hispan + nodegree + married, method = "nearest", data = Labdata_nomiss)
dta_m <- match.data(mod_match)
dim(dta_m)
## [1] 370 11
fn_bal <- function(dta, variable) {
dta$variable <- dta[ , variable]
dta$treat <- as.factor(dta$treat)
ggplot(dta, aes(x= distance, y= variable, color= treat)) +
geom_point(alpha = 0.2, size= 1.5) +
geom_smooth(method= "loess", se= F) +
xlab("Propensity score") +
ylab(variable) +
theme_bw()
}
## checking balance
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(grobs = list(
fn_bal(as.data.frame(dta_m), "age") ,
fn_bal(as.data.frame(dta_m), "educ") + theme(legend.position = "none"),
fn_bal(as.data.frame(dta_m), "married") + theme(legend.position = "none"),
fn_bal(as.data.frame(dta_m), "black"),
fn_bal(as.data.frame(dta_m),"hispan") + theme(legend.position = "none"),
fn_bal(as.data.frame(dta_m),"nodegree")
),
fn_bal = 4, widths= c(1,0.85)
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
dta_m <- subset(dta_m,select=-c(subclass))
#Use t.test to check the difference-in-means based on the matched #data: outcome variable with treatment (variable)
dta_m %>%
group_by(treat) %>%
summarise_all(funs(mean))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## # A tibble: 2 × 10
## treat re78_std age educ black hispan married nodegree distance weights
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -0.126 25.1 10.4 0.470 0.227 0.205 0.686 0.361 1
## 2 1 -0.0594 25.8 10.3 0.843 0.0595 0.189 0.708 0.573 1
#Use t.test to check the difference-in-means based on the matched data
with(dta_m, t.test(re78_std ~ treat))
##
## Welch Two Sample t-test
##
## data: re78_std by treat
## t = -0.66533, df = 353.47, p-value = 0.5063
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.2627845 0.1299301
## sample estimates:
## mean in group 0 mean in group 1
## -0.12581781 -0.05939057
#lapply(Labdata_cov, function (v) {t.test(unlist(dta_m[,v]) ~unlist(dta_m[,"married"]))})
with(dta_m, t.test(age ~ treat))
##
## Welch Two Sample t-test
##
## data: age by treat
## t = -0.78123, df = 334.6, p-value = 0.4352
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -2.472045 1.066640
## sample estimates:
## mean in group 0 mean in group 1
## 25.11351 25.81622
with(dta_m, t.test(educ ~ treat))
##
## Welch Two Sample t-test
##
## data: educ by treat
## t = 0.24865, df = 348.64, p-value = 0.8038
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.4108521 0.5297710
## sample estimates:
## mean in group 0 mean in group 1
## 10.40541 10.34595
with(dta_m, t.test(black ~ treat))
##
## Welch Two Sample t-test
##
## data: black by treat
## t = -8.1932, df = 336.37, p-value = 5.38e-15
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.4625173 -0.2834286
## sample estimates:
## mean in group 0 mean in group 1
## 0.4702703 0.8432432
with(dta_m, t.test(hispan ~ treat))
##
## Welch Two Sample t-test
##
## data: hispan by treat
## t = 4.7251, df = 290.46, p-value = 3.588e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.0977695 0.2373656
## sample estimates:
## mean in group 0 mean in group 1
## 0.22702703 0.05945946
with(dta_m, t.test(nodegree ~ treat))
##
## Welch Two Sample t-test
##
## data: nodegree by treat
## t = -0.45153, df = 367.85, p-value = 0.6519
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.11578506 0.07254182
## sample estimates:
## mean in group 0 mean in group 1
## 0.6864865 0.7081081
with(dta_m, t.test(married ~ treat))
##
## Welch Two Sample t-test
##
## data: married by treat
## t = 0.39093, df = 367.65, p-value = 0.6961
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.06535445 0.09778688
## sample estimates:
## mean in group 0 mean in group 1
## 0.2054054 0.1891892
The above shows 6 t-test results which reports whether the used variables lead to a significant difference in treat. From the above results only black and hispan shows significant difference in treat since their p-value is less 0.05 and very close to zero.
##Use lm() to estimate treatment effects
lm_treat1 <- lm(re78_std ~ treat, data= dta_m)
summary(lm_treat1)
##
## Call:
## lm(formula = re78_std ~ treat, data = dta_m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8499 -0.7758 -0.2867 0.4345 7.2227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.12582 0.07060 -1.782 0.0755 .
## treat 0.06643 0.09984 0.665 0.5063
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9602 on 368 degrees of freedom
## Multiple R-squared: 0.001201, Adjusted R-squared: -0.001513
## F-statistic: 0.4427 on 1 and 368 DF, p-value: 0.5063
The above result suggest that the p-value of treat is 0.5063, which is more than 0.05, making it statistically insignificant. There is also a report on the standard error and the t-values of the intercept and variable treat.
#(2) lm() used for pre-treatment covariates (confounding variables) with treatment (variable)
lm_treat2 <- lm(treat ~ age + educ + black + hispan + nodegree + married, data= dta_m)
summary(lm_treat2)
##
## Call:
## lm(formula = treat ~ age + educ + black + hispan + nodegree +
## married, data = dta_m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72194 -0.29375 0.07263 0.36464 0.88678
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.031832 0.231541 -0.137 0.891
## age 0.001949 0.002961 0.658 0.511
## educ 0.015931 0.014858 1.072 0.284
## black 0.417256 0.063062 6.617 1.32e-10 ***
## hispan -0.013738 0.085281 -0.161 0.872
## nodegree 0.085579 0.072981 1.173 0.242
## married -0.075270 0.063768 -1.180 0.239
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4623 on 363 degrees of freedom
## Multiple R-squared: 0.1614, Adjusted R-squared: 0.1476
## F-statistic: 11.65 on 6 and 363 DF, p-value: 6.134e-12
From the above statistical results, it can be suggested that only variable black is statistically significant at 0.001 level with a p-value approximately close to zero. All other variables are however deem to statisitically insignificant.
besides the confounding variables(‘age’,‘educ’,‘black’,‘hispan’,‘married’,‘nodegree’,‘), can variables’ white’,‘re74’,‘re75’ be involved in the propensity score matching?
Answer:
The Inclusion of ’ white’,‘re74’,‘re75’ in the propensity score matching will hinder the chances of reducing bias in our results since propensity score matching are more effective at reducing bias with less covariates(confounding variables)
lm() is used to estimate treatment effects based on the matched data, can we use glm() instead of lm()?
Answer:
I think glm() function also be used to estimate effects based on the matched data, which allows us to predict based on the type to get the propensity score. The use of of lm however allows for continious outcomes based on the matched data.