install.packages("sandwich", repos = "http://cran.us.r-project.org")
install.packages("estimatr")
install.packages("clubSandwich")
install.packages("texreg")
install.packages("stargazer")
install.packages("starpolishr") # note: distributed on GitHub rather than CRAN
install.packages("tinytex") # the "overleaf" and "Latex" packages do not exist; tinytex provides the LaTeX toolchain
install.packages("dplyr", dependencies = TRUE)
install.packages("ggplot2", dependencies = TRUE)
install.packages("lmtest")
Q 1.1
# load the data for the first task
# Q 1.1.1
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(texreg)
## Version: 1.38.6
## Date: 2022-04-06
## Author: Philip Leifeld (University of Essex)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(tinytex)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
rstudioapi::getActiveDocumentContext()
## Document Context:
## - id: 'BAFAEB3C'
## - path: '~/Desktop/PPOL Spring Semester 2023/SOSC 5340/Assignments/Assignment 1/Assignment1/SOSC5340_Assignment_1.Rmd'
## - contents: <248 rows>
## Document Selection:
## - [148, 2] -- [148, 2]: ''
library(readr)
data_problem1 <- readr::read_csv("data-problem1.csv")
## New names:
## • `` -> `...1`
## Rows: 2963 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): race, education, maritl, region, jobclass, health_ins
## dbl (10): ...1, year, age, health, logwage, wage, count, conditionl_expectat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data_problem1)
## # A tibble: 6 × 16
## ...1 race education year age maritl region jobclass health health_ins
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 1 1. White 1. < HS G… 2006 18 1. Ne… 2. Mi… 1. Indu… 0 2. No
## 2 2 1. White 1. < HS G… 2009 37 2. Ma… 2. Mi… 1. Indu… 1 2. No
## 3 3 1. White 1. < HS G… 2008 35 2. Ma… 2. Mi… 1. Indu… 1 2. No
## 4 4 1. White 1. < HS G… 2008 39 2. Ma… 2. Mi… 1. Indu… 1 1. Yes
## 5 5 1. White 1. < HS G… 2009 50 2. Ma… 2. Mi… 2. Info… 0 1. Yes
## 6 6 1. White 1. < HS G… 2007 41 4. Di… 2. Mi… 2. Info… 0 1. Yes
## # … with 6 more variables: logwage <dbl>, wage <dbl>, count <dbl>,
## # conditionl_expectation <dbl>, odds <dbl>, logodds <dbl>
regress1 <- lm(health ~ wage + race + education, data = data_problem1)
# classical standard errors are reported by summary(regress1)
# robust (heteroskedasticity-consistent, HC0) standard errors:
regress1_r_s_e <- coeftest(regress1, vcov. = vcovHC(regress1, type = "HC0"))
# present both together
summary(regress1)
##
## Call:
## lm(formula = health ~ wage + race + education, data = data_problem1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9679 -0.6060 0.2174 0.3237 0.4544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5251208 0.0337874 15.542 < 2e-16 ***
## wage 0.0009784 0.0002231 4.385 1.20e-05 ***
## race2. Black -0.0229205 0.0275566 -0.832 0.40561
## race3. Asian -0.0275508 0.0338245 -0.815 0.41541
## education2. HS Grad 0.0252934 0.0312601 0.809 0.41851
## education3. Some College 0.0869903 0.0331566 2.624 0.00874 **
## education4. College Grad 0.1472778 0.0337243 4.367 1.30e-05 ***
## education5. Advanced Degree 0.1671753 0.0382193 4.374 1.26e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4433 on 2955 degrees of freedom
## Multiple R-squared: 0.03744, Adjusted R-squared: 0.03516
## F-statistic: 16.42 on 7 and 2955 DF, p-value: < 2.2e-16
regress1_r_s_e
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.52512077 0.03551077 14.7876 < 2.2e-16 ***
## wage 0.00097839 0.00020374 4.8021 1.648e-06 ***
## race2. Black -0.02292051 0.02850430 -0.8041 0.4214
## race3. Asian -0.02755085 0.03319181 -0.8300 0.4066
## education2. HS Grad 0.02529343 0.03424972 0.7385 0.4603
## education3. Some College 0.08699035 0.03560009 2.4435 0.0146 *
## education4. College Grad 0.14727780 0.03542263 4.1577 3.306e-05 ***
## education5. Advanced Degree 0.16717529 0.03829499 4.3655 1.312e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# finding: the robust and classical standard errors differ; for wage the robust SE (0.00020) is smaller than the classical SE (0.00022), while for the intercept and most race/education coefficients the robust SEs are slightly larger
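To see the two sets of standard errors side by side, a minimal sketch (se_compare is just an illustrative name):
se_compare <- cbind(
  classical = summary(regress1)$coefficients[, "Std. Error"],
  robust = regress1_r_s_e[, "Std. Error"]
)
se_compare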
# Q 1.1.2
# Q 1.1.4: plot wage against residuals
resid1 <- residuals(regress1) # avoid naming the vector "residuals", which masks stats::residuals
plot(data_problem1$wage, resid1)
# Judging from the plot, the residuals trend downward and their spread changes as wage increases; the residuals are related to wage, which points to heteroskedasticity
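A formal check to complement the visual one, as a sketch: the Breusch-Pagan test from lmtest (already loaded), which regresses the squared residuals on the model's regressors.
bptest(regress1)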
# Q 1.1.5: calculate the 95% confidence interval of the estimated coefficient for "education3. Some College", using only the estimated coefficient and its standard error
# the estimate (0.0869903) and standard error (0.0331566) come from the summary(regress1) output above
estimated_coefficient <- 0.0869903
standard_error <- 0.0331566
conf_interval_low <- estimated_coefficient - 1.96 * standard_error
conf_interval_high <- estimated_coefficient + 1.96 * standard_error
confint(regress1, parm = "education3. Some College", level = 0.95)
## 2.5 % 97.5 %
## education3. Some College 0.02197795 0.1520027
# the hand-computed interval, roughly [0.0220, 0.1520], is nearly identical to the confint() result
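For reference, a sketch using the exact t quantile at the model's 2955 residual degrees of freedom instead of the normal approximation 1.96, which reproduces confint() even more closely:
t_crit <- qt(0.975, df = regress1$df.residual)
estimated_coefficient + c(-1, 1) * t_crit * standard_error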
Q 1.2
#Q1.2.1
logistic1 <- glm(health ~ wage * race + education, data = data_problem1, family = binomial)
summary(logistic1)
##
## Call:
## glm(formula = health ~ wage * race + education, family = binomial,
## data = data_problem1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2757 -1.3600 0.6947 0.8748 1.1813
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.091823 0.179430 -0.512 0.6088
## wage 0.006316 0.001420 4.449 8.62e-06 ***
## race2. Black 0.334530 0.407675 0.821 0.4119
## race3. Asian -0.377244 0.523572 -0.721 0.4712
## education2. HS Grad 0.085049 0.145391 0.585 0.5586
## education3. Some College 0.354349 0.157634 2.248 0.0246 *
## education4. College Grad 0.672991 0.165924 4.056 4.99e-05 ***
## education5. Advanced Degree 0.823654 0.199072 4.137 3.51e-05 ***
## wage:race2. Black -0.004467 0.003848 -1.161 0.2457
## wage:race3. Asian 0.002108 0.004537 0.465 0.6422
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3538.8 on 2962 degrees of freedom
## Residual deviance: 3419.9 on 2953 degrees of freedom
## AIC: 3439.9
##
## Number of Fisher Scoring iterations: 4
# interpretation (all effects are on the log-odds scale):
# for the wage:race interaction terms: for Black respondents the wage slope is 0.006 - 0.004 = 0.002, so an additional $1 of wage raises the log-odds of being healthy by less than it does for White respondents; for Asian respondents the slope is 0.006 + 0.002 = 0.008, a larger increase than for White respondents
# for the education terms, holding wage and race constant and relative to the "< HS Grad" baseline: a HS diploma is associated with a 0.085 higher log-odds of being healthy, some college with 0.354, a college degree with 0.673, and an advanced degree with 0.824
# odds ratios
odds_ratio <- exp(coef(logistic1))
# wage and the HS Grad, Some College, College Grad, and Advanced Degree education terms all have odds ratios above 1, i.e., they are associated with higher odds of being healthy; for the wage:race interactions, the odds ratio is above 1 for Asian respondents and below 1 for Black respondents, so at the same wage level the health return to wage is larger for Asian respondents and smaller for Black respondents than for White respondents
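A compact way to view the odds ratios together with confidence intervals, as a sketch (confint() on a glm profiles the likelihood, so it takes a moment):
exp(cbind(odds_ratio = coef(logistic1), confint(logistic1)))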
# Q 1.2.2: predictions
library("ggplot2")
# install.packages("margins")
library("margins")
# hold education fixed at "2. HS Grad" (the dataset's label has a space) and drop it from the model
data_edu <- transform(data_problem1, education = "2. HS Grad")
logistic1_fix <- glm(health ~ wage + race, data = data_edu, family = binomial)
summary(logistic1_fix)
##
## Call:
## glm(formula = health ~ wage + race, family = binomial, data = data_edu)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3813 -1.4041 0.7523 0.8621 1.1833
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.110346 0.134540 -0.820 0.412
## wage 0.009660 0.001202 8.036 9.28e-16 ***
## race2. Black -0.128365 0.133871 -0.959 0.338
## race3. Asian -0.003132 0.172373 -0.018 0.986
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3538.8 on 2962 degrees of freedom
## Residual deviance: 3460.7 on 2959 degrees of freedom
## AIC: 3468.7
##
## Number of Fisher Scoring iterations: 4
cplot(logistic1_fix, "wage", what = "prediction", main = "Predicted probability")
cplot(logistic1_fix, "race", what = "prediction", main = "Predicted probability")
# for wage: the higher the wage, the higher the predicted probability of being healthy, consistent with the positive wage coefficient in the logistic regression (the curve is not a straight line because the coefficient acts on the log-odds scale)
# for race: being Black lowers the predicted probability of being healthy relative to being White, which appears as a lower level in the plot; the prediction for Asian respondents is essentially the same as for White respondents, since the race3. Asian coefficient is close to zero and insignificant
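The wage curve can also be reproduced by hand with predict(), which makes the logit shape explicit; a sketch, with race held at "1. White" and newd/p_hat as illustrative names:
newd <- data.frame(
  wage = seq(min(data_problem1$wage), max(data_problem1$wage), length.out = 100),
  race = "1. White"
)
newd$p_hat <- predict(logistic1_fix, newdata = newd, type = "response")
ggplot(newd, aes(x = wage, y = p_hat)) +
  geom_line() +
  labs(y = "Predicted probability of being healthy")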
Q 1.3
# Q 1.3.1
# aggregate the data by race and education, computing the share who are healthy in each cell
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ratio <- aggregate(health ~ race + education, data_problem1, mean)
# convert each cell's probability p into odds: p / (1 - p)
ratio <- mutate(ratio, health = health / (1 - health))
ratio
## race education health
## 1 1. White 1. < HS Grad 1.453488
## 2 2. Black 1. < HS Grad 2.100000
## 3 3. Asian 1. < HS Grad 1.500000
## 4 1. White 2. HS Grad 1.874126
## 5 2. Black 2. HS Grad 1.187500
## 6 3. Asian 2. HS Grad 2.100000
## 7 1. White 3. Some College 2.546667
## 8 2. Black 3. Some College 2.172414
## 9 3. Asian 3. Some College 2.600000
## 10 1. White 4. College Grad 3.840336
## 11 2. Black 4. College Grad 5.666667
## 12 3. Asian 4. College Grad 2.666667
## 13 1. White 5. Advanced Degree 5.163636
## 14 2. Black 5. Advanced Degree 7.333333
## 15 3. Asian 5. Advanced Degree 4.000000
# log-odds
ratio_log <- mutate(ratio, health = log(health))
ratio_log
## race education health
## 1 1. White 1. < HS Grad 0.3739664
## 2 2. Black 1. < HS Grad 0.7419373
## 3 3. Asian 1. < HS Grad 0.4054651
## 4 1. White 2. HS Grad 0.6281424
## 5 2. Black 2. HS Grad 0.1718503
## 6 3. Asian 2. HS Grad 0.7419373
## 7 1. White 3. Some College 0.9347853
## 8 2. Black 3. Some College 0.7758389
## 9 3. Asian 3. Some College 0.9555114
## 10 1. White 4. College Grad 1.3455599
## 11 2. Black 4. College Grad 1.7346011
## 12 3. Asian 4. College Grad 0.9808293
## 13 1. White 5. Advanced Degree 1.6416411
## 14 2. Black 5. Advanced Degree 1.9924302
## 15 3. Asian 5. Advanced Degree 1.3862944
# run OLS on the aggregated log-odds (a grouped-logit regression)
regress_2 <- lm(health ~ race + education, data = ratio_log)
regress_2
##
## Call:
## lm(formula = health ~ race + education, data = ratio_log)
##
## Coefficients:
## (Intercept) race2. Black
## 0.504556 0.098513
## race3. Asian education2. HS Grad
## -0.090812 0.006854
## education3. Some College education4. College Grad
## 0.381589 0.846540
## education5. Advanced Degree
## 1.166332
# incorporating weights into the OLS regression
# regress_3 <- lm(health ~ race + education + wage, data = ratio_log)
# this does not work: the aggregated data have already lost the information on wages; we would need to re-aggregate with wage as an additional grouping variable, but wage is continuous rather than categorical, so that is not attainable without first binning it; what we can do is weight each cell by its size, as sketched below
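A sketch of a weighted fit: each race-by-education cell is weighted by its number of observations, since larger cells measure the log-odds more precisely (cell_n, ratio_log_w, and regress_2w are illustrative names):
cell_n <- aggregate(health ~ race + education, data_problem1, length)
names(cell_n)[3] <- "n"
ratio_log_w <- merge(ratio_log, cell_n, by = c("race", "education"))
regress_2w <- lm(health ~ race + education, data = ratio_log_w, weights = n)
summary(regress_2w)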
Q 1.4
# Q 1.4.1
# install.packages("broom")
library(broom)
library(sandwich)
library(lmtest)
library(texreg)
library(stargazer)
library(tinytex)
data <- read.csv("MichelinNY.csv")
l <- glm(InMichelin ~ Service + Decor + Food + Price, data = data, family = binomial("logit"))
d1 <- tidy(coeftest(l))
d1$group <- "R default"
We can use MLE to implement the logistic regression estimation by hand.
# logit^{-1}(X %*% beta): the inverse logit (logistic) function
invlogit = function(mX, vBeta) {
  return(exp(mX %*% vBeta) / (1 + exp(mX %*% vBeta)))
}
# negative log-likelihood function (optim minimizes, hence the leading minus sign)
logLikelihoodLogit = function(vBeta, mX, vY) {
  return(-sum(
    vY * log(invlogit(mX, vBeta)) +
      (1 - vY) * log(1 - invlogit(mX, vBeta))
  ))
}
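As a side note, the exp() in invlogit can overflow for large linear predictors; a sketch of a numerically safer equivalent using the built-in logistic CDF (not used below):
invlogit_stable <- function(mX, vBeta) {
  plogis(mX %*% vBeta) # same value as exp(x) / (1 + exp(x)), computed stably
}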
Then use optim to find the \(\beta\) that maximizes the log-likelihood (equivalently, minimizes the negative log-likelihood).
vY = as.matrix(data['InMichelin'])
mX = as.matrix(data.frame(`(Intercept)` = 1, data[c('Service','Decor', 'Food', 'Price')]))
vBeta0 = rep(0, ncol(mX))
# optimize
# trace = 2, REPORT = 1: print progress at every iteration
optimLogit <- optim(par = vBeta0,
                    fn = logLikelihoodLogit,
                    mX = mX, vY = vY,
                    method = "BFGS",
                    hessian = TRUE,
                    control = list(maxit = 50000, trace = 2, REPORT = 1))
## initial value 113.676138
## iter 2 value 112.828918
## iter 3 value 94.808341
## iter 4 value 93.408367
## iter 5 value 93.153819
## iter 6 value 79.787274
## iter 7 value 75.082138
## iter 8 value 74.332124
## iter 9 value 74.232998
## iter 10 value 74.214238
## iter 11 value 74.202620
## iter 12 value 74.198979
## iter 13 value 74.198788
## iter 14 value 74.198768
## iter 15 value 74.198756
## iter 16 value 74.198750
## iter 17 value 74.198624
## iter 18 value 74.198476
## iter 19 value 74.198474
## iter 19 value 74.198474
## iter 19 value 74.198474
## final value 74.198474
## converged
# construct output
coef = optimLogit$par # coefficients
coef.sd = sqrt(diag(solve(optimLogit$hessian))) # standard errors: the Hessian of the negative log-likelihood is the observed information, and its inverse estimates the covariance matrix
tv <- coef / coef.sd # test statistic (a z value)
## for logistic regression the MLE test statistics are asymptotically standard normal,
## so p-values should come from pnorm rather than the student-t used for linear regression
d = data.frame(term = d1$term, "estimate" = coef, "std.error" = coef.sd, "statistic" = tv, check.names = FALSE)
Compare the two estimates (R default and MLE by hand):
d$group <- "MLE_by_hand"
print (d1)
## # A tibble: 5 × 6
## term estimate std.error statistic p.value group
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 (Intercept) -11.2 2.31 -4.85 0.00000124 R default
## 2 Service -0.192 0.124 -1.56 0.119 R default
## 3 Decor 0.100 0.0892 1.12 0.262 R default
## 4 Food 0.405 0.131 3.08 0.00207 R default
## 5 Price 0.0917 0.0318 2.89 0.00387 R default
print (d)
## term estimate std.error statistic group
## 1 (Intercept) -11.19667249 2.30897107 -4.849204 MLE_by_hand
## 2 Service -0.19249778 0.12357888 -1.557692 MLE_by_hand
## 3 Decor 0.09990914 0.08918921 1.120193 MLE_by_hand
## 4 Food 0.40486247 0.13145773 3.079792 MLE_by_hand
## 5 Price 0.09175256 0.03174654 2.890159 MLE_by_hand
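One visible difference is that d lacks the p.value column tidy() provides. Since the MLE test statistics are asymptotically standard normal, the gap can be filled with pnorm, and the two sets can then be stacked for a direct comparison; a sketch:
d$p.value <- 2 * pnorm(-abs(d$statistic))
rbind(as.data.frame(d1), d)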
Q 1.5 Count data model
Q 1.6