install.packages(c("sandwich", "estimatr", "clubSandwich", "texreg",
                   "stargazer", "lmtest", "dplyr", "ggplot2", "readr",
                   "margins", "broom", "tinytex"),
                 repos = "http://cran.us.r-project.org", dependencies = TRUE)

Q 1.1

# load the packages and the data for the first task
# Q 1.1.1
library(sandwich)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(texreg)
## Version:  1.38.6
## Date:     2022-04-06
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(tinytex)


# set the working directory to this script's folder (requires RStudio)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(readr)
data_problem1 <- readr::read_csv("data-problem1.csv")
## New names:
## • `` -> `...1`
## Rows: 2963 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): race, education, maritl, region, jobclass, health_ins
## dbl (10): ...1, year, age, health, logwage, wage, count, conditionl_expectat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data_problem1)
## # A tibble: 6 × 16
##    ...1 race     education   year   age maritl region jobclass health health_ins
##   <dbl> <chr>    <chr>      <dbl> <dbl> <chr>  <chr>  <chr>     <dbl> <chr>     
## 1     1 1. White 1. < HS G…  2006    18 1. Ne… 2. Mi… 1. Indu…      0 2. No     
## 2     2 1. White 1. < HS G…  2009    37 2. Ma… 2. Mi… 1. Indu…      1 2. No     
## 3     3 1. White 1. < HS G…  2008    35 2. Ma… 2. Mi… 1. Indu…      1 2. No     
## 4     4 1. White 1. < HS G…  2008    39 2. Ma… 2. Mi… 1. Indu…      1 1. Yes    
## 5     5 1. White 1. < HS G…  2009    50 2. Ma… 2. Mi… 2. Info…      0 1. Yes    
## 6     6 1. White 1. < HS G…  2007    41 4. Di… 2. Mi… 2. Info…      0 1. Yes    
## # … with 6 more variables: logwage <dbl>, wage <dbl>, count <dbl>,
## #   conditionl_expectation <dbl>, odds <dbl>, logodds <dbl>
regress1 <- lm(health ~ wage + race + education, data = data_problem1)

# classical standard errors (reported by summary() below)

# calculate robust (heteroskedasticity-consistent) standard errors
regress1_r_s_e <- coeftest(regress1, vcov. = vcovHC(regress1, type = "HC0"))

# present both together
summary(regress1)
## 
## Call:
## lm(formula = health ~ wage + race + education, data = data_problem1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9679 -0.6060  0.2174  0.3237  0.4544 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  0.5251208  0.0337874  15.542  < 2e-16 ***
## wage                         0.0009784  0.0002231   4.385 1.20e-05 ***
## race2. Black                -0.0229205  0.0275566  -0.832  0.40561    
## race3. Asian                -0.0275508  0.0338245  -0.815  0.41541    
## education2. HS Grad          0.0252934  0.0312601   0.809  0.41851    
## education3. Some College     0.0869903  0.0331566   2.624  0.00874 ** 
## education4. College Grad     0.1472778  0.0337243   4.367 1.30e-05 ***
## education5. Advanced Degree  0.1671753  0.0382193   4.374 1.26e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4433 on 2955 degrees of freedom
## Multiple R-squared:  0.03744,    Adjusted R-squared:  0.03516 
## F-statistic: 16.42 on 7 and 2955 DF,  p-value: < 2.2e-16
regress1_r_s_e
## 
## t test of coefficients:
## 
##                                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)                  0.52512077  0.03551077 14.7876 < 2.2e-16 ***
## wage                         0.00097839  0.00020374  4.8021 1.648e-06 ***
## race2. Black                -0.02292051  0.02850430 -0.8041    0.4214    
## race3. Asian                -0.02755085  0.03319181 -0.8300    0.4066    
## education2. HS Grad          0.02529343  0.03424972  0.7385    0.4603    
## education3. Some College     0.08699035  0.03560009  2.4435    0.0146 *  
## education4. College Grad     0.14727780  0.03542263  4.1577 3.306e-05 ***
## education5. Advanced Degree  0.16717529  0.03829499  4.3655 1.312e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# finding: the robust standard errors are not uniformly smaller; they are
# smaller than the classical ones for wage but larger for the intercept and
# the education dummies, and sizable differences like these are themselves a
# hint of heteroskedasticity
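# to actually present both together, texreg (loaded above) can print the two
# sets of results in one table; a minimal sketch, overriding column 2's
# standard errors and p-values with the robust ones:
screenreg(list(regress1, regress1),
          override.se = list(sqrt(diag(vcov(regress1))),
                             sqrt(diag(vcovHC(regress1, type = "HC0")))),
          override.pvalues = list(summary(regress1)$coefficients[, 4],
                                  regress1_r_s_e[, 4]),
          custom.model.names = c("Classical SE", "Robust SE"))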


#Q 1.1.2


# Q 1.1.4 plot wage against the residuals

residuals <- residuals(regress1)
plot(data_problem1$wage, residuals)

# judging from the plot, the spread of the residuals changes with wage (by
# construction OLS residuals are uncorrelated with the regressors, so what we
# see is nonconstant variance): evidence of heteroskedasticity
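# the eyeball test can be backed by a formal one; a minimal sketch using the
# Breusch-Pagan test from lmtest (already loaded):
bptest(regress1)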

# Q 1.1.5 calculate the 95% confidence interval of the estimated coefficient
# for "education3. Some College", using only the estimated coefficient and its
# standard error
summary(regress1)
## 
## Call:
## lm(formula = health ~ wage + race + education, data = data_problem1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9679 -0.6060  0.2174  0.3237  0.4544 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  0.5251208  0.0337874  15.542  < 2e-16 ***
## wage                         0.0009784  0.0002231   4.385 1.20e-05 ***
## race2. Black                -0.0229205  0.0275566  -0.832  0.40561    
## race3. Asian                -0.0275508  0.0338245  -0.815  0.41541    
## education2. HS Grad          0.0252934  0.0312601   0.809  0.41851    
## education3. Some College     0.0869903  0.0331566   2.624  0.00874 ** 
## education4. College Grad     0.1472778  0.0337243   4.367 1.30e-05 ***
## education5. Advanced Degree  0.1671753  0.0382193   4.374 1.26e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4433 on 2955 degrees of freedom
## Multiple R-squared:  0.03744,    Adjusted R-squared:  0.03516 
## F-statistic: 16.42 on 7 and 2955 DF,  p-value: < 2.2e-16
estimated_coefficient <- 0.0869903
standard_error <- 0.0331566
conf_interval_low <- estimated_coefficient - 1.96 * standard_error
conf_interval_high <- estimated_coefficient + 1.96 * standard_error
c(conf_interval_low, conf_interval_high)

confint(regress1, parm = "education3. Some College", level = 0.95)
##                               2.5 %    97.5 %
## education3. Some College 0.02197795 0.1520027
# the two intervals are nearly identical; the check below shows why
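# the manual interval uses the normal critical value 1.96, while confint()
# uses the t critical value, which is almost the same number with 2955
# degrees of freedom:
qt(0.975, df = 2955)  # ~1.96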

Q 1.2

# Q 1.2.1
logistic1 <- glm(health ~ wage * race + education, data = data_problem1, family = binomial)
summary(logistic1)
## 
## Call:
## glm(formula = health ~ wage * race + education, family = binomial, 
##     data = data_problem1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2757  -1.3600   0.6947   0.8748   1.1813  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -0.091823   0.179430  -0.512   0.6088    
## wage                         0.006316   0.001420   4.449 8.62e-06 ***
## race2. Black                 0.334530   0.407675   0.821   0.4119    
## race3. Asian                -0.377244   0.523572  -0.721   0.4712    
## education2. HS Grad          0.085049   0.145391   0.585   0.5586    
## education3. Some College     0.354349   0.157634   2.248   0.0246 *  
## education4. College Grad     0.672991   0.165924   4.056 4.99e-05 ***
## education5. Advanced Degree  0.823654   0.199072   4.137 3.51e-05 ***
## wage:race2. Black           -0.004467   0.003848  -1.161   0.2457    
## wage:race3. Asian            0.002108   0.004537   0.465   0.6422    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3538.8  on 2962  degrees of freedom
## Residual deviance: 3419.9  on 2953  degrees of freedom
## AIC: 3439.9
## 
## Number of Fisher Scoring iterations: 4
# interpretation (all effects are on the log-odds scale):
# wage x race interaction: for Black respondents, an additional dollar of wage
# raises the log odds of being healthy by .006 - .004 = .002, less than the
# .006 for White respondents; for Asian respondents the wage slope is
# .006 + .002 = .008, larger than for White respondents
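# a quick check of that arithmetic directly from the fitted coefficients
# (coefficient names as printed in the summary above):
b <- coef(logistic1)
b["wage"]                           # wage slope (log odds) for White, the baseline
b["wage"] + b["wage:race2. Black"]  # wage slope for Black
b["wage"] + b["wage:race3. Asian"]  # wage slope for Asian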

# education terms: holding wage and race constant, relative to the "< HS Grad"
# baseline, a high-school degree raises the log odds of being healthy by .085,
# some college by .35, a college degree by .67, and an advanced degree by .82

# odds ratios
odds_ratio <- exp(coef(logistic1))
odds_ratio

# wage and all four education dummies have odds ratios above 1: higher wages
# and more education are associated with higher odds of being healthy. Because
# of the wage x race interaction, the race main effects are the race contrasts
# at wage = 0 (above 1 for Black, below 1 for Asian), so race comparisons at
# realistic wage levels must also fold in the wage:race interaction terms.
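# the same logic extends to interval estimates; a sketch (confint() on a glm
# profiles the likelihood, so it can take a moment to run):
exp(cbind(OR = coef(logistic1), confint(logistic1)))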


# Q 1.2.2 prediction
library("ggplot2")
#install.packages("margins")
library("margins")


# hold education fixed at HS Grad; since education is then constant it cannot
# be estimated, so we refit on wage and race only
data_edu <- transform(data_problem1, education = "2. HS Grad")
logistic1_fix <- glm(health ~ wage + race, data = data_edu, family = binomial)
summary(logistic1_fix)
## 
## Call:
## glm(formula = health ~ wage + race, family = binomial, data = data_edu)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3813  -1.4041   0.7523   0.8621   1.1833  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.110346   0.134540  -0.820    0.412    
## wage          0.009660   0.001202   8.036 9.28e-16 ***
## race2. Black -0.128365   0.133871  -0.959    0.338    
## race3. Asian -0.003132   0.172373  -0.018    0.986    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3538.8  on 2962  degrees of freedom
## Residual deviance: 3460.7  on 2959  degrees of freedom
## AIC: 3468.7
## 
## Number of Fisher Scoring iterations: 4
cplot(logistic1_fix, "wage", what = "prediction", main = "Predicted probability")

cplot(logistic1_fix, "race", what = "prediction", main = "Predicted probability")

# for wage: the higher the wage, the higher the predicted probability of being
# healthy, consistent with the positive wage coefficient in the logistic
# regression (on the probability scale the slope is roughly beta * p * (1 - p),
# not beta itself)

# for race: relative to White respondents, the predicted probability is lower
# for Black respondents (the negative race2. Black coefficient), while for
# Asian respondents it is essentially indistinguishable from White (the
# race3. Asian coefficient is near zero)
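# a numeric complement to the plots; a sketch in which the wage grid
# (50/100/150) is illustrative, not taken from the assignment:
newd <- expand.grid(wage = c(50, 100, 150),
                    race = c("1. White", "2. Black", "3. Asian"))
newd$p_hat <- predict(logistic1_fix, newdata = newd, type = "response")
newd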

Q 1.3

# Q 1.3.1

# group the data by race and education, compute each cell's share healthy,
# then convert that probability into odds
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ratio <- aggregate(health ~ race + education, data_problem1, mean)

# convert each cell's probability into odds
ratio <- mutate(ratio, health = health / (1 - health))
ratio
##        race          education   health
## 1  1. White       1. < HS Grad 1.453488
## 2  2. Black       1. < HS Grad 2.100000
## 3  3. Asian       1. < HS Grad 1.500000
## 4  1. White         2. HS Grad 1.874126
## 5  2. Black         2. HS Grad 1.187500
## 6  3. Asian         2. HS Grad 2.100000
## 7  1. White    3. Some College 2.546667
## 8  2. Black    3. Some College 2.172414
## 9  3. Asian    3. Some College 2.600000
## 10 1. White    4. College Grad 3.840336
## 11 2. Black    4. College Grad 5.666667
## 12 3. Asian    4. College Grad 2.666667
## 13 1. White 5. Advanced Degree 5.163636
## 14 2. Black 5. Advanced Degree 7.333333
## 15 3. Asian 5. Advanced Degree 4.000000
# log odds
ratio_log <- mutate(ratio, health = log(health))
ratio_log
##        race          education    health
## 1  1. White       1. < HS Grad 0.3739664
## 2  2. Black       1. < HS Grad 0.7419373
## 3  3. Asian       1. < HS Grad 0.4054651
## 4  1. White         2. HS Grad 0.6281424
## 5  2. Black         2. HS Grad 0.1718503
## 6  3. Asian         2. HS Grad 0.7419373
## 7  1. White    3. Some College 0.9347853
## 8  2. Black    3. Some College 0.7758389
## 9  3. Asian    3. Some College 0.9555114
## 10 1. White    4. College Grad 1.3455599
## 11 2. Black    4. College Grad 1.7346011
## 12 3. Asian    4. College Grad 0.9808293
## 13 1. White 5. Advanced Degree 1.6416411
## 14 2. Black 5. Advanced Degree 1.9924302
## 15 3. Asian 5. Advanced Degree 1.3862944
# run OLS on the aggregated log odds (an approximation to the logit on
# grouped data, not a logistic regression per se)
regress_2 <- lm(health ~ race + education, data = ratio_log)
regress_2
## 
## Call:
## lm(formula = health ~ race + education, data = ratio_log)
## 
## Coefficients:
##                 (Intercept)                 race2. Black  
##                    0.504556                     0.098513  
##                race3. Asian          education2. HS Grad  
##                   -0.090812                     0.006854  
##    education3. Some College     education4. College Grad  
##                    0.381589                     0.846540  
## education5. Advanced Degree  
##                    1.166332
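# as a sanity check (a sketch), the individual-level logit on the same
# covariates should give broadly similar coefficients to this grouped-OLS
# approximation:
logit_check <- glm(health ~ race + education, data = data_problem1, family = binomial)
coef(logit_check)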
# incorporating weights into the OLS regression: see the sketch below
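# one standard choice (my assumption, not specified in the assignment) is to
# weight each cell by n * p * (1 - p), the inverse of the approximate variance
# of a sample log odds (Berkson's minimum chi-square logit):
n <- aggregate(health ~ race + education, data_problem1, length)$health  # cell sizes
p <- aggregate(health ~ race + education, data_problem1, mean)$health    # cell P(healthy)
ratio_log$w <- n * p * (1 - p)
regress_2w <- lm(health ~ race + education, data = ratio_log, weights = w)
summary(regress_2w)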
#regress_3 <- lm(health ~ race + education + wage, data = ratio_log)

# this does not work: the aggregated data has already lost the information on
# wage. We would need to regroup the dataset with wage as an additional
# grouping category, but since wage is continuous rather than categorical,
# that is not feasible without first binning it (a sketch follows)
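# a sketch of that binning (the quartile cut points are illustrative); note
# that some race x education x wage cells will be small or empty, making the
# cell odds noisy or undefined:
data_problem1$wage_bin <- cut(data_problem1$wage,
                              breaks = quantile(data_problem1$wage,
                                                probs = seq(0, 1, 0.25)),
                              include.lowest = TRUE)
ratio_w <- aggregate(health ~ race + education + wage_bin, data_problem1, mean)
head(ratio_w)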

Q 1.4

# Q 1.4.1
#install.packages("broom")
library(broom)
library(sandwich)
library(lmtest)
library(texreg)
library(stargazer)
library(tinytex)
data <- read.csv("MichelinNY.csv")
l <- glm(InMichelin ~ Service + Decor + Food + Price, data, family = binomial("logit"))
d1 <- tidy(coeftest(l))
d1$group <- "R default"

MLE by hand

We can implement the logistic regression estimation by hand via maximum likelihood.

# logit^{-1} (X \beta )
invlogit = function(mX, vBeta) {
  return(exp(mX %*% vBeta)/(1+ exp(mX %*% vBeta)) )
}
# negative log-likelihood (optim minimizes, hence the leading minus sign)
logLikelihoodLogit = function(vBeta, mX, vY) {
  return(- sum(
    vY * log(  invlogit(mX, vBeta) ) + 
      (1-vY)* log(1 - invlogit(mX, vBeta))
  )
  )
}

Then use optim() (a base R function, not a separate package) to find \(\beta\) that maximizes the log-likelihood, i.e. minimizes the negative log-likelihood.

vY = as.matrix(data['InMichelin'])
mX = as.matrix(data.frame(`(Intercept)` = 1, data[c('Service','Decor', 'Food', 'Price')]))
vBeta0 = rep(0, ncol(mX))
  
# optimize with BFGS
# trace = 2, REPORT = 1 prints progress at every iteration
optimLogit <- optim(par = vBeta0, 
                    fn = logLikelihoodLogit,
                   mX = mX, vY = vY,
                   method = "BFGS",
                   hessian=TRUE, 
                   control = list(maxit = 50000, trace = 2, REPORT = 1))
## initial  value 113.676138 
## iter   2 value 112.828918
## iter   3 value 94.808341
## iter   4 value 93.408367
## iter   5 value 93.153819
## iter   6 value 79.787274
## iter   7 value 75.082138
## iter   8 value 74.332124
## iter   9 value 74.232998
## iter  10 value 74.214238
## iter  11 value 74.202620
## iter  12 value 74.198979
## iter  13 value 74.198788
## iter  14 value 74.198768
## iter  15 value 74.198756
## iter  16 value 74.198750
## iter  17 value 74.198624
## iter  18 value 74.198476
## iter  19 value 74.198474
## iter  19 value 74.198474
## iter  19 value 74.198474
## final  value 74.198474 
## converged
# construct output
coef <- optimLogit$par                            # point estimates
coef.sd <- sqrt(diag(solve(optimLogit$hessian)))  # SEs from the inverse Hessian
tv <- coef / coef.sd                              # Wald statistics
## the Wald statistic from an MLE is asymptotically standard normal, so
## p-values here would come from pnorm (see below); the student-t reference
## (pt) is the right one for OLS coefficients, not for logistic regression
d = data.frame(term = d1$term, "estimate" = coef,  "std.error" = coef.sd, "statistic" = tv,   check.names = FALSE)

Compare the two estimates (R's default glm and the MLE by hand):

d$group <- "MLE_by_hand"
print (d1)
## # A tibble: 5 × 6
##   term        estimate std.error statistic    p.value group    
##   <chr>          <dbl>     <dbl>     <dbl>      <dbl> <chr>    
## 1 (Intercept) -11.2       2.31       -4.85 0.00000124 R default
## 2 Service      -0.192     0.124      -1.56 0.119      R default
## 3 Decor         0.100     0.0892      1.12 0.262      R default
## 4 Food          0.405     0.131       3.08 0.00207    R default
## 5 Price         0.0917    0.0318      2.89 0.00387    R default
print (d)
##          term     estimate  std.error statistic       group
## 1 (Intercept) -11.19667249 2.30897107 -4.849204 MLE_by_hand
## 2     Service  -0.19249778 0.12357888 -1.557692 MLE_by_hand
## 3       Decor   0.09990914 0.08918921  1.120193 MLE_by_hand
## 4        Food   0.40486247 0.13145773  3.079792 MLE_by_hand
## 5       Price   0.09175256 0.03174654  2.890159 MLE_by_hand
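# the hand-rolled estimates and standard errors match glm()'s to several
# decimal places; for completeness, Wald p-values under asymptotic normality:
d$p.value <- 2 * pnorm(-abs(d$statistic))
print(d)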

Q 1.5 Count data model

Q 1.6