dummy_variables_in

library(car)
library(multcomp)

## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data

data = Salaries
attach(data)
table(rank)

## rank
##  AsstProf AssocProf      Prof 
##        67        64       266

summary(salary)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   57800   91000  107300  113700  134200  231500

summary(yrs.since.phd)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   12.00   21.00   22.31   32.00   56.00

library(ggplot2)
ggplot(data, aes(yrs.since.phd, salary, color=rank)) + geom_point(shape=1)

# method 1
model0 = summary(lm(salary ~ factor(rank), data = data)); model0 # automatically creates indicator variables

## 
## Call:
## lm(formula = salary ~ factor(rank), data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68972 -16376  -1580  11755 104773 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              80776       2887  27.976  < 2e-16 ***
## factor(rank)AssocProf    13100       4131   3.171  0.00164 ** 
## factor(rank)Prof         45996       3230  14.238  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23630 on 394 degrees of freedom
## Multiple R-squared:  0.3943, Adjusted R-squared:  0.3912 
## F-statistic: 128.2 on 2 and 394 DF,  p-value: < 2.2e-16

# method 2
data <- within(data, {
  rank.ct <- C(rank, treatment) # helmert instead of treatment compares mean to previous mean
  print(attributes(rank.ct))
}) # using C function to specify several contrast

## $levels
## [1] "AsstProf"  "AssocProf" "Prof"     
## 
## $class
## [1] "factor"
## 
## $contrasts
## [1] "contr.treatment"

model1 = summary(lm(salary ~ rank.ct, data = data)); model1

## 
## Call:
## lm(formula = salary ~ rank.ct, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68972 -16376  -1580  11755 104773 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         80776       2887  27.976  < 2e-16 ***
## rank.ctAssocProf    13100       4131   3.171  0.00164 ** 
## rank.ctProf         45996       3230  14.238  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23630 on 394 degrees of freedom
## Multiple R-squared:  0.3943, Adjusted R-squared:  0.3912 
## F-statistic: 128.2 on 2 and 394 DF,  p-value: < 2.2e-16

# method 3
model11 = lm(salary ~ I(rank == 'Prof') + I(rank == 'AssocProf'), data = data)

# next model uses asstprof as the indicator and prof and assocprob as baseline
model22 = lm(salary ~ I(rank == 'AsstProf'), data = data) 

### testing linear combination of coefficients ### 
## test assocprof vs. prof

# get variance-covariance matrix
vcov(model1) # on the diagonal, variances, are the squared standard errors of the coefficients

##                  (Intercept) rank.ctAssocProf rank.ctProf
## (Intercept)          8336574         -8336574    -8336574
## rank.ctAssocProf    -8336574         17063925     8336574
## rank.ctProf         -8336574          8336574    10436388

# lets save the coefficients
coeffs = coef(model1)
var.covar = vcov(model1)
Beta1 = coeffs[2] # accossiate prof
Beta2 = coeffs[3] # professor
S.E.beta1 = var.covar[5:5]; # variance of assocprof
S.E.beta2 = var.covar[9:9] # variance of prof
cov.Asso.Prof = var.covar[8:8] # covariance of AssocProf and Prof

# calculate standard error of difference 
S.E.diff = sqrt((S.E.beta1+S.E.beta2)-(2*cov.Asso.Prof))

# calculate t-value by dividing by standard error of difference
t.value = ((Beta2 - Beta1) - 0) / S.E.diff; t.value

## [1] 9.997269

(Beta2 - Beta1) # professors on average make $32,896 more than associate professors, p<.05

## [1] 32895.67

anova(model11, model22) # t^2 is the same as the f-test in this ex testing model11 vs model22

## Analysis of Variance Table
## 
## Model 1: salary ~ I(rank == "Prof") + I(rank == "AssocProf")
## Model 2: salary ~ I(rank == "AsstProf")
##   Res.Df        RSS Df   Sum of Sq      F    Pr(>F)    
## 1    394 2.2007e+11                                    
## 2    395 2.7589e+11 -1 -5.5825e+10 99.945 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# professors on average make $32,896 more than associate professors, p<.05


################ Interactions #################
# categorical variable interactions
# test effects by re-estimating, partial F-test, & linear combination

# Ex. salary = rank + sex; rank has 3 levels, sex female is the reference group
summary(lm(salary ~ factor(rank) + sex, data = data)) # assistant professor is ref. grp.

## 
## Call:
## lm(formula = salary ~ factor(rank) + sex, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69307 -15757  -1449  12359 104438 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              76645       4433  17.290  < 2e-16 ***
## factor(rank)AssocProf    13061       4128   3.164  0.00168 ** 
## factor(rank)Prof         45519       3252  13.998  < 2e-16 ***
## sexMale                   4943       4026   1.228  0.22029    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23620 on 393 degrees of freedom
## Multiple R-squared:  0.3966, Adjusted R-squared:  0.392 
## F-statistic: 86.09 on 3 and 393 DF,  p-value: < 2.2e-16

# I want professor to be indicator and AsstProf & AssocProf combined to be reference
model33 = lm(salary ~ I(rank == 'Prof') + sex, data = data); model33

## 
## Call:
## lm(formula = salary ~ I(rank == "Prof") + sex, data = data)
## 
## Coefficients:
##           (Intercept)  I(rank == "Prof")TRUE                sexMale  
##                 82943                  39129                   5041

#  expected salary for female non-full prof = B0 = 82,943
# expected salary for female prof holding B2 constant =
  # B0 + B1 = 82,943 + 39,129 = 122,072
# expected salary for male non-full prof holding B1 constant = 
  # B0 + B2 = 82,943 + 5,042 = 87,985
# expected salary for male prof = 
  # B0 + B1 + B2 = 82,943 + 39,129, + 5,042 = 127,114

#plot expected values side by side
pred.val.sex.rank = matrix(c(82943, 122072, 87985, 127114), ncol=2, byrow=TRUE)
colnames(pred.val.sex.rank) = c("non-full professor", "professor")
rownames(pred.val.sex.rank) = c("female", "male")
pred.val.sex.rank = as.table(pred.val.sex.rank)

barplot(pred.val.sex.rank, main="Predicted values of salary by sex and rank",
         col=c("lightpink","lightblue"),    legend = rownames(pred.val.sex.rank), 
        beside=TRUE)

library(scatterplot3d)
s3d = scatterplot3d(sex,rank == 'Prof',salary, pch=16, highlight.3d=TRUE,
              type="h", main="3D Scatterplot")
s3d$plane3d(model33)

# spinning 3d graph
library(rgl)
plot3d(sex,rank == 'Prof',salary, col="lightblue", size=3) 

# is there an interaction between rank and sex?

# Salary = B0 + B1*prof + B2*sex + B3(prof*sex)
Model.int = lm(salary ~ I(rank == 'Prof') + sex + I(rank == 'Prof')*sex, data=data)
summary(Model.int)

## 
## Call:
## lm(formula = salary ~ I(rank == "Prof") + sex + I(rank == "Prof") * 
##     sex, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69321 -15771  -2967  15032 104424 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    83032.2     5219.2  15.909  < 2e-16 ***
## I(rank == "Prof")TRUE          38935.4     7682.4   5.068  6.2e-07 ***
## sexMale                         4935.1     5695.6   0.866    0.387    
## I(rank == "Prof")TRUE:sexMale    218.1     8156.4   0.027    0.979    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23920 on 393 degrees of freedom
## Multiple R-squared:  0.3812, Adjusted R-squared:  0.3765 
## F-statistic:  80.7 on 3 and 393 DF,  p-value: < 2.2e-16

# interaction is not significant if it were we would continue with interpretation 
Model.coef =  coefficients(Model.int)
# E(salary| prof, sex=male) = B0 + B1*1 + B2*sex + B3(1*1)


### effects of being a Prof on salary given sex = (B0 + B2) + (B1 + B3)*Prof.vs.other

# B1 varies in this model.
# B0 is the expected mean for non-prof females
Model.coef[1]

## (Intercept) 
##    83032.24

# (b0 + b1)*0 or b0= is the expected salary for non-prof given that they are female
Model.coef[1]+(Model.coef[2]*0)

## (Intercept) 
##    83032.24

# simple intercept = (b0 + b2) = average salary for males who are not prof (b1=0)
Model.coef[1] + Model.coef[3]

## (Intercept) 
##    87967.33

# simple slope = (B1 + B3) captures the effect of being a Prof for males
Model.coef[2] + Model.coef[3]

## I(rank == "Prof")TRUE 
##              43870.46

# B3 captures the difference in effect for being Prof among men & women
Model.coef[4]

## I(rank == "Prof")TRUE:sexMale 
##                      218.1223

# interaction was not sig so not different from 0

# E(Y|b1=1, b2=1 ) (B0 + B2) + (B1 + B3)*1 
# effects of being male on salary given rank
rank.wit.sex = (Model.coef[1] + Model.coef[3]) + 
  ((Model.coef[2] + Model.coef[4])*1); rank.wit.sex

## (Intercept) 
##    127120.8

# plot for interaction effect 
xx <- c(0,1)   #  <-- change to alter plot dims
yy <- c(74214.48, 127120.8)   #  <-- change to alter plot dims
leg <- c(.79,91780)   #  <-- change to alter legend location
x <- c(0,1)   #  <-- x-coords for lines
y1 <- c(83032.2,121967.6)
y2 <- c(87967.3,127120.8)
plot(xx,yy,type='n',font=1,font.lab=1,xlab='sex',ylab='salary',
     main='Interaction effect plot')
lines(x,y1,lwd=3,lty=3,col=4)
lines(x,y2,lwd=3,lty=4,col=2)
points(x,y1,col=1,pch=16)
points(x,y2,col=1,pch=15)
legend(leg[1],leg[2],legend=c('male','female'),lwd=c(3,3),
       lty=c(4,3),col=c(2,4))

#  expected salary for female non-full prof = B0 = 83032.2
# expected salary for female prof holding B2 constant =
# B0 + B1 = 83032.2 + 38935.4 = 121967.6
# expected salary for male non-full prof holding B1 constant = 
# B0 + B2 = 83032.2 + 4935.1 = 87967.3
# expected salary for male prof = 
# B0 + B1 + B2 = 83032.2 + 38935.4 + 4935.1 + 218.1 = 127120.8
#plot expected values side by side
sex.rank.int = matrix(c(83032.2, 121967.6, 87967.3, 127120.8),
                           ncol=2, byrow=TRUE)
colnames(sex.rank.int) = c("non-full professor", "professor")
rownames(sex.rank.int) = c("female", "male")
sex.rank.int = as.table(sex.rank.int)

barplot(sex.rank.int, 
        main="Predicted values of salary by sex and rank w/ interaction",
        col=c("lightpink","lightblue"), 
        legend = rownames(pred.val.sex.rank), beside=TRUE)

scatterplot3d(sex, rank == 'Prof',salary, pch=16, 
                    highlight.3d=TRUE,
                    type="h", main="3D Scatterplot")

###################################

######### Interaction 1 categorical variables 1 interval #############


# using margins
############## ex #################
library(margins)
Modelx <- lm(salary ~  yrs.service*sex, data = data)
(m <- margins(Modelx)[[1]])

##       Factor     dy/dx  Std.Err. z value Pr(>|z|)      2.50%     97.50%
##  yrs.service  797.0942  114.6885  6.9501   0.0000   572.3088  1021.8795
##      sexMale 3716.4550 5742.7156  0.6472   0.5175 -7539.0608 14971.9709

plot(m)

###################################

# B0 is the predicted salary for females with 0 years of service 
# B1 differnce in salary among men with a difference of 1 year of service
# B2 difference in predicted salary between gender with 0 years of service
# B3 difference in slope for yrs.service compared to women and men

# visualizing the interaction: model w/ no interaction minus sex
fit.1 <- lm(salary ~  yrs.service , data = data)
plot (yrs.service, salary, xlab="years of service", ylab="salary")
curve (coef(fit.1)[1] + coef(fit.1)[2]*x, add=TRUE) # superimposes regression line

# plot with 2 lines 1 for female & male
fit.2 <- lm(salary ~sex + yrs.service, data = data)
colors <- ifelse (sex=="Male", "red", "blue")
plot (yrs.service, salary, xlab="years of service", ylab="salary",
      col=colors, pch=20)
curve (cbind (1, 1, x) %*% coef(fit.2), add=TRUE, col="red")
curve (cbind (1, 0, x) %*% coef(fit.2), add=TRUE, col="blue")

# model with interaction 
fit.3 <- lm(salary ~  sex + yrs.service + yrs.service:sex, data = data)
colors <- ifelse (sex=="Male", "red", "blue")
plot (yrs.service, salary, xlab="years of service", ylab="salary",
      col=colors, pch=20)
curve (cbind (1, 1, x, 1*x) %*% coef(fit.3), add=TRUE, col="green")
curve (cbind (1, 0, x, 0*x) %*% coef(fit.3), add=TRUE, col="red")

# displaying uncertainty in fitted regression
library(arm)

## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
## 
## arm (Version 1.8-6, built: 2015-7-7)
## 
## Working directory is /home/kevin/Dropbox/My documents/grad classes/Int procedures of data analysis
## 
## 
## Attaching package: 'arm'
## 
## The following object is masked from 'package:car':
## 
##     logit

plot (yrs.service, salary, xlab="years of service", ylab="salary", type="n")
points (yrs.service[sex=="Male"], salary[sex=="Male"], pch=20, col="black")
points (yrs.service[sex=="Female"], salary[sex=="Female"], pch=20, col="gray")

fit.1.sim <- sim (fit.1)
plot (yrs.service, salary, xlab="years of service", ylab="salary")
for (i in 1:10){
  curve (fit.1.sim@coef[i,1] + fit.1.sim@coef[i,2]*x, 
         add=TRUE,col="gray") # gray lines uncertainty
}
curve (coef(fit.1)[1] + coef(fit.1)[2]*x, add=TRUE, col="red") # red line regression

# plot uncertainty but for model with sex
gender = as.numeric(sex) # has to be numeric 
beta.hat <- coef (fit.2)
beta.sim <- sim (fit.2)@coef
par (mfrow=c(1,2))
plot (yrs.service, salary, xlab="years of service", ylab="salary")
for (i in 1:10){
  curve (cbind (1, mean(gender), x) %*% beta.sim[i,], lwd=.5,
         col="gray", add=TRUE)
}
curve (cbind (1, mean(gender), x) %*% beta.hat, col="red", add=TRUE)
plot (sex, salary, xlab="years of service", ylab="salary")
for (i in 1:10){
  curve (cbind (1, x, mean(yrs.service)) %*% beta.sim[i,], lwd=.5,
         col="gray", add=TRUE)
}
curve (cbind (1, x, mean(yrs.service)) %*% beta.hat, col="red", add=TRUE)

######### Interaction 2 continous variables #############

fit.3a = lm(salary~yrs.service + yrs.since.phd, data=data)
display(fit.3a)

## lm(formula = salary ~ yrs.service + yrs.since.phd, data = data)
##               coef.est coef.se 
## (Intercept)   89912.18  2843.56
## yrs.service    -629.10   254.47
## yrs.since.phd  1562.89   256.82
## ---
## n = 397, k = 3
## residual sd = 27357.14, R-Squared = 0.19

scatterplotMatrix(~salary+yrs.service+yrs.since.phd, reg.line=lm, 
                  smooth=FALSE, spread=FALSE, span=0.55, 
                  ellipse=FALSE, levels=c(.5, .9), 
                  id.n=0, diagonal = 'histogram', data=data, 
                  main="Plot for model fit.3a")

par(mfrow=c(4,4))
# with intercation
fit.3b <- lm(salary ~ yrs.service*yrs.since.phd, data=data)
summary(fit.3b)

## 
## Call:
## lm(formula = salary ~ yrs.service * yrs.since.phd, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -63823 -17292  -2538  13158 107001 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               70155.263   3472.077  20.206  < 2e-16 ***
## yrs.service                1692.446    356.279   4.750 2.85e-06 ***
## yrs.since.phd              2194.289    246.862   8.889  < 2e-16 ***
## yrs.service:yrs.since.phd   -64.617      7.487  -8.630  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25120 on 393 degrees of freedom
## Multiple R-squared:  0.3177, Adjusted R-squared:  0.3125 
## F-statistic: 60.99 on 3 and 393 DF,  p-value: < 2.2e-16

dummy_variables_in_R.R

kevin

Sat Nov 7 00:40:56 2015