library(car)
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
data = Salaries
attach(data)
table(rank)
## rank
## AsstProf AssocProf Prof
## 67 64 266
summary(salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57800 91000 107300 113700 134200 231500
summary(yrs.since.phd)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 12.00 21.00 22.31 32.00 56.00
library(ggplot2)
ggplot(data, aes(yrs.since.phd, salary, color=rank)) + geom_point(shape=1)

# method 1
model0 = summary(lm(salary ~ factor(rank), data = data)); model0 # automatically creates indicator variables
##
## Call:
## lm(formula = salary ~ factor(rank), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68972 -16376 -1580 11755 104773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80776 2887 27.976 < 2e-16 ***
## factor(rank)AssocProf 13100 4131 3.171 0.00164 **
## factor(rank)Prof 45996 3230 14.238 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23630 on 394 degrees of freedom
## Multiple R-squared: 0.3943, Adjusted R-squared: 0.3912
## F-statistic: 128.2 on 2 and 394 DF, p-value: < 2.2e-16
# method 2
data <- within(data, {
rank.ct <- C(rank, treatment) # helmert instead of treatment compares mean to previous mean
print(attributes(rank.ct))
}) # using C function to specify several contrast
## $levels
## [1] "AsstProf" "AssocProf" "Prof"
##
## $class
## [1] "factor"
##
## $contrasts
## [1] "contr.treatment"
model1 = summary(lm(salary ~ rank.ct, data = data)); model1
##
## Call:
## lm(formula = salary ~ rank.ct, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68972 -16376 -1580 11755 104773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80776 2887 27.976 < 2e-16 ***
## rank.ctAssocProf 13100 4131 3.171 0.00164 **
## rank.ctProf 45996 3230 14.238 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23630 on 394 degrees of freedom
## Multiple R-squared: 0.3943, Adjusted R-squared: 0.3912
## F-statistic: 128.2 on 2 and 394 DF, p-value: < 2.2e-16
# method 3
model11 = lm(salary ~ I(rank == 'Prof') + I(rank == 'AssocProf'), data = data)
# next model uses asstprof as the indicator and prof and assocprob as baseline
model22 = lm(salary ~ I(rank == 'AsstProf'), data = data)
### testing linear combination of coefficients ###
## test assocprof vs. prof
# get variance-covariance matrix
vcov(model1) # on the diagonal, variances, are the squared standard errors of the coefficients
## (Intercept) rank.ctAssocProf rank.ctProf
## (Intercept) 8336574 -8336574 -8336574
## rank.ctAssocProf -8336574 17063925 8336574
## rank.ctProf -8336574 8336574 10436388
# lets save the coefficients
coeffs = coef(model1)
var.covar = vcov(model1)
Beta1 = coeffs[2] # accossiate prof
Beta2 = coeffs[3] # professor
S.E.beta1 = var.covar[5:5]; # variance of assocprof
S.E.beta2 = var.covar[9:9] # variance of prof
cov.Asso.Prof = var.covar[8:8] # covariance of AssocProf and Prof
# calculate standard error of difference
S.E.diff = sqrt((S.E.beta1+S.E.beta2)-(2*cov.Asso.Prof))
# calculate t-value by dividing by standard error of difference
t.value = ((Beta2 - Beta1) - 0) / S.E.diff; t.value
## [1] 9.997269
(Beta2 - Beta1) # professors on average make $32,896 more than associate professors, p<.05
## [1] 32895.67
anova(model11, model22) # t^2 is the same as the f-test in this ex testing model11 vs model22
## Analysis of Variance Table
##
## Model 1: salary ~ I(rank == "Prof") + I(rank == "AssocProf")
## Model 2: salary ~ I(rank == "AsstProf")
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 394 2.2007e+11
## 2 395 2.7589e+11 -1 -5.5825e+10 99.945 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# professors on average make $32,896 more than associate professors, p<.05
################ Interactions #################
# categorical variable interactions
# test effects by re-estimating, partial F-test, & linear combination
# Ex. salary = rank + sex; rank has 3 levels, sex female is the reference group
summary(lm(salary ~ factor(rank) + sex, data = data)) # assistant professor is ref. grp.
##
## Call:
## lm(formula = salary ~ factor(rank) + sex, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69307 -15757 -1449 12359 104438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76645 4433 17.290 < 2e-16 ***
## factor(rank)AssocProf 13061 4128 3.164 0.00168 **
## factor(rank)Prof 45519 3252 13.998 < 2e-16 ***
## sexMale 4943 4026 1.228 0.22029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23620 on 393 degrees of freedom
## Multiple R-squared: 0.3966, Adjusted R-squared: 0.392
## F-statistic: 86.09 on 3 and 393 DF, p-value: < 2.2e-16
# I want professor to be indicator and AsstProf & AssocProf combined to be reference
model33 = lm(salary ~ I(rank == 'Prof') + sex, data = data); model33
##
## Call:
## lm(formula = salary ~ I(rank == "Prof") + sex, data = data)
##
## Coefficients:
## (Intercept) I(rank == "Prof")TRUE sexMale
## 82943 39129 5041
# expected salary for female non-full prof = B0 = 82,943
# expected salary for female prof holding B2 constant =
# B0 + B1 = 82,943 + 39,129 = 122,072
# expected salary for male non-full prof holding B1 constant =
# B0 + B2 = 82,943 + 5,042 = 87,985
# expected salary for male prof =
# B0 + B1 + B2 = 82,943 + 39,129, + 5,042 = 127,114
#plot expected values side by side
pred.val.sex.rank = matrix(c(82943, 122072, 87985, 127114), ncol=2, byrow=TRUE)
colnames(pred.val.sex.rank) = c("non-full professor", "professor")
rownames(pred.val.sex.rank) = c("female", "male")
pred.val.sex.rank = as.table(pred.val.sex.rank)
barplot(pred.val.sex.rank, main="Predicted values of salary by sex and rank",
col=c("lightpink","lightblue"), legend = rownames(pred.val.sex.rank),
beside=TRUE)

library(scatterplot3d)
s3d = scatterplot3d(sex,rank == 'Prof',salary, pch=16, highlight.3d=TRUE,
type="h", main="3D Scatterplot")
s3d$plane3d(model33)

# spinning 3d graph
library(rgl)
plot3d(sex,rank == 'Prof',salary, col="lightblue", size=3)
# is there an interaction between rank and sex?
# Salary = B0 + B1*prof + B2*sex + B3(prof*sex)
Model.int = lm(salary ~ I(rank == 'Prof') + sex + I(rank == 'Prof')*sex, data=data)
summary(Model.int)
##
## Call:
## lm(formula = salary ~ I(rank == "Prof") + sex + I(rank == "Prof") *
## sex, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69321 -15771 -2967 15032 104424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83032.2 5219.2 15.909 < 2e-16 ***
## I(rank == "Prof")TRUE 38935.4 7682.4 5.068 6.2e-07 ***
## sexMale 4935.1 5695.6 0.866 0.387
## I(rank == "Prof")TRUE:sexMale 218.1 8156.4 0.027 0.979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23920 on 393 degrees of freedom
## Multiple R-squared: 0.3812, Adjusted R-squared: 0.3765
## F-statistic: 80.7 on 3 and 393 DF, p-value: < 2.2e-16
# interaction is not significant if it were we would continue with interpretation
Model.coef = coefficients(Model.int)
# E(salary| prof, sex=male) = B0 + B1*1 + B2*sex + B3(1*1)
### effects of being a Prof on salary given sex = (B0 + B2) + (B1 + B3)*Prof.vs.other
# B1 varies in this model.
# B0 is the expected mean for non-prof females
Model.coef[1]
## (Intercept)
## 83032.24
# (b0 + b1)*0 or b0= is the expected salary for non-prof given that they are female
Model.coef[1]+(Model.coef[2]*0)
## (Intercept)
## 83032.24
# simple intercept = (b0 + b2) = average salary for males who are not prof (b1=0)
Model.coef[1] + Model.coef[3]
## (Intercept)
## 87967.33
# simple slope = (B1 + B3) captures the effect of being a Prof for males
Model.coef[2] + Model.coef[3]
## I(rank == "Prof")TRUE
## 43870.46
# B3 captures the difference in effect for being Prof among men & women
Model.coef[4]
## I(rank == "Prof")TRUE:sexMale
## 218.1223
# interaction was not sig so not different from 0
# E(Y|b1=1, b2=1 ) (B0 + B2) + (B1 + B3)*1
# effects of being male on salary given rank
rank.wit.sex = (Model.coef[1] + Model.coef[3]) +
((Model.coef[2] + Model.coef[4])*1); rank.wit.sex
## (Intercept)
## 127120.8
# plot for interaction effect
xx <- c(0,1) # <-- change to alter plot dims
yy <- c(74214.48, 127120.8) # <-- change to alter plot dims
leg <- c(.79,91780) # <-- change to alter legend location
x <- c(0,1) # <-- x-coords for lines
y1 <- c(83032.2,121967.6)
y2 <- c(87967.3,127120.8)
plot(xx,yy,type='n',font=1,font.lab=1,xlab='sex',ylab='salary',
main='Interaction effect plot')
lines(x,y1,lwd=3,lty=3,col=4)
lines(x,y2,lwd=3,lty=4,col=2)
points(x,y1,col=1,pch=16)
points(x,y2,col=1,pch=15)
legend(leg[1],leg[2],legend=c('male','female'),lwd=c(3,3),
lty=c(4,3),col=c(2,4))

# expected salary for female non-full prof = B0 = 83032.2
# expected salary for female prof holding B2 constant =
# B0 + B1 = 83032.2 + 38935.4 = 121967.6
# expected salary for male non-full prof holding B1 constant =
# B0 + B2 = 83032.2 + 4935.1 = 87967.3
# expected salary for male prof =
# B0 + B1 + B2 = 83032.2 + 38935.4 + 4935.1 + 218.1 = 127120.8
#plot expected values side by side
sex.rank.int = matrix(c(83032.2, 121967.6, 87967.3, 127120.8),
ncol=2, byrow=TRUE)
colnames(sex.rank.int) = c("non-full professor", "professor")
rownames(sex.rank.int) = c("female", "male")
sex.rank.int = as.table(sex.rank.int)
barplot(sex.rank.int,
main="Predicted values of salary by sex and rank w/ interaction",
col=c("lightpink","lightblue"),
legend = rownames(pred.val.sex.rank), beside=TRUE)

scatterplot3d(sex, rank == 'Prof',salary, pch=16,
highlight.3d=TRUE,
type="h", main="3D Scatterplot")

###################################
######### Interaction 1 categorical variables 1 interval #############
# using margins
############## ex #################
library(margins)
Modelx <- lm(salary ~ yrs.service*sex, data = data)
(m <- margins(Modelx)[[1]])
## Factor dy/dx Std.Err. z value Pr(>|z|) 2.50% 97.50%
## yrs.service 797.0942 114.6885 6.9501 0.0000 572.3088 1021.8795
## sexMale 3716.4550 5742.7156 0.6472 0.5175 -7539.0608 14971.9709
plot(m)

###################################
# B0 is the predicted salary for females with 0 years of service
# B1 differnce in salary among men with a difference of 1 year of service
# B2 difference in predicted salary between gender with 0 years of service
# B3 difference in slope for yrs.service compared to women and men
# visualizing the interaction: model w/ no interaction minus sex
fit.1 <- lm(salary ~ yrs.service , data = data)
plot (yrs.service, salary, xlab="years of service", ylab="salary")
curve (coef(fit.1)[1] + coef(fit.1)[2]*x, add=TRUE) # superimposes regression line

# plot with 2 lines 1 for female & male
fit.2 <- lm(salary ~sex + yrs.service, data = data)
colors <- ifelse (sex=="Male", "red", "blue")
plot (yrs.service, salary, xlab="years of service", ylab="salary",
col=colors, pch=20)
curve (cbind (1, 1, x) %*% coef(fit.2), add=TRUE, col="red")
curve (cbind (1, 0, x) %*% coef(fit.2), add=TRUE, col="blue")

# model with interaction
fit.3 <- lm(salary ~ sex + yrs.service + yrs.service:sex, data = data)
colors <- ifelse (sex=="Male", "red", "blue")
plot (yrs.service, salary, xlab="years of service", ylab="salary",
col=colors, pch=20)
curve (cbind (1, 1, x, 1*x) %*% coef(fit.3), add=TRUE, col="green")
curve (cbind (1, 0, x, 0*x) %*% coef(fit.3), add=TRUE, col="red")

# displaying uncertainty in fitted regression
library(arm)
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
##
## arm (Version 1.8-6, built: 2015-7-7)
##
## Working directory is /home/kevin/Dropbox/My documents/grad classes/Int procedures of data analysis
##
##
## Attaching package: 'arm'
##
## The following object is masked from 'package:car':
##
## logit
plot (yrs.service, salary, xlab="years of service", ylab="salary", type="n")
points (yrs.service[sex=="Male"], salary[sex=="Male"], pch=20, col="black")
points (yrs.service[sex=="Female"], salary[sex=="Female"], pch=20, col="gray")

fit.1.sim <- sim (fit.1)
plot (yrs.service, salary, xlab="years of service", ylab="salary")
for (i in 1:10){
curve (fit.1.sim@coef[i,1] + fit.1.sim@coef[i,2]*x,
add=TRUE,col="gray") # gray lines uncertainty
}
curve (coef(fit.1)[1] + coef(fit.1)[2]*x, add=TRUE, col="red") # red line regression

# plot uncertainty but for model with sex
gender = as.numeric(sex) # has to be numeric
beta.hat <- coef (fit.2)
beta.sim <- sim (fit.2)@coef
par (mfrow=c(1,2))
plot (yrs.service, salary, xlab="years of service", ylab="salary")
for (i in 1:10){
curve (cbind (1, mean(gender), x) %*% beta.sim[i,], lwd=.5,
col="gray", add=TRUE)
}
curve (cbind (1, mean(gender), x) %*% beta.hat, col="red", add=TRUE)
plot (sex, salary, xlab="years of service", ylab="salary")
for (i in 1:10){
curve (cbind (1, x, mean(yrs.service)) %*% beta.sim[i,], lwd=.5,
col="gray", add=TRUE)
}
curve (cbind (1, x, mean(yrs.service)) %*% beta.hat, col="red", add=TRUE)

######### Interaction 2 continous variables #############
fit.3a = lm(salary~yrs.service + yrs.since.phd, data=data)
display(fit.3a)
## lm(formula = salary ~ yrs.service + yrs.since.phd, data = data)
## coef.est coef.se
## (Intercept) 89912.18 2843.56
## yrs.service -629.10 254.47
## yrs.since.phd 1562.89 256.82
## ---
## n = 397, k = 3
## residual sd = 27357.14, R-Squared = 0.19
scatterplotMatrix(~salary+yrs.service+yrs.since.phd, reg.line=lm,
smooth=FALSE, spread=FALSE, span=0.55,
ellipse=FALSE, levels=c(.5, .9),
id.n=0, diagonal = 'histogram', data=data,
main="Plot for model fit.3a")

par(mfrow=c(4,4))
# with intercation
fit.3b <- lm(salary ~ yrs.service*yrs.since.phd, data=data)
summary(fit.3b)
##
## Call:
## lm(formula = salary ~ yrs.service * yrs.since.phd, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63823 -17292 -2538 13158 107001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70155.263 3472.077 20.206 < 2e-16 ***
## yrs.service 1692.446 356.279 4.750 2.85e-06 ***
## yrs.since.phd 2194.289 246.862 8.889 < 2e-16 ***
## yrs.service:yrs.since.phd -64.617 7.487 -8.630 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25120 on 393 degrees of freedom
## Multiple R-squared: 0.3177, Adjusted R-squared: 0.3125
## F-statistic: 60.99 on 3 and 393 DF, p-value: < 2.2e-16