#longitudinal - a baby gets weighed every few months
#spatial - nearby counties are more similar than distant counties
#clustered - choose a classroom and then measure everyone in that room
#choose a dorm floor and randomly sample rooms on that floor
#repeated measurements - multiple measurements on the same person
#examples:
#blood pressure is measured weekly for participants
#longitudinal - some people tend to have high/low blood pressure so one person's measurements over time are correlated
#PSU: participants
#voter turnout for a recent presidential election was recorded for every polling place in precincts in Florida
#spatial - precincts close together will be more similar
#PSU: polling place
#secondary sampling: voter
#the monthly number of car accidents at fifteen intersections
#longitudinal
#PSU: intersections (the accident counts are the repeated measurements on each intersection)
#census researchers randomly select citizens from census tracts to report their income
#cluster
#PSU: census tracts
#secondary sampling: citizens
#each figure skater receives scores from 5 judges
#repeated
#PSU: skaters
#the water supply for a village comes from three different streams, each of which is distributed by five pumps. fecal coliform measurements are made on two samples from each pump
#repeated
#PSU: pump
# Load the Galton height data; Height, Gender and FamilyID are the columns
# used by the models in this script.
Galton <- read.csv(file = "http://www.cknudson.com/data/Galton.csv")

# Fixed-effects-only fit. The "0 +" drops the intercept, so each Gender level
# gets its own mean instead of being expressed relative to a reference group.
fixedmod <- lm(formula = Height ~ 0 + Gender, data = Galton)

# lme4 supplies lmer(), which fits the mixed models later in the script.
library(lme4)
## Loading required package: Matrix

# Coefficient table for the fixed-effects fit (output pasted below).
summary(fixedmod)
##
## Call:
## lm(formula = Height ~ 0 + Gender, data = Galton)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2288 -1.6102 -0.1102 1.7712 9.7712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## GenderF 64.1102 0.1206 531.7 <2e-16 ***
## GenderM 69.2288 0.1164 595.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.509 on 896 degrees of freedom
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9986
## F-statistic: 3.184e+05 on 2 and 896 DF, p-value: < 2.2e-16
# Mixed model: the same per-gender means as the fixed-effects fit, plus a
# random intercept for each family (FamilyID), so children from the same
# family share a family-level offset.
mixedmod <- lmer(
  formula = Height ~ 0 + Gender + (1 | FamilyID),
  data = Galton
)

# Variance components and fixed-effect estimates (output pasted below).
summary(mixedmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Height ~ 0 + Gender + (1 | FamilyID)
## Data: Galton
##
## REML criterion at convergence: 4007.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9475 -0.5661 0.0067 0.5937 3.5069
##
## Random effects:
## Groups Name Variance Std.Dev.
## FamilyID (Intercept) 2.448 1.564
## Residual 3.843 1.960
## Number of obs: 898, groups: FamilyID, 197
##
## Fixed effects:
## Estimate Std. Error t value
## GenderF 64.1489 0.1542 415.9
## GenderM 69.3019 0.1505 460.5
##
## Correlation of Fixed Effects:
## GendrF
## GenderM 0.567
#Between the fixed effects model and the mixed (random and fixed) effects model, which model's fixed effects has larger standard errors? Why does this make sense?
#standard errors for the fixed model are a little less than for the mixed model
#in the fixed effects model, we're ignoring the correlation of the data, which artificially deflates the standard error
#Let I(female) be 1 if the person is a female and 0 if otherwise. Let I(male) be 1 if the person is male and 0 if otherwise. Then we can model the height of the ith kid from family j as follows:
#^height = 64.1489 I(female) + 69.3019 I(male) + uj
#where uj ~ N(0, 1.564^2) iid
#2 sources of variability: from family to family, and from kid to kid (in the same family)
#The variance of the residuals represents the variability within families (from kid to kid)
#The variance of the random effects represents the variability between families
#If a family is full of short people, the family's random effect would be negative
#If we have a short person with tall family members, the residual will be negative
#lm(Heights ~ Gender)
#heights = B0 + B1I(male)
#uses reference group
#lm(Heights ~ 0 + Gender)
#heights = B0I(female) + B1I(male)
#examples (from worksheet)
#lmer(Lead ~ Poverty + (1|CityID))
#lead = B0 + B1 I(Group B) + ucity
#lmer(Lead ~ 1 + (1|homeID))
#lead = B0 + uhome
#lmer(Lead ~ Filter + (1|homeID))
#lead = B0 + B1 I(no filter) + uhome
#testing whether filter is effective: H0: B1 = 0, HA: B1 =/ 0
#if B1 = 0, it doesn't matter if the filter is there or not
#alternative: lmer(Lead ~ 0 + Filter + (1|homeID))
#lead = B0 I(filter) + B1 I(no filter) + uhome
#H0: B0 = B1, HA: B0 =/ B1
#lmer(NDrinks ~ Class + (1|dormroom))
#NDrinks = B0 + B1 I(upper) + udorm
#H0: B1 = 0, HA: B1 =/ 0
# The pulp data (paper brightness recorded per operator) comes from faraway.
library(faraway)

# Intercept-only mixed model: one overall mean brightness plus a random
# intercept for each operator.
mixed <- lmer(formula = bright ~ 1 + (1 | operator), data = pulp)

# Variance components and the overall mean (output pasted below).
summary(mixed)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bright ~ 1 + (1 | operator)
## Data: pulp
##
## REML criterion at convergence: 18.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4666 -0.7595 -0.1244 0.6281 1.6012
##
## Random effects:
## Groups Name Variance Std.Dev.
## operator (Intercept) 0.06808 0.2609
## Residual 0.10625 0.3260
## Number of obs: 20, groups: operator, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 60.4000 0.1494 404.2
#bright = 60.4 + uoper + Eshift
#uoper ~ N(0, 0.06808)
#Eshift ~ N(0, 0.10625)
#The shift variability is greater than the operator variability (0.10625 vs 0.06808)
#intraclass correlation (ICC)
#ICC = variance between classes / (variance between classes + variance within classes)
# ICC = operator variance / (operator variance + residual variance), using the
# variance components reported by summary(mixed). Wrapped in local() so the
# helper names do not leak into the global environment; the value still prints.
local({
  operator_var <- 0.06808
  residual_var <- 0.10625
  operator_var / (operator_var + residual_var)
})
## [1] 0.3905237
#39% of the variability in paper brightness is due to variability from operator to operator
#If ICC is close to 0, there isn't a lot of variability from operator to operator