#Week 10

#longitudinal - a baby gets weighed every few months
#spatial - nearby counties are more similar than distant counties
#clustered - choose a classroom and then measure everyone in that room
  #choose a dorm floor and randomly sample rooms on that floor
#repeated measurements - multiple measurements on the same person

#examples:
#blood pressure is measured weekly for participants
  #longitudinal - some people tend to have high/low blood pressure so one person's measurements over time are correlated
  #PSU: participants

#voter turnout for a recent presidential election was recorded for every polling place in precincts in Florida
  #spatial - precincts close together will be more similar
  #PSU: polling place
    #secondary sampling: voter

#the monthly number of car accidents at fifteen intersections
  #longitudinal
  #PSU: intersections (the accident counts are the repeated monthly measurements on each intersection)

#census researchers randomly select citizens from census tracts to report their income
  #cluster
  #PSU: census tracts
    #secondary sampling: citizens

#each figure skater receives a score from each of 5 judges
  #repeated
  #PSU: skaters

#the water supply for a village comes from three different streams, each of which is distributed by five pumps. fecal coliform measurements are made on two samples from each pump
  #repeated
  #PSU: pump

#load the Galton data (columns used below: Height, Gender, FamilyID) from a CSV at the URL
Galton <- read.csv(file = "http://www.cknudson.com/data/Galton.csv")

library(lme4)

## Loading required package: Matrix

#fixed-effects-only fit; 0 + Gender drops the intercept so each Gender level gets its own coefficient
fixedmod <- lm(formula = Height ~ 0 + Gender, data = Galton)

summary(fixedmod)

## 
## Call:
## lm(formula = Height ~ 0 + Gender, data = Galton)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2288 -1.6102 -0.1102  1.7712  9.7712 
## 
## Coefficients:
##         Estimate Std. Error t value Pr(>|t|)    
## GenderF  64.1102     0.1206   531.7   <2e-16 ***
## GenderM  69.2288     0.1164   595.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.509 on 896 degrees of freedom
## Multiple R-squared:  0.9986, Adjusted R-squared:  0.9986 
## F-statistic: 3.184e+05 on 2 and 896 DF,  p-value: < 2.2e-16

#mixed model: same Gender fixed effects as fixedmod, plus a random intercept for each family (FamilyID)
mixedmod <- lmer(formula = Height ~ 0 + Gender + (1 | FamilyID), data = Galton)
summary(mixedmod)

## Linear mixed model fit by REML ['lmerMod']
## Formula: Height ~ 0 + Gender + (1 | FamilyID)
##    Data: Galton
## 
## REML criterion at convergence: 4007.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9475 -0.5661  0.0067  0.5937  3.5069 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FamilyID (Intercept) 2.448    1.564   
##  Residual             3.843    1.960   
## Number of obs: 898, groups:  FamilyID, 197
## 
## Fixed effects:
##         Estimate Std. Error t value
## GenderF  64.1489     0.1542   415.9
## GenderM  69.3019     0.1505   460.5
## 
## Correlation of Fixed Effects:
##         GendrF
## GenderM 0.567

#Between the fixed effects model and the mixed (random and fixed) effects model, which model's fixed effects have larger standard errors? Why does this make sense?
  #standard errors for the fixed model are a little less than for the mixed model
  #in the fixed effects model, we're ignoring the correlation of the data, which artificially deflates the standard error

#Let I(female) be 1 if the person is a female and 0 if otherwise. Let I(male) be 1 if the person is male and 0 if otherwise. Then we can model the height of the ith kid from family j as follows:
  #height_ij = 64.1489 I(female) + 69.3019 I(male) + u_j + e_ij
  #where u_j ~ iid N(0, 1.564^2) is the random effect for family j and e_ij ~ iid N(0, 1.960^2) is the residual
    #2 sources of variability: from family to family, and from kid to kid (in the same family)

#The variance of the residuals represents the variability within families (from kid to kid)
#The variance of the random effects represents the variability between families 
#If a family is full of short people, the family's random effect would be negative
#If we have a short person with tall family members, the residual will be negative


#lm(Heights ~ Gender)
  #heights = B0 + B1I(male)
  #uses reference group
#lm(Heights ~ 0 + Gender)
  #heights = B0I(female) + B1I(male)

#examples (from worksheet)
#lmer(Lead ~ Poverty + (1|CityID))
  #lead = B0 + B1 I(Group B) + ucity
#lmer(Lead ~ 1 + (1|homeID))
  #lead = B0 + uhome
#lmer(Lead ~ Filter + (1|homeID))
  #lead = B0 + B1 I(no filter) + uhome
  #testing whether filter is effective: H0: B1 = 0, HA: B1 =/ 0
    #if B1 = 0, it doesn't matter if the filter is there or not
  #alternative: lmer(Lead ~ 0 + Filter + (1|homeID))
    #lead = B0 I(filter) + B1 I(no filter) + uhome
    #H0: B0 = B1, HA: B0 =/ B1
#lmer(NDrinks ~ Class + (1|dormroom))
  #NDrinks = B0 + B1 I(upper) + udorm
  #H0: B1 = 0, HA: B1 =/ 0

library(faraway)
#intercept-only mixed model: overall mean brightness plus a random intercept for each operator
#NOTE(review): pulp is not defined in this file; presumably it is a data set supplied by faraway
mixed <- lmer(formula = bright ~ 1 + (1 | operator), data = pulp)
summary(mixed)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bright ~ 1 + (1 | operator)
##    Data: pulp
## 
## REML criterion at convergence: 18.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4666 -0.7595 -0.1244  0.6281  1.6012 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  operator (Intercept) 0.06808  0.2609  
##  Residual             0.10625  0.3260  
## Number of obs: 20, groups:  operator, 4
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  60.4000     0.1494   404.2

#bright = 60.4 + uoper + Eshift
#uoper ~ N(0, 0.06808)
#Eshift ~ N(0, 0.10625)
  #The shift variability is greater than the operator variability (0.10625 vs 0.06808)

#intraclass correlation (ICC)
#ICC = variance between classes / (variance between classes + variance within classes)
#variance components copied from the Random effects table printed by summary(mixed)
var_between <- 0.06808 #operator (Intercept) variance
var_within <- 0.10625 #Residual variance
var_between / (var_between + var_within)

## [1] 0.3905237

#39% of the variability in paper brightness is due to variability from operator to operator

#If ICC is close to 0, there isn't a lot of variability from operator to operator