“Weighted regression” / example of regression with sampling weights
Edit the R script with your own comments to explain what each code chunk does and then compile a report of the script in html for publication to moodle.
# set some options - digits: digits to print when printing numeric values(數字位數), show.signif.stars: should stars be printed on summary tables of coefficients
options(digits=3, show.signif.stars=FALSE)
# packagage management
# install.packages(pacman)
# load packages
pacman::p_load(alr4, tidyverse)
# load data
data(UN11, package="alr4")
str(UN11)
## 'data.frame': 199 obs. of 6 variables:
## $ region : Factor w/ 8 levels "Africa","Asia",..: 2 4 1 1 3 5 2 3 8 4 ...
## $ group : Factor w/ 3 levels "oecd","other",..: 2 2 3 3 2 2 2 2 1 1 ...
## $ fertility: num 5.97 1.52 2.14 5.13 2 ...
## $ ppgdp : num 499 3677 4473 4322 13750 ...
## $ lifeExpF : num 49.5 80.4 75 53.2 81.1 ...
## $ pctUrban : num 23 53 67 59 100 93 64 47 89 68 ...
## - attr(*, "na.action")= 'omit' Named int [1:34] 4 5 8 28 41 67 68 72 79 83 ...
## ..- attr(*, "names")= chr [1:34] "Am Samoa" "Andorra" "Antigua and Barbuda" "Br Virigin Is" ...
# ?? seed the random number generator to get the same sample
set.seed(6102)
# pick 81 countries from three regions
# arrange the rows by region alphabetical order
dta <- UN11 %>%
filter(region %in% c("Africa", "Asia", "Europe")) %>%
sample_n(81) %>%
arrange(region)
# first 6 lines of data frame
head(dta)
## region group fertility ppgdp lifeExpF pctUrban
## Ghana Africa africa 3.99 1333 65.8 52
## Seychelles Africa africa 2.34 11451 78.0 56
## Gabon Africa africa 3.19 12469 64.3 86
## Libya Africa africa 2.41 11321 77.9 78
## Benin Africa africa 5.08 741 58.7 42
## Burkina Faso Africa africa 5.75 520 57.0 27
## [1] 81 6
# how many countries in each of the three regions
R3 <- table(dta$region)
# percentage of countries from each of the three regions selected
w <- R3/table(UN11$region)
# add the sampling weights variable to data
# skip over countries in regions not selected
dta$wt <- rep(1/w[w != 0], R3[R3 != 0])
# simple regression: fertility ~ log (Per capita gross domestic product in US dollars)
summary(m0 <- lm(fertility ~ log(ppgdp), data=dta))
##
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2686 -0.7716 0.0497 0.6811 2.6292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.313 0.575 14.46 < 2e-16
## log(ppgdp) -0.652 0.068 -9.58 7.2e-15
##
## Residual standard error: 1.07 on 79 degrees of freedom
## Multiple R-squared: 0.537, Adjusted R-squared: 0.532
## F-statistic: 91.8 on 1 and 79 DF, p-value: 7.15e-15
##
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta, weights = wt)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.031 -1.001 0.063 0.921 3.425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.210 0.577 14.22 < 2e-16
## log(ppgdp) -0.642 0.068 -9.44 1.3e-14
##
## Residual standard error: 1.42 on 79 degrees of freedom
## Multiple R-squared: 0.53, Adjusted R-squared: 0.524
## F-statistic: 89.1 on 1 and 79 DF, p-value: 1.35e-14
# plot
ggplot(dta,
aes(log(ppgdp), fertility, label=region)) +
stat_smooth(method="lm", formula=y ~ x, se=F, col="peru", lwd=rel(.5)) +
stat_smooth(aes(weight=wt), method="lm", formula=y ~ x, se=F, lwd=rel(.5), col="gray")+
geom_text(check_overlap=TRUE, size=rel(2.3), aes(color=region))+
labs(x="GDP (US$ in log unit)",
y="Number of children per woman") +
theme_minimal() +
theme(legend.position="NONE")
“Regression with dependent errors”/trial by trial reaction time data
Replicate the analysis in the lexical decision latencies data example using the data:heid{languageR}. Perform a generalized least-squares regression and compare the results against an ordinary regression of RT onto BaseFrequency.
# set plot theme
ot <- theme_set(theme_bw())
# load packages
pacman::p_load(languageR, lattice, tidyverse, nlme, lmtest)
# load data
data(heid, package="languageR")
# query data info
?heid
## starting httpd help server ... done
## Subject Word RT BaseFrequency
## 1 pp1 basaalheid 6.69 3.56
## 2 pp1 markantheid 6.81 5.16
## 3 pp1 ontroerdheid 6.51 5.55
## 4 pp1 contentheid 6.58 4.50
## 5 pp1 riantheid 6.86 4.53
## 6 pp1 tembaarheid 6.35 0.00
## 'data.frame': 832 obs. of 4 variables:
## $ Subject : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Word : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
## $ RT : num 6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
## $ BaseFrequency: num 3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...
# word frequency effect
ggplot(data = dta, aes(x = BaseFrequency, y = RT)) +
geom_point() +
stat_smooth(method="lm") +
labs(x = "Word Frequency", y = "Reaction Time (transformed)")
## `geom_smooth()` using formula 'y ~ x'
# trial effect by subject
ggplot(data = dta, aes(x = Trial, y = RT)) +
geom_point(pch =1, size = rel(.5)) +
geom_line() +
facet_wrap(~ Subject) +
labs(x = "Trial ID", y = "Reaction Time (transformed)")
# autocorrelation by individual
acf.fnc(dta, group = "Subject", time = "Trial", x = "RT", plot = TRUE)
# simple linear model ignoring trial dependence - least-squares
summary(m0 <- lm(RT ~ BaseFrequency, data = dta))
##
## Call:
## lm(formula = RT ~ BaseFrequency, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5166 -0.2158 -0.0525 0.1599 0.8903
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6417 0.0207 320.39 <2e-16
## BaseFrequency -0.0155 0.0047 -3.29 0.001
##
## Residual standard error: 0.282 on 830 degrees of freedom
## Multiple R-squared: 0.0129, Adjusted R-squared: 0.0117
## F-statistic: 10.8 on 1 and 830 DF, p-value: 0.00104
# Maximum likelihood estimation of the model above
summary(m0a <- gls(RT ~ BaseFrequency, data = dta, method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: RT ~ BaseFrequency
## Data: dta
## AIC BIC logLik
## 258 273 -126
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 6.64 0.0207 320 0.000
## BaseFrequency -0.02 0.0047 -3 0.001
##
## Correlation:
## (Intr)
## BaseFrequency -0.882
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.835 -0.766 -0.186 0.568 3.161
##
## Residual standard error: 0.282
## Degrees of freedom: 832 total; 830 residual
# Simple linear model with autocorrelation error using gls
summary(m1 <- update(m0a, correlation = corAR1(form = ~ Trial | Subject)))
## Generalized least squares fit by maximum likelihood
## Model: RT ~ BaseFrequency
## Data: dta
## AIC BIC logLik
## 64.3 83.2 -28.1
##
## Correlation Structure: AR(1)
## Formula: ~Trial | Subject
## Parameter estimate(s):
## Phi
## 0.462
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 6.64 0.02193 302.5 0e+00
## BaseFrequency -0.01 0.00387 -3.6 4e-04
##
## Correlation:
## (Intr)
## BaseFrequency -0.699
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.835 -0.765 -0.184 0.586 3.159
##
## Residual standard error: 0.281
## Degrees of freedom: 832 total; 830 residual
## Model df AIC BIC logLik Test L.Ratio p-value
## m0a 1 3 258.5 272.6 -126.2
## m1 2 4 64.3 83.2 -28.1 1 vs 2 196 <.0001
# collect residuals from the two models above
dta <- dta %>% mutate(res0 = rstandard(m0),
res1 = resid(m1, type = "normalized"))
# Durbin-Watson test of serial correlation
dwtest(res0 ~ BaseFrequency, data = dta)
##
## Durbin-Watson test
##
## data: res0 ~ BaseFrequency
## DW = 1, p-value <2e-16
## alternative hypothesis: true autocorrelation is greater than 0
##
## Durbin-Watson test
##
## data: res1 ~ BaseFrequency
## DW = 2, p-value = 1
## alternative hypothesis: true autocorrelation is greater than 0
The height of a sample of 20 pre-adolencent girls were recorded at ages 6, 7, 8, 9, and 10. The girls were classified according to the height of their mother (small: < 155 cm, medium: < 155-164, tall: > 164 cm).
Replicate the analyses shown in the heights of boys example for the data in the heights of girls.
# set significant decimal points
# don't print silly stars
options(digits=4, show.signif.stars=FALSE)
# handle package loading automatically
#install.package(pacman)
# load packages
pacman::p_load(tidyverse, nlme, broom, magrittr, MuMIn)
dta3<-read.csv('girl_height.csv', header=T, sep=",")
# grouped data structure from nlme
str(dta3)
## 'data.frame': 100 obs. of 4 variables:
## $ Height: num 111 116 122 126 130 ...
## $ Girl : chr "G1" "G1" "G1" "G1" ...
## $ Age : int 6 7 8 9 10 6 7 8 9 10 ...
## $ Mother: chr "S" "S" "S" "S" ...
## Height Girl Age Mother
## 1 111.0 G1 6 S
## 2 116.4 G1 7 S
## 3 121.7 G1 8 S
## 4 126.3 G1 9 S
## 5 130.5 G1 10 S
## 6 110.0 G2 6 S
# set plot theme to black-n-white
ot <- theme_set(theme_bw())
# one regression line fits all
ggplot(dta3, aes(x = Age, y = Height, color = Girl)) +
geom_point() +
stat_smooth(aes(group = 1), method = "lm") +
labs(x = "Age (scaled)", y = "Height (cm)") +
theme(legend.position = "NONE")
## `geom_smooth()` using formula 'y ~ x'
# ordinary regression
m0 <- gls(Height ~ Age, data = dta3)
# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML")
# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Mother))
# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Mother))
# model selection with AICc
model.sel(m0, m1, m2, m3)
## Model selection table
## (Intrc) Age family correlation weights REML
## m3 82.47 5.558 gaussian(identity) corAR1(~1|Girl) varIdent(~1|Mother) FALSE
## m1 82.47 5.684 gaussian(identity) corAR1(~1|Girl) FALSE
## m2 82.46 5.549 gaussian(identity) varIdent(~1|Mother)
## m0 82.52 5.716 gaussian(identity)
## df logLik AICc delta weight
## m3 6 -162.8 338.5 0.00 1
## m1 4 -172.7 353.8 15.26 0
## m2 5 -292.0 594.6 256.14 0
## m0 3 -300.8 607.8 269.27 0
## Models ranked by AICc(x)
## [[1]]
## Generalized least squares fit by REML
## Model: Height ~ Age
## Data: dta3
## AIC BIC logLik
## 607.5 615.3 -300.8
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.52 2.8439 29.02 0
## Age 5.72 0.3501 16.33 0
##
## Correlation:
## (Intr)
## Age -0.985
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8561 -0.7402 -0.1696 0.7974 2.6348
##
## Residual standard error: 4.951
## Degrees of freedom: 100 total; 98 residual
##
## [[2]]
## Generalized least squares fit by maximum likelihood
## Model: Height ~ Age
## Data: dta3
## AIC BIC logLik
## 353.3 363.8 -172.7
##
## Correlation Structure: AR(1)
## Formula: ~1 | Girl
## Parameter estimate(s):
## Phi
## 0.9782
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.47 1.380 59.75 0
## Age 5.68 0.111 51.21 0
##
## Correlation:
## (Intr)
## Age -0.643
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8432 -0.7005 -0.1174 0.8831 2.7934
##
## Residual standard error: 4.781
## Degrees of freedom: 100 total; 98 residual
##
## [[3]]
## Generalized least squares fit by REML
## Model: Height ~ Age
## Data: dta3
## AIC BIC logLik
## 594 606.9 -292
##
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Mother
## Parameter estimates:
## S M T
## 1.0000 0.8014 1.8171
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.46 2.3331 35.34 0
## Age 5.55 0.2872 19.32 0
##
## Correlation:
## (Intr)
## Age -0.985
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.88260 -0.64836 0.07568 0.75427 2.40785
##
## Residual standard error: 3.961
## Degrees of freedom: 100 total; 98 residual
##
## [[4]]
## Generalized least squares fit by maximum likelihood
## Model: Height ~ Age
## Data: dta3
## AIC BIC logLik
## 337.6 353.2 -162.8
##
## Correlation Structure: AR(1)
## Formula: ~1 | Girl
## Parameter estimate(s):
## Phi
## 0.9787
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Mother
## Parameter estimates:
## S M T
## 1.0000 0.6923 1.5494
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.47 1.1305 72.95 0
## Age 5.56 0.0903 61.55 0
##
## Correlation:
## (Intr)
## Age -0.639
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.77189 -0.71727 0.06484 0.80091 2.56095
##
## Residual standard error: 4.265
## Degrees of freedom: 100 total; 98 residual
“Course evaluation data example” The data set concerns student evaluation of instructor’s beauty and teaching quality for several courses at the University of Texas. The teaching evaluations were done at the end of the semester, and the beauty judgments were made later, by six students who had not attended the classes and were not aware of the course evaluations.
Carry out the analysis presented in the R script by commenting on what each code chunk does. You can spin the script to a markdown file for html output.
# input data
dta4 <- read.table("course_eval.txt", header = T)
# factor <- int
dta4$Course <- as.factor(dta4$Course)
str(dta4)
## 'data.frame': 463 obs. of 7 variables:
## $ Score : num 4.3 4.5 3.7 4.3 4.4 4.2 4 3.4 4.5 3.9 ...
## $ Beauty : num 0.202 -0.826 -0.66 -0.766 1.421 ...
## $ Gender : int 1 0 0 1 1 0 1 1 1 0 ...
## $ Age : int 36 59 51 40 31 62 33 51 33 47 ...
## $ Minority: int 1 0 0 0 0 0 0 0 0 0 ...
## $ Tenure : int 0 1 1 1 0 1 0 1 0 0 ...
## $ Course : Factor w/ 31 levels "0","1","2","3",..: 4 1 5 3 1 1 5 1 1 5 ...
# plot the Course evaluation score and Beauty score by course ID
xyplot(Score ~ Beauty | Course, data = dta4,
ylab="Average course evaluation score",
xlab="Beauty judgment score",
type=c("p", "g", "r"))
# groupedData: mean of the Score ~ Beauty | Course ID??
# create a new copy of the groupedData object
dtag <-
nlme::groupedData(Score ~ Beauty | Course,
data=as.data.frame(dta4),
FUN=mean,
labels=list(x="Beauty score",
y="Couse evaluation score" ))
# plot of dtag
plot(dtag, asp=1)
# m0: linear regression of Course evaluation score and Beauty score
m0 <- lm(Score ~ Beauty, data=dta4)
summary(m0)
##
## Call:
## lm(formula = Score ~ Beauty, data = dta4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8002 -0.3630 0.0725 0.4021 1.1037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0100 0.0255 157.21 < 2e-16
## Beauty 0.1330 0.0322 4.13 4.2e-05
##
## Residual standard error: 0.545 on 461 degrees of freedom
## Multiple R-squared: 0.0357, Adjusted R-squared: 0.0336
## F-statistic: 17.1 on 1 and 461 DF, p-value: 4.25e-05
# Plot linear regression of Course evaluation score and Beauty score
with(dta4, plot(Beauty, Score, bty="n",#bty = o: complete box (default parameter), n: no box,7: top + right,L: bottom + left,C: top + left + bottom,U: left + bottom + right
xlab="Beauty score",
ylab="Average course evaluation") +
grid() + #格線?
abline(m0)) #This function adds one or more straight lines through the current plot.
## integer(0)
# t.test on the coefficients of the fit a list of lm Course evaluation score and Beauty score by course ID.
t.test(coef(lmList(Score ~ Beauty | Course, data = dta4))["Beauty"])
##
## One Sample t-test
##
## data: coef(lmList(Score ~ Beauty | Course, data = dta4))["Beauty"]
## t = 0.54, df = 30, p-value = 0.6
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -2.188 3.769
## sample estimates:
## mean of x
## 0.7905
# nesting of explanatory variables in the model (not division)
# Course的各分類下,再細分出Beauty因子的分類
# -1: omitting intercept
m1 <- lm(Score ~ Course/Beauty - 1, data=dta4)
#
summary(m1)
##
## Call:
## lm(formula = Score ~ Course/Beauty - 1, data = dta4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8108 -0.2739 0.0135 0.3314 1.0935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Course0 4.0316 0.0304 132.83 < 2e-16
## Course1 3.7049 1.2199 3.04 0.00254
## Course2 4.4852 0.4880 9.19 < 2e-16
## Course3 3.8806 0.2535 15.31 < 2e-16
## Course4 3.8359 0.1207 31.79 < 2e-16
## Course5 4.0863 0.3073 13.30 < 2e-16
## Course6 4.2195 0.3775 11.18 < 2e-16
## Course7 3.4491 0.4324 7.98 1.6e-14
## Course8 5.0785 1.0201 4.98 9.5e-07
## Course9 3.9284 0.2150 18.27 < 2e-16
## Course10 4.8393 1.8830 2.57 0.01053
## Course11 4.1322 0.4060 10.18 < 2e-16
## Course12 3.3081 0.6667 4.96 1.0e-06
## Course13 4.1574 0.6147 6.76 4.8e-11
## Course14 3.6171 0.4442 8.14 4.9e-15
## Course15 2.4200 0.4187 5.78 1.5e-08
## Course16 -16.9734 22.0939 -0.77 0.44280
## Course17 4.4669 0.2982 14.98 < 2e-16
## Course18 4.2042 0.3147 13.36 < 2e-16
## Course19 3.5362 0.2520 14.03 < 2e-16
## Course20 4.4540 0.4957 8.99 < 2e-16
## Course21 3.6620 0.1425 25.70 < 2e-16
## Course22 3.7286 0.2064 18.06 < 2e-16
## Course23 3.5310 0.5096 6.93 1.7e-11
## Course24 3.7810 0.3552 10.65 < 2e-16
## Course25 20.7529 13.6132 1.52 0.12818
## Course26 4.2462 0.3138 13.53 < 2e-16
## Course27 4.5522 0.6720 6.77 4.5e-11
## Course28 3.6051 1.3974 2.58 0.01024
## Course29 3.8039 0.3909 9.73 < 2e-16
## Course30 4.3076 0.3272 13.17 < 2e-16
## Course0:Beauty 0.1462 0.0378 3.87 0.00013
## Course1:Beauty 0.9109 1.3378 0.68 0.49632
## Course2:Beauty 0.2416 0.8978 0.27 0.78795
## Course3:Beauty -0.0458 1.4114 -0.03 0.97412
## Course4:Beauty 0.5401 0.2909 1.86 0.06405
## Course5:Beauty 0.2275 0.4206 0.54 0.58888
## Course6:Beauty 0.7948 0.6393 1.24 0.21453
## Course7:Beauty -0.1954 0.4447 -0.44 0.66051
## Course8:Beauty 1.8980 1.4103 1.35 0.17913
## Course9:Beauty -1.9625 0.7006 -2.80 0.00534
## Course10:Beauty 0.5348 1.9240 0.28 0.78117
## Course11:Beauty -0.4049 0.5014 -0.81 0.41987
## Course12:Beauty -1.9471 1.6706 -1.17 0.24452
## Course13:Beauty 0.5055 0.9294 0.54 0.58682
## Course14:Beauty -0.1386 0.8917 -0.16 0.87659
## Course15:Beauty -0.3753 0.5578 -0.67 0.50140
## Course16:Beauty -18.2149 19.1411 -0.95 0.34187
## Course17:Beauty 0.3354 0.4471 0.75 0.45349
## Course18:Beauty -0.1303 0.4932 -0.26 0.79172
## Course19:Beauty -0.3202 0.3246 -0.99 0.32451
## Course20:Beauty 0.0692 0.8893 0.08 0.93801
## Course21:Beauty -0.0380 0.1891 -0.20 0.84096
## Course22:Beauty 0.1641 0.2424 0.68 0.49881
## Course23:Beauty -1.3347 0.7649 -1.74 0.08178
## Course24:Beauty 0.0756 0.9732 0.08 0.93811
## Course25:Beauty 40.5972 32.6559 1.24 0.21453
## Course26:Beauty 0.2253 0.3363 0.67 0.50332
## Course27:Beauty 0.9815 1.2156 0.81 0.41987
## Course28:Beauty 0.7384 0.9699 0.76 0.44693
## Course29:Beauty 0.4722 0.2924 1.61 0.10711
## Course30:Beauty 0.1544 0.2311 0.67 0.50451
##
## Residual standard error: 0.525 on 401 degrees of freedom
## Multiple R-squared: 0.985, Adjusted R-squared: 0.983
## F-statistic: 434 on 62 and 401 DF, p-value: <2e-16
## Estimate Std. Error t value Pr(>|t|)
## Course0 4.032 0.03035 132.83 0.000e+00
## Course9 3.928 0.21504 18.27 1.102e-54