Draw a scatter plot of the variables written and course scores from the data set Gcsemv{mlmRev} and superimpose another scatter plot on it using mean written and mean course scores by school as in the language and math example. You can start by editing in RStudio either the gcsemv rmarkdown or the exam rmarkdown file.
# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# view first 6 lines
head(Gcsemv)
school student gender written course
1 20920 16 M 23 NA
2 20920 25 F NA 71.2
3 20920 27 F 39 76.8
4 20920 31 F 36 87.9
5 20920 42 M 16 44.4
6 20920 62 F 36 NA
with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568
library(tidyverse)
dta <- Gcsemv %>%
group_by(school) %>%
mutate(r_sch = cor(course, written, use="pairwise"))
dtar <- dta[!duplicated(dta$school),"r_sch"]
with(dta, plot(written ~ course,
bty = 'n',
cex = 0.5,
xlab = 'Written score',
ylab = 'Course'))
grid()
with(dta, abline(lm(written ~ course)))
points(written_schavg, course_schavg, pch=16, col=5)
abline(lm(course_schavg ~ written_schavg), col=5)
In Rstudio, open and edit this R script from the SAT scores example to add an overall regression line of SAT score by teacher’s salary to three separate regression lines of the same set of variables by states with low, medium and high percentage of students taking the SAT exam.
dta <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")
names(dta) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
"Sat")
str(dta)
'data.frame': 50 obs. of 8 variables:
$ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
$ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
$ Salary: num 31.1 48 32.2 28.9 41.1 ...
$ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
$ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
$ Math : int 538 489 496 523 485 518 477 468 469 448 ...
$ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
head(dta)
State Expend Ratio Salary Frac Verbal Math Sat
1 Alabama 4.405 17.2 31.144 8 491 538 1029
2 Alaska 8.963 17.6 47.951 47 445 489 934
3 Arizona 4.778 19.3 32.175 27 448 496 944
4 Arkansas 4.459 17.1 28.934 6 482 523 1005
5 California 4.992 24.0 41.078 45 417 485 902
6 Colorado 5.443 18.4 34.571 29 462 518 980
library(tidyverse)
dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
labels = c("Low", "Medium", "High")))
ggplot(data=dta, aes(x=Salary, y=Sat, label=State)) +
stat_smooth(method="lm",
formula= y ~ x,
se=F,
color="red",
linetype=2,
size=rel(.5)) +
stat_smooth(aes(x=Salary, y=Sat, group=Fracf), method="lm",
formula= y ~ x,
se=F,
color="gray",
linetype=2,
size=rel(.5)) +
geom_text(aes(color=Fracf),
check_overlap=TRUE,
show.legend=FALSE,
size=rel(2)) +
labs(x="Salary ($1000)",
y="SAT Score") +
theme_bw()
Reproduce the plot on the NLSY 86 example by completing the R script using the data file in wide format.
dta <- read.csv("./data/nlsy86wide.csv")
str(dta)
'data.frame': 166 obs. of 23 variables:
$ id : int 23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
$ sex : chr "Female" "Female" "Female" "Male" ...
$ race : chr "Majority" "Majority" "Majority" "Majority" ...
$ grade86 : int 0 0 0 0 1 0 0 0 0 0 ...
$ grade88 : int 2 1 2 1 3 2 1 3 2 2 ...
$ grade90 : int 3 3 5 2 4 3 3 4 5 4 ...
$ grade92 : int 5 6 6 5 6 5 5 6 6 5 ...
$ age86year : int 6 6 6 5 7 5 6 7 6 6 ...
$ age88year : int 8 8 8 8 9 8 8 9 9 8 ...
$ age90year : int 10 10 10 9 11 10 10 11 11 10 ...
$ age92year : int 12 12 12 12 13 12 12 13 13 12 ...
$ age86month: int 67 66 67 60 78 62 66 79 76 67 ...
$ age88month: int 96 95 95 91 108 93 94 109 104 94 ...
$ age90month: int 119 119 122 112 132 117 118 131 128 117 ...
$ age92month: int 142 143 144 139 155 139 140 154 151 139 ...
$ math86 : num 14.29 20.24 17.86 7.14 29.76 ...
$ math88 : num 15.5 36.9 22.6 21.4 50 ...
$ math90 : num 38.1 52.4 53.6 53.6 47.6 ...
$ math92 : num 41.7 58.3 58.3 51.2 71.4 ...
$ read86 : num 19.05 21.43 21.43 7.14 30.95 ...
$ read88 : num 29.8 32.1 45.2 21.4 50 ...
$ read90 : num 28.6 45.2 69 50 63.1 ...
$ read92 : num 45.2 57.1 78.6 59.5 82.1 ...
head(dta)
id sex race grade86 grade88 grade90 grade92 age86year age88year
1 23901 Female Majority 0 2 3 5 6 8
2 25601 Female Majority 0 1 3 6 6 8
3 37401 Female Majority 0 2 5 6 6 8
4 40201 Male Majority 0 1 2 5 5 8
5 63501 Male Majority 1 3 4 6 7 9
6 70301 Male Majority 0 2 3 5 5 8
age90year age92year age86month age88month age90month age92month math86
1 10 12 67 96 119 142 14.2857
2 10 12 66 95 119 143 20.2381
3 10 12 67 95 122 144 17.8571
4 9 12 60 91 112 139 7.1429
5 11 13 78 108 132 155 29.7619
6 10 12 62 93 117 139 14.2857
math88 math90 math92 read86 read88 read90 read92
1 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 21.429 53.571 51.190 7.1429 21.429 50.000 59.524
5 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 36.905 55.952 63.095 17.8571 46.429 64.286 96.429
dta_1 <- dta %>%
gather(time, Age_y, age86year:age92year) %>%
mutate(Time=readr::parse_number(time)) %>%
select(id, sex, race, Time, Age_y)
dta_2 <- dta %>%
gather(time, Age_m, age86month:age92month)%>%
mutate(Time=readr::parse_number(time))%>%
select(Age_m)
dta_3 <- dta %>%
gather(time, Grade, grade86:grade92) %>%
mutate(Time=readr::parse_number(time))%>%
select(Grade)
dta_4 <- dta %>%
gather(time, Math, math86:math92) %>%
mutate(Time=readr::parse_number(time)) %>%
select(Math)
dta_5 <- dta %>%
gather(time, Read, read86:read92) %>%
mutate(Time=readr::parse_number(time)) %>%
select(Read)
dta_L <- cbind(dta_1, dta_2, dta_3, dta_4, dta_5)
#dta_L <- mutate(Time=readr::parse_number(time))
ggplot(data=dta_L, aes(x=Age_m, y=Read, group=id)) +
geom_point(size=rel(.5)) +
stat_smooth(method ="lm", formula=y ~ x, se=F) +
facet_grid(race ~ sex) +
labs(x="Month", y="Reading score")
For the IQ and language example, plot 131 school-level regression lines of language score against verbal IQ (similar to the figure on the page). Compute the mean and standard deviation of the intercept and slope estimates. What is the correlation coefficient between the two parameter estimates?
dta <- read.table("./data/verbalIQ.txt", header = T)
str(dta)
'data.frame': 2287 obs. of 6 variables:
$ school : int 1 1 1 1 1 1 1 1 1 1 ...
$ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
$ viq : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
$ language: int 46 45 33 46 20 30 30 57 36 36 ...
$ csize : int 29 29 29 29 29 29 29 29 29 29 ...
$ ses : int 23 10 15 23 10 10 23 10 13 15 ...
head(dta)
school pupil viq language csize ses
1 1 17001 15.0 46 29 23
2 1 17002 14.5 45 29 10
3 1 17003 9.5 33 29 15
4 1 17004 11.0 46 29 23
5 1 17005 8.0 20 29 10
6 1 17006 9.5 30 29 10
ggplot(dta,
aes(viq,
language,
group=school))+
geom_point(alpha=.5) +
stat_smooth(method="lm",
formula=y ~ x,
se=F,
col='black',
lwd=rel(.5)) +
labs(x="Verbal IQ ",
y="Language Score") +
theme_minimal()
#m1 <- lmList(viq ~language | school, data=dta)
library(dplyr)
library(broom)
res_lm <- dta %>%
group_by(school) %>%
do(tidy(lm(language ~viq, data = .))) %>%
as.data.frame(.)
head(res_lm)
school term estimate std.error statistic p.value
1 1 (Intercept) 13.2994 5.74305 2.3157 0.02984019
2 1 viq 2.2384 0.54079 4.1392 0.00039794
3 2 (Intercept) 12.1671 11.19351 1.0870 0.32664636
4 2 viq 1.2830 1.21564 1.0554 0.33953566
5 10 (Intercept) -30.1000 15.17971 -1.9829 0.14165616
6 10 viq 5.7619 1.43211 4.0234 0.02758449
m_i <- res_lm %>%
filter(term == '(Intercept)') %>%
summarize(m_intercept = mean(estimate), sd_intersept = sd(estimate))
m_s <- res_lm %>%
filter(term == 'viq') %>%
summarize(m_slope = mean(estimate), sd_slope = sd(estimate))
m_all <- cbind(m_i, m_s)
head(m_all)
m_intercept sd_intersept m_slope sd_slope
1 9.6341 16.027 2.5931 1.2792
lm_i <- res_lm %>%
filter(term == '(Intercept)')
lm_s <- res_lm %>%
filter(term == 'viq')
cor(lm_i$estimate, lm_s$estimate)
[1] -0.95948