Draw a scatter plot of the variables written and course scores from the data set Gcsemv{mlmRev} and superimpose another scatter plot on it using mean written and mean course scores by school as in the language and math example. You can start by editing in RStudio either the gcsemv rmarkdown or the exam rmarkdown file.
# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# view first 6 lines
head(Gcsemv)
school student gender written course
1 20920 16 M 23 NA
2 20920 25 F NA 71.2
3 20920 27 F 39 76.8
4 20920 31 F 36 87.9
5 20920 42 M 16 44.4
6 20920 62 F 36 NA
library(tidyverse)
dta1 <- Gcsemv %>%
group_by(school) %>%
mutate(r_sch = cor(course, written, use="pairwise"))
head(dta1)
# A tibble: 6 x 6
# Groups: school [1]
school student gender written course r_sch
<fct> <fct> <fct> <dbl> <dbl> <dbl>
1 20920 16 M 23 NA 0.789
2 20920 25 F NA 71.2 0.789
3 20920 27 F 39 76.8 0.789
4 20920 31 F 36 87.9 0.789
5 20920 42 M 16 44.4 0.789
6 20920 62 F 36 NA 0.789
dta1_a <- dta1 %>%
group_by(school) %>%
summarize(ave_written = mean(written, na.rm=TRUE),
ave_course = mean(course, na.rm=TRUE))
ggplot(data=dta1, aes(x=written, y=course)) +
geom_point(color="skyblue") +
stat_smooth(method="lm", formula=y ~ x, se=F, col="skyblue") +
geom_point(data=dta1_a, aes(ave_written, ave_course), color="steelblue") +
stat_smooth(data=dta1_a, aes(ave_written, ave_course),
method="lm", formula= y ~ x, se=F, color="steelblue") +
labs(x="Written score",
y="Course score") +
theme_bw()
In Rstudio, open and edit this R script from the SAT scores example to add an overall regression line of SAT score by teacher’s salary to three separate regression lines of the same set of variables by states with low, medium and high percentage of students taking the SAT exam.
dta2 <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")
names(dta2) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
"Sat")
str(dta2)
'data.frame': 50 obs. of 8 variables:
$ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
$ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
$ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
$ Salary: num 31.1 48 32.2 28.9 41.1 ...
$ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
$ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
$ Math : int 538 489 496 523 485 518 477 468 469 448 ...
$ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
head(dta2)
State Expend Ratio Salary Frac Verbal Math Sat
1 Alabama 4.405 17.2 31.144 8 491 538 1029
2 Alaska 8.963 17.6 47.951 47 445 489 934
3 Arizona 4.778 19.3 32.175 27 448 496 944
4 Arkansas 4.459 17.1 28.934 6 482 523 1005
5 California 4.992 24.0 41.078 45 417 485 902
6 Colorado 5.443 18.4 34.571 29 462 518 980
library(tidyverse)
dta2 <- mutate(dta2, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
labels = c("Low", "Medium", "High")))
head(dta2)
State Expend Ratio Salary Frac Verbal Math Sat Fracf
1 Alabama 4.405 17.2 31.144 8 491 538 1029 Low
2 Alaska 8.963 17.6 47.951 47 445 489 934 Medium
3 Arizona 4.778 19.3 32.175 27 448 496 944 Medium
4 Arkansas 4.459 17.1 28.934 6 482 523 1005 Low
5 California 4.992 24.0 41.078 45 417 485 902 Medium
6 Colorado 5.443 18.4 34.571 29 462 518 980 Medium
ggplot(data=dta2, aes(x=Salary, y=Sat, label=State, group=Fracf)) +
stat_smooth(method="lm",
formula= y ~ x,
se=F,
color="gray",
linetype=2,
size=rel(.5)) +
stat_smooth(aes(x = Salary, y = Sat, group =F),
method = "lm",
formula = y ~ x,
se = F,
color = "purple",
linetype = 1,
size = rel(.8))+
geom_text(aes(color=Fracf),
check_overlap=TRUE,
show.legend=FALSE,
size=rel(2)) +
labs(x="Salary ($1000)",
y="SAT Score") +
theme_bw()
Reproduce the plot on the NLSY 86 example by completing the R script using the data file in wide format.
dta3 <- read.csv("~/data/nlsy86wide.csv")
str(dta3)
'data.frame': 166 obs. of 23 variables:
$ id : int 23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
$ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
$ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
$ grade86 : int 0 0 0 0 1 0 0 0 0 0 ...
$ grade88 : int 2 1 2 1 3 2 1 3 2 2 ...
$ grade90 : int 3 3 5 2 4 3 3 4 5 4 ...
$ grade92 : int 5 6 6 5 6 5 5 6 6 5 ...
$ age86year : int 6 6 6 5 7 5 6 7 6 6 ...
$ age88year : int 8 8 8 8 9 8 8 9 9 8 ...
$ age90year : int 10 10 10 9 11 10 10 11 11 10 ...
$ age92year : int 12 12 12 12 13 12 12 13 13 12 ...
$ age86month: int 67 66 67 60 78 62 66 79 76 67 ...
$ age88month: int 96 95 95 91 108 93 94 109 104 94 ...
$ age90month: int 119 119 122 112 132 117 118 131 128 117 ...
$ age92month: int 142 143 144 139 155 139 140 154 151 139 ...
$ math86 : num 14.29 20.24 17.86 7.14 29.76 ...
$ math88 : num 15.5 36.9 22.6 21.4 50 ...
$ math90 : num 38.1 52.4 53.6 53.6 47.6 ...
$ math92 : num 41.7 58.3 58.3 51.2 71.4 ...
$ read86 : num 19.05 21.43 21.43 7.14 30.95 ...
$ read88 : num 29.8 32.1 45.2 21.4 50 ...
$ read90 : num 28.6 45.2 69 50 63.1 ...
$ read92 : num 45.2 57.1 78.6 59.5 82.1 ...
head(dta3)
id sex race grade86 grade88 grade90 grade92 age86year
1 23901 Female Majority 0 2 3 5 6
2 25601 Female Majority 0 1 3 6 6
3 37401 Female Majority 0 2 5 6 6
4 40201 Male Majority 0 1 2 5 5
5 63501 Male Majority 1 3 4 6 7
6 70301 Male Majority 0 2 3 5 5
age88year age90year age92year age86month age88month age90month
1 8 10 12 67 96 119
2 8 10 12 66 95 119
3 8 10 12 67 95 122
4 8 9 12 60 91 112
5 9 11 13 78 108 132
6 8 10 12 62 93 117
age92month math86 math88 math90 math92 read86 read88 read90 read92
1 142 14.2857 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 143 20.2381 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 144 17.8571 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 139 7.1429 21.429 53.571 51.190 7.1429 21.429 50.000 59.524
5 155 29.7619 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 139 14.2857 36.905 55.952 63.095 17.8571 46.429 64.286 96.429
dta3_ <- dta3[c(1:3, 12:15, 20:23)]
head(dta3_)
id sex race age86month age88month age90month age92month
1 23901 Female Majority 67 96 119 142
2 25601 Female Majority 66 95 119 143
3 37401 Female Majority 67 95 122 144
4 40201 Male Majority 60 91 112 139
5 63501 Male Majority 78 108 132 155
6 70301 Male Majority 62 93 117 139
read86 read88 read90 read92
1 19.0476 29.762 28.571 45.238
2 21.4286 32.143 45.238 57.143
3 21.4286 45.238 69.048 78.571
4 7.1429 21.429 50.000 59.524
5 30.9524 50.000 63.095 82.143
6 17.8571 46.429 64.286 96.429
library(tidyverse)
dta3L_month <- dta3_[1:7] %>%
gather(time, month, age86month:age92month) %>%
mutate(time = parse_number(time))
dta3L_read <- dta3_[c(1:3, 8:11)] %>%
gather(time, read, read86:read92) %>%
mutate(time = parse_number(time))
str(dta3L_read)
'data.frame': 664 obs. of 5 variables:
$ id : int 23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
$ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
$ race: Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
$ time: num 86 86 86 86 86 86 86 86 86 86 ...
$ read: num 19.05 21.43 21.43 7.14 30.95 ...
dta3L <- merge(dta3L_month, dta3L_read, by = c("id", "sex", "race", "time"))
str(dta3L)
'data.frame': 664 obs. of 6 variables:
$ id : int 1003201 1003201 1003201 1003201 1012901 1012901 1012901 1012901 1013201 1013201 ...
$ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 1 1 ...
$ race : Factor w/ 2 levels "Majority","Minority": 2 2 2 2 2 2 2 2 2 2 ...
$ time : num 86 88 90 92 86 88 90 92 86 88 ...
$ month: int 60 91 116 138 75 103 127 151 69 99 ...
$ read : num 10.7 36.9 36.9 45.2 21.4 ...
ggplot(data=dta3L, aes(x = month, y = read, group=id)) +
geom_point(size=rel(.5)) +
stat_smooth(method ="lm", formula=y ~ x, se=F) +
facet_grid(race ~ sex) +
labs(x="Month", y="Reading score") +
theme_bw()
For the IQ and language example, plot 131 school-level regression lines of language score against verbal IQ (similar to the figure on the page). Compute the mean and standard deviation of the intercept and slope estimates. What is the correlation coefficient between the two parameter estimates?
Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ Column 4: Language achievement score Column 5: The number of 8th graders in the classroom Column 6: Socioeconomic status score
dta4 <- read.table("~/data/verbalIQ.txt", header = T)
head(dta4)
school pupil viq language csize ses
1 1 17001 15.0 46 29 23
2 1 17002 14.5 45 29 10
3 1 17003 9.5 33 29 15
4 1 17004 11.0 46 29 23
5 1 17005 8.0 20 29 10
6 1 17006 9.5 30 29 10
ggplot(data = dta4, aes(x = viq, y = language, group = school)) +
stat_smooth(method ="lm", formula=y ~ x, se=F, color = "grey")+
geom_point() +
labs(x = "Verbal IQ",
y = "Language Score") +
theme_bw()
cor.test(dta4$viq, dta4$language)
Pearson's product-moment correlation
data: dta4$viq and dta4$language
t = 36.8, df = 2280, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.58341 0.63494
sample estimates:
cor
0.60982
viq_avg <- with(dta4, tapply(viq, school, mean, na.rm = T))
language_avg <- with(dta4, tapply(language, school, mean, na.rm = T))
cor.test(viq_avg, language_avg)
Pearson's product-moment correlation
data: viq_avg and language_avg
t = 13.9, df = 129, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.69418 0.83440
sample estimates:
cor
0.7736