# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# view first 6 lines
head(Gcsemv)
school student gender written course
1 20920 16 M 23 NA
2 20920 25 F NA 71.2
3 20920 27 F 39 76.8
4 20920 31 F 36 87.9
5 20920 42 M 16 44.4
6 20920 62 F 36 NA
with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568
library(tidyverse)
dta <- Gcsemv %>%
group_by(school) %>%
mutate(r_sch = cor(course, written, use="pairwise"))
dtar <- dta[!duplicated(dta$school),"r_sch"]
with(dta, plot(written ~ course,
bty = 'n',
cex = 0.5,
xlab = 'Written score',
ylab = 'Course'))
grid()
with(dta, abline(lm(written ~ course)))
points(written_schavg, course_schavg, pch=16, col=5)
abline(lm(course_schavg ~ written_schavg), col=5)
#input data
dta <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")
#assign variable names
names(dta) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
"Sat")
#check data structure
str(dta)
'data.frame': 50 obs. of 8 variables:
$ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
$ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
$ Salary: num 31.1 48 32.2 28.9 41.1 ...
$ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
$ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
$ Math : int 538 489 496 523 485 518 477 468 469 448 ...
$ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
#look at the first 6 lines
head(dta)
State Expend Ratio Salary Frac Verbal Math Sat
1 Alabama 4.405 17.2 31.144 8 491 538 1029
2 Alaska 8.963 17.6 47.951 47 445 489 934
3 Arizona 4.778 19.3 32.175 27 448 496 944
4 Arkansas 4.459 17.1 28.934 6 482 523 1005
5 California 4.992 24.0 41.078 45 417 485 902
6 Colorado 5.443 18.4 34.571 29 462 518 980
# load data management and plotting package
library(tidyverse)
# create a factor variable with 3 levels from Frac
dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
labels = c("Low", "Medium", "High")))
# plot
ggplot(data=dta, aes(x=Salary, y=Sat, label=State)) +
stat_smooth(method="lm",
formula= y ~ x,
se=F,
color="red",
linetype=2,
size=rel(.5)) +
stat_smooth(aes(group=Fracf), method="lm",
formula= y ~ x,
se=F,
color="gray",
linetype=2,
size=rel(.5)) +
geom_text(aes(color=Fracf),
check_overlap=TRUE,
show.legend=FALSE,
size=rel(2)) +
labs(x="Salary ($1000)",
y="SAT Score") +
theme_bw()
# input data
dta <- read.csv("nlsy86wide.csv")
# inspect data structure
str(dta)
'data.frame': 166 obs. of 23 variables:
$ id : int 23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
$ sex : chr "Female" "Female" "Female" "Male" ...
$ race : chr "Majority" "Majority" "Majority" "Majority" ...
$ grade86 : int 0 0 0 0 1 0 0 0 0 0 ...
$ grade88 : int 2 1 2 1 3 2 1 3 2 2 ...
$ grade90 : int 3 3 5 2 4 3 3 4 5 4 ...
$ grade92 : int 5 6 6 5 6 5 5 6 6 5 ...
$ age86year : int 6 6 6 5 7 5 6 7 6 6 ...
$ age88year : int 8 8 8 8 9 8 8 9 9 8 ...
$ age90year : int 10 10 10 9 11 10 10 11 11 10 ...
$ age92year : int 12 12 12 12 13 12 12 13 13 12 ...
$ age86month: int 67 66 67 60 78 62 66 79 76 67 ...
$ age88month: int 96 95 95 91 108 93 94 109 104 94 ...
$ age90month: int 119 119 122 112 132 117 118 131 128 117 ...
$ age92month: int 142 143 144 139 155 139 140 154 151 139 ...
$ math86 : num 14.29 20.24 17.86 7.14 29.76 ...
$ math88 : num 15.5 36.9 22.6 21.4 50 ...
$ math90 : num 38.1 52.4 53.6 53.6 47.6 ...
$ math92 : num 41.7 58.3 58.3 51.2 71.4 ...
$ read86 : num 19.05 21.43 21.43 7.14 30.95 ...
$ read88 : num 29.8 32.1 45.2 21.4 50 ...
$ read90 : num 28.6 45.2 69 50 63.1 ...
$ read92 : num 45.2 57.1 78.6 59.5 82.1 ...
# examine first 6 lines
head(dta)
id sex race grade86 grade88 grade90 grade92 age86year age88year
1 23901 Female Majority 0 2 3 5 6 8
2 25601 Female Majority 0 1 3 6 6 8
3 37401 Female Majority 0 2 5 6 6 8
4 40201 Male Majority 0 1 2 5 5 8
5 63501 Male Majority 1 3 4 6 7 9
6 70301 Male Majority 0 2 3 5 5 8
age90year age92year age86month age88month age90month age92month math86
1 10 12 67 96 119 142 14.2857
2 10 12 66 95 119 143 20.2381
3 10 12 67 95 122 144 17.8571
4 9 12 60 91 112 139 7.1429
5 11 13 78 108 132 155 29.7619
6 10 12 62 93 117 139 14.2857
math88 math90 math92 read86 read88 read90 read92
1 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 21.429 53.571 51.190 7.1429 21.429 50.000 59.524
5 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 36.905 55.952 63.095 17.8571 46.429 64.286 96.429
library(tidyverse)
dta_1 <- dta %>%
gather(time, Age_y, age86year:age92year) %>%
mutate(Time=readr::parse_number(time)) %>%
select(id, sex, race, Time, Age_y)
dta_2 <- dta %>%
gather(time, Age_m, age86month:age92month)%>%
mutate(Time=readr::parse_number(time))%>%
select(Age_m)
dta_3 <- dta %>%
gather(time, Grade, grade86:grade92) %>%
mutate(Time=readr::parse_number(time))%>%
select(Grade)
dta_4 <- dta %>%
gather(time, Math, math86:math92) %>%
mutate(Time=readr::parse_number(time)) %>%
select(Math)
dta_5 <- dta %>%
gather(time, Read, read86:read92) %>%
mutate(Time=readr::parse_number(time)) %>%
select(Read)
dta_L <- cbind(dta_1, dta_2, dta_3, dta_4, dta_5)
ggplot(data=dta_L, aes(x=Age_m, y=Read, group=id)) +
geom_point(size=rel(.5)) +
stat_smooth(method ="lm", formula=y ~ x, se=F) +
facet_grid(race ~ sex) +
labs(x="Month", y="Reading score")
#read data
dta<- read.table("verbalIQ.txt", header = T)
str(dta)
'data.frame': 2287 obs. of 6 variables:
$ school : int 1 1 1 1 1 1 1 1 1 1 ...
$ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
$ viq : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
$ language: int 46 45 33 46 20 30 30 57 36 36 ...
$ csize : int 29 29 29 29 29 29 29 29 29 29 ...
$ ses : int 23 10 15 23 10 10 23 10 13 15 ...
head(dta)
school pupil viq language csize ses
1 1 17001 15.0 46 29 23
2 1 17002 14.5 45 29 10
3 1 17003 9.5 33 29 15
4 1 17004 11.0 46 29 23
5 1 17005 8.0 20 29 10
6 1 17006 9.5 30 29 10
ggplot(dta, aes(viq, language, group=school))+
stat_smooth(method='lm',se=F, color='lightgray', lwd=rel(.5))+
geom_point(size=.5)+
labs(x='Verval IQ', y='Language Score')+
theme_minimal()
library(dplyr)
library(broom)
res_lm <- dta %>%
group_by(school) %>%
do(tidy(lm(language ~viq, data = .))) %>%
as.data.frame(.)
head(res_lm)
school term estimate std.error statistic p.value
1 1 (Intercept) 13.2994 5.74305 2.3157 0.02984019
2 1 viq 2.2384 0.54079 4.1392 0.00039794
3 2 (Intercept) 12.1671 11.19351 1.0870 0.32664636
4 2 viq 1.2830 1.21564 1.0554 0.33953566
5 10 (Intercept) -30.1000 15.17971 -1.9829 0.14165616
6 10 viq 5.7619 1.43211 4.0234 0.02758449
m_i <- res_lm %>%
filter(term == '(Intercept)') %>%
summarize(m_intercept = mean(estimate), sd_intersept = sd(estimate))
m_s <- res_lm %>%
filter(term == 'viq') %>%
summarize(m_slope = mean(estimate), sd_slope = sd(estimate))
m_all <- cbind(m_i, m_s)
head(m_all)
m_intercept sd_intersept m_slope sd_slope
1 9.6341 16.027 2.5931 1.2792
lm_i <- res_lm %>%
filter(term == '(Intercept)')
lm_s <- res_lm %>%
filter(term == 'viq')
cor(lm_i$estimate, lm_s$estimate)
[1] -0.95948