Homework 1

Data management

# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# view first 6 lines
head(Gcsemv)
  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

Summary statistics

with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568

Visualization

library(tidyverse)
dta <- Gcsemv %>% 
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use="pairwise")) 
dtar <- dta[!duplicated(dta$school),"r_sch"]
with(dta, plot(written ~ course, 
                bty = 'n', 
                cex = 0.5,
                xlab = 'Written score', 
                ylab = 'Course'))
grid()
with(dta, abline(lm(written ~ course)))
points(written_schavg, course_schavg, pch=16, col=5)
abline(lm(course_schavg ~ written_schavg), col=5)

Homework 2

Data management

#input data
dta <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")
#assign variable names
names(dta) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
                "Sat")
#check data structure
str(dta)
'data.frame':   50 obs. of  8 variables:
 $ State : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
 $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
 $ Salary: num  31.1 48 32.2 28.9 41.1 ...
 $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
 $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
 $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
 $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...
#look at the first 6 lines
head(dta)
       State Expend Ratio Salary Frac Verbal Math  Sat
1    Alabama  4.405  17.2 31.144    8    491  538 1029
2     Alaska  8.963  17.6 47.951   47    445  489  934
3    Arizona  4.778  19.3 32.175   27    448  496  944
4   Arkansas  4.459  17.1 28.934    6    482  523 1005
5 California  4.992  24.0 41.078   45    417  485  902
6   Colorado  5.443  18.4 34.571   29    462  518  980
# load data management and plotting package
library(tidyverse)
# create a factor variable with 3 levels from Frac
dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
                           labels = c("Low", "Medium", "High")))
# plot
ggplot(data=dta, aes(x=Salary, y=Sat, label=State)) +
 stat_smooth(method="lm", 
             formula= y ~ x,
             se=F, 
             color="red", 
             linetype=2, 
             size=rel(.5)) +
 stat_smooth(aes(group=Fracf), method="lm", 
             formula= y ~ x,
             se=F, 
             color="gray", 
             linetype=2, 
             size=rel(.5)) +
 geom_text(aes(color=Fracf), 
           check_overlap=TRUE, 
           show.legend=FALSE, 
           size=rel(2)) +
 labs(x="Salary ($1000)", 
      y="SAT Score") +
 theme_bw()

Homework 3

# input data
dta <- read.csv("nlsy86wide.csv")

# inspect data structure
str(dta)
'data.frame':   166 obs. of  23 variables:
 $ id        : int  23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
 $ sex       : chr  "Female" "Female" "Female" "Male" ...
 $ race      : chr  "Majority" "Majority" "Majority" "Majority" ...
 $ grade86   : int  0 0 0 0 1 0 0 0 0 0 ...
 $ grade88   : int  2 1 2 1 3 2 1 3 2 2 ...
 $ grade90   : int  3 3 5 2 4 3 3 4 5 4 ...
 $ grade92   : int  5 6 6 5 6 5 5 6 6 5 ...
 $ age86year : int  6 6 6 5 7 5 6 7 6 6 ...
 $ age88year : int  8 8 8 8 9 8 8 9 9 8 ...
 $ age90year : int  10 10 10 9 11 10 10 11 11 10 ...
 $ age92year : int  12 12 12 12 13 12 12 13 13 12 ...
 $ age86month: int  67 66 67 60 78 62 66 79 76 67 ...
 $ age88month: int  96 95 95 91 108 93 94 109 104 94 ...
 $ age90month: int  119 119 122 112 132 117 118 131 128 117 ...
 $ age92month: int  142 143 144 139 155 139 140 154 151 139 ...
 $ math86    : num  14.29 20.24 17.86 7.14 29.76 ...
 $ math88    : num  15.5 36.9 22.6 21.4 50 ...
 $ math90    : num  38.1 52.4 53.6 53.6 47.6 ...
 $ math92    : num  41.7 58.3 58.3 51.2 71.4 ...
 $ read86    : num  19.05 21.43 21.43 7.14 30.95 ...
 $ read88    : num  29.8 32.1 45.2 21.4 50 ...
 $ read90    : num  28.6 45.2 69 50 63.1 ...
 $ read92    : num  45.2 57.1 78.6 59.5 82.1 ...
# examine first 6 lines
head(dta)
     id    sex     race grade86 grade88 grade90 grade92 age86year age88year
1 23901 Female Majority       0       2       3       5         6         8
2 25601 Female Majority       0       1       3       6         6         8
3 37401 Female Majority       0       2       5       6         6         8
4 40201   Male Majority       0       1       2       5         5         8
5 63501   Male Majority       1       3       4       6         7         9
6 70301   Male Majority       0       2       3       5         5         8
  age90year age92year age86month age88month age90month age92month  math86
1        10        12         67         96        119        142 14.2857
2        10        12         66         95        119        143 20.2381
3        10        12         67         95        122        144 17.8571
4         9        12         60         91        112        139  7.1429
5        11        13         78        108        132        155 29.7619
6        10        12         62         93        117        139 14.2857
  math88 math90 math92  read86 read88 read90 read92
1 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 21.429 53.571 51.190  7.1429 21.429 50.000 59.524
5 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 36.905 55.952 63.095 17.8571 46.429 64.286 96.429
library(tidyverse)
dta_1 <- dta %>% 
  gather(time, Age_y, age86year:age92year) %>%
  mutate(Time=readr::parse_number(time)) %>%
  select(id, sex, race, Time, Age_y)
  
dta_2 <- dta %>%
  gather(time, Age_m, age86month:age92month)%>%
  mutate(Time=readr::parse_number(time))%>%
  select(Age_m)

dta_3 <- dta %>%
  gather(time, Grade, grade86:grade92) %>%
  mutate(Time=readr::parse_number(time))%>%
  select(Grade)

dta_4 <- dta %>%
  gather(time, Math, math86:math92) %>%
  mutate(Time=readr::parse_number(time)) %>%
  select(Math)

dta_5 <- dta %>%
  gather(time, Read, read86:read92) %>%
  mutate(Time=readr::parse_number(time)) %>%
  select(Read)
dta_L <- cbind(dta_1, dta_2, dta_3, dta_4, dta_5)
ggplot(data=dta_L, aes(x=Age_m, y=Read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(method ="lm", formula=y ~ x, se=F) +
 facet_grid(race ~ sex) +
 labs(x="Month", y="Reading score")

Homework 4

#read data
dta<- read.table("verbalIQ.txt", header = T)
str(dta)
'data.frame':   2287 obs. of  6 variables:
 $ school  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ pupil   : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
 $ viq     : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ language: int  46 45 33 46 20 30 30 57 36 36 ...
 $ csize   : int  29 29 29 29 29 29 29 29 29 29 ...
 $ ses     : int  23 10 15 23 10 10 23 10 13 15 ...
head(dta)
  school pupil  viq language csize ses
1      1 17001 15.0       46    29  23
2      1 17002 14.5       45    29  10
3      1 17003  9.5       33    29  15
4      1 17004 11.0       46    29  23
5      1 17005  8.0       20    29  10
6      1 17006  9.5       30    29  10

plot

ggplot(dta, aes(viq, language, group=school))+
  stat_smooth(method='lm',se=F, color='lightgray', lwd=rel(.5))+
  geom_point(size=.5)+
  labs(x='Verval IQ', y='Language Score')+
  theme_minimal()

conduct the linear regression by school

library(dplyr)
library(broom)
res_lm <- dta %>% 
  group_by(school) %>% 
  do(tidy(lm(language ~viq, data = .))) %>%
  as.data.frame(.)

head(res_lm)
  school        term estimate std.error statistic    p.value
1      1 (Intercept)  13.2994   5.74305    2.3157 0.02984019
2      1         viq   2.2384   0.54079    4.1392 0.00039794
3      2 (Intercept)  12.1671  11.19351    1.0870 0.32664636
4      2         viq   1.2830   1.21564    1.0554 0.33953566
5     10 (Intercept) -30.1000  15.17971   -1.9829 0.14165616
6     10         viq   5.7619   1.43211    4.0234 0.02758449

compute mean and sd of the intercept and slope

m_i <- res_lm %>%
  filter(term == '(Intercept)') %>%
  summarize(m_intercept = mean(estimate), sd_intersept = sd(estimate))

m_s <- res_lm %>%
  filter(term == 'viq') %>%
  summarize(m_slope = mean(estimate), sd_slope = sd(estimate))

m_all <- cbind(m_i, m_s)
head(m_all)
  m_intercept sd_intersept m_slope sd_slope
1      9.6341       16.027  2.5931   1.2792
lm_i <- res_lm %>%
  filter(term == '(Intercept)')

lm_s <- res_lm %>%
  filter(term == 'viq')

cor(lm_i$estimate, lm_s$estimate)
[1] -0.95948