1 Homework 1:

Draw a scatter plot of the variables written and course scores from the data set Gcsemv{mlmRev} and superimpose another scatter plot on it using mean written and mean course scores by school as in the language and math example. You can start by editing in RStudio either the gcsemv rmarkdown or the exam rmarkdown file.

1.1 Data management

# install.package("mlmRev")
library(mlmRev)

# load the data from the package
data(Gcsemv, package="mlmRev")

# view first 6 lines
head(Gcsemv)

  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

1.2 Summary statistics

with(Gcsemv, cor(written, course, use="pairwise"))

[1] 0.47417

# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm=T))

cor(course_schavg, written_schavg)

[1] 0.39568

1.3 Visualization

library(tidyverse)

dta <- Gcsemv %>% 
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use="pairwise"))

dtar <- dta[!duplicated(dta$school),"r_sch"]

with(dta, plot(written ~ course, 
                bty = 'n', 
                cex = 0.5,
                xlab = 'Written score', 
                ylab = 'Course'))
grid()
with(dta, abline(lm(written ~ course)))
points(written_schavg, course_schavg, pch=16, col=5)
abline(lm(course_schavg ~ written_schavg), col=5)

2 Homework 2:

In Rstudio, open and edit this R script from the SAT scores example to add an overall regression line of SAT score by teacher’s salary to three separate regression lines of the same set of variables by states with low, medium and high percentage of students taking the SAT exam.

2.1 input data

dta <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")

2.2 assign variable names

names(dta) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
                "Sat")

2.3 check data structure

str(dta)

'data.frame':   50 obs. of  8 variables:
 $ State : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
 $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
 $ Salary: num  31.1 48 32.2 28.9 41.1 ...
 $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
 $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
 $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
 $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...

2.4 look at the first 6 lines

head(dta)

       State Expend Ratio Salary Frac Verbal Math  Sat
1    Alabama  4.405  17.2 31.144    8    491  538 1029
2     Alaska  8.963  17.6 47.951   47    445  489  934
3    Arizona  4.778  19.3 32.175   27    448  496  944
4   Arkansas  4.459  17.1 28.934    6    482  523 1005
5 California  4.992  24.0 41.078   45    417  485  902
6   Colorado  5.443  18.4 34.571   29    462  518  980

2.5 load data management and plotting package

library(tidyverse)

2.6 create a factor variable with 3 levels from Frac

dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
                           labels = c("Low", "Medium", "High")))

2.7 plot

ggplot(data=dta, aes(x=Salary, y=Sat, label=State)) +
 stat_smooth(method="lm", 
             formula= y ~ x,
             se=F, 
             color="red", 
             linetype=2, 
             size=rel(.5)) +
 stat_smooth(aes(x=Salary, y=Sat, group=Fracf), method="lm", 
             formula= y ~ x,
             se=F, 
             color="gray", 
             linetype=2, 
             size=rel(.5)) +
 geom_text(aes(color=Fracf), 
           check_overlap=TRUE, 
           show.legend=FALSE, 
           size=rel(2)) +
 labs(x="Salary ($1000)", 
      y="SAT Score") +
 theme_bw()

3 Homework 3:

Reproduce the plot on the NLSY 86 example by completing the R script using the data file in wide format.

3.1 input data

dta <- read.csv("./data/nlsy86wide.csv")

3.2 inspect data structure

str(dta)

'data.frame':   166 obs. of  23 variables:
 $ id        : int  23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
 $ sex       : chr  "Female" "Female" "Female" "Male" ...
 $ race      : chr  "Majority" "Majority" "Majority" "Majority" ...
 $ grade86   : int  0 0 0 0 1 0 0 0 0 0 ...
 $ grade88   : int  2 1 2 1 3 2 1 3 2 2 ...
 $ grade90   : int  3 3 5 2 4 3 3 4 5 4 ...
 $ grade92   : int  5 6 6 5 6 5 5 6 6 5 ...
 $ age86year : int  6 6 6 5 7 5 6 7 6 6 ...
 $ age88year : int  8 8 8 8 9 8 8 9 9 8 ...
 $ age90year : int  10 10 10 9 11 10 10 11 11 10 ...
 $ age92year : int  12 12 12 12 13 12 12 13 13 12 ...
 $ age86month: int  67 66 67 60 78 62 66 79 76 67 ...
 $ age88month: int  96 95 95 91 108 93 94 109 104 94 ...
 $ age90month: int  119 119 122 112 132 117 118 131 128 117 ...
 $ age92month: int  142 143 144 139 155 139 140 154 151 139 ...
 $ math86    : num  14.29 20.24 17.86 7.14 29.76 ...
 $ math88    : num  15.5 36.9 22.6 21.4 50 ...
 $ math90    : num  38.1 52.4 53.6 53.6 47.6 ...
 $ math92    : num  41.7 58.3 58.3 51.2 71.4 ...
 $ read86    : num  19.05 21.43 21.43 7.14 30.95 ...
 $ read88    : num  29.8 32.1 45.2 21.4 50 ...
 $ read90    : num  28.6 45.2 69 50 63.1 ...
 $ read92    : num  45.2 57.1 78.6 59.5 82.1 ...

3.3 examine first 6 lines

head(dta)

     id    sex     race grade86 grade88 grade90 grade92 age86year age88year
1 23901 Female Majority       0       2       3       5         6         8
2 25601 Female Majority       0       1       3       6         6         8
3 37401 Female Majority       0       2       5       6         6         8
4 40201   Male Majority       0       1       2       5         5         8
5 63501   Male Majority       1       3       4       6         7         9
6 70301   Male Majority       0       2       3       5         5         8
  age90year age92year age86month age88month age90month age92month  math86
1        10        12         67         96        119        142 14.2857
2        10        12         66         95        119        143 20.2381
3        10        12         67         95        122        144 17.8571
4         9        12         60         91        112        139  7.1429
5        11        13         78        108        132        155 29.7619
6        10        12         62         93        117        139 14.2857
  math88 math90 math92  read86 read88 read90 read92
1 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 21.429 53.571 51.190  7.1429 21.429 50.000 59.524
5 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 36.905 55.952 63.095 17.8571 46.429 64.286 96.429

3.4 from wide to long format

dta_1 <- dta %>% 
  gather(time, Age_y, age86year:age92year) %>%
  mutate(Time=readr::parse_number(time)) %>%
  select(id, sex, race, Time, Age_y)
  
dta_2 <- dta %>%
  gather(time, Age_m, age86month:age92month)%>%
  mutate(Time=readr::parse_number(time))%>%
  select(Age_m)

dta_3 <- dta %>%
  gather(time, Grade, grade86:grade92) %>%
  mutate(Time=readr::parse_number(time))%>%
  select(Grade)

dta_4 <- dta %>%
  gather(time, Math, math86:math92) %>%
  mutate(Time=readr::parse_number(time)) %>%
  select(Math)

dta_5 <- dta %>%
  gather(time, Read, read86:read92) %>%
  mutate(Time=readr::parse_number(time)) %>%
  select(Read)


dta_L <- cbind(dta_1, dta_2, dta_3, dta_4, dta_5)

#dta_L <- mutate(Time=readr::parse_number(time))

3.5 plot

ggplot(data=dta_L, aes(x=Age_m, y=Read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(method ="lm", formula=y ~ x, se=F) +
 facet_grid(race ~ sex) +
 labs(x="Month", y="Reading score")

4 Homework 4:

For the IQ and language example, plot 131 school-level regression lines of language score against verbal IQ (similar to the figure on the page). Compute the mean and standard deviation of the intercept and slope estimates. What is the correlation coefficient between the two parameter estimates?

4.1 input data

dta <- read.table("./data/verbalIQ.txt", header = T)

4.2 inspect data structure

str(dta)

'data.frame':   2287 obs. of  6 variables:
 $ school  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ pupil   : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
 $ viq     : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ language: int  46 45 33 46 20 30 30 57 36 36 ...
 $ csize   : int  29 29 29 29 29 29 29 29 29 29 ...
 $ ses     : int  23 10 15 23 10 10 23 10 13 15 ...

head(dta)

  school pupil  viq language csize ses
1      1 17001 15.0       46    29  23
2      1 17002 14.5       45    29  10
3      1 17003  9.5       33    29  15
4      1 17004 11.0       46    29  23
5      1 17005  8.0       20    29  10
6      1 17006  9.5       30    29  10

4.3 Visualization

ggplot(dta, 
       aes(viq, 
           language, 
           group=school))+
 geom_point(alpha=.5) + 
 stat_smooth(method="lm", 
             formula=y ~ x, 
             se=F,
             col='black',
             lwd=rel(.5)) +
 labs(x="Verbal IQ ", 
      y="Language Score") +
 theme_minimal()

4.4 conduct the linear regression by school

#m1 <- lmList(viq ~language | school, data=dta)
library(dplyr)
library(broom)

res_lm <- dta %>% 
  group_by(school) %>% 
  do(tidy(lm(language ~viq, data = .))) %>%
  as.data.frame(.)

head(res_lm)

  school        term estimate std.error statistic    p.value
1      1 (Intercept)  13.2994   5.74305    2.3157 0.02984019
2      1         viq   2.2384   0.54079    4.1392 0.00039794
3      2 (Intercept)  12.1671  11.19351    1.0870 0.32664636
4      2         viq   1.2830   1.21564    1.0554 0.33953566
5     10 (Intercept) -30.1000  15.17971   -1.9829 0.14165616
6     10         viq   5.7619   1.43211    4.0234 0.02758449

4.5 compute mean and sd of the intercept and slope

m_i <- res_lm %>%
  filter(term == '(Intercept)') %>%
  summarize(m_intercept = mean(estimate), sd_intersept = sd(estimate))

m_s <- res_lm %>%
  filter(term == 'viq') %>%
  summarize(m_slope = mean(estimate), sd_slope = sd(estimate))

m_all <- cbind(m_i, m_s)
head(m_all)

  m_intercept sd_intersept m_slope sd_slope
1      9.6341       16.027  2.5931   1.2792

lm_i <- res_lm %>%
  filter(term == '(Intercept)')

lm_s <- res_lm %>%
  filter(term == 'viq')

cor(lm_i$estimate, lm_s$estimate)

[1] -0.95948

Homework exercises

Hao-Lun Fu

2020-09-21