1 HW 1

Draw a scatter plot of the variables written and course scores from the data set Gcsemv{mlmRev} and superimpose another scatter plot on it using mean written and mean course scores by school as in the language and math example. You can start by editing in RStudio either the gcsemv rmarkdown or the exam rmarkdown file.

1.1 Data management

# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# view first 6 lines
head(Gcsemv)
  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

1.2 Visualization

library(tidyverse)
dta1 <- Gcsemv %>% 
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use="pairwise")) 
head(dta1)
# A tibble: 6 x 6
# Groups:   school [1]
  school student gender written course r_sch
  <fct>  <fct>   <fct>    <dbl>  <dbl> <dbl>
1 20920  16      M           23   NA   0.789
2 20920  25      F           NA   71.2 0.789
3 20920  27      F           39   76.8 0.789
4 20920  31      F           36   87.9 0.789
5 20920  42      M           16   44.4 0.789
6 20920  62      F           36   NA   0.789
dta1_a <- dta1 %>%
        group_by(school) %>%
        summarize(ave_written = mean(written, na.rm=TRUE),
                  ave_course = mean(course, na.rm=TRUE))
ggplot(data=dta1, aes(x=written, y=course)) +
 geom_point(color="skyblue") +
 stat_smooth(method="lm", formula=y ~ x, se=F, col="skyblue") +
 geom_point(data=dta1_a, aes(ave_written, ave_course), color="steelblue") +
 stat_smooth(data=dta1_a, aes(ave_written, ave_course),
             method="lm", formula= y ~ x, se=F, color="steelblue") +
 labs(x="Written score", 
      y="Course score") +
 theme_bw()

1.3 The end

2 HW 2

In Rstudio, open and edit this R script from the SAT scores example to add an overall regression line of SAT score by teacher’s salary to three separate regression lines of the same set of variables by states with low, medium and high percentage of students taking the SAT exam.

2.1 input data

dta2 <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")

2.2 assign variable names

names(dta2) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
                "Sat")

2.3 check data structure

str(dta2)
'data.frame':   50 obs. of  8 variables:
 $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
 $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
 $ Salary: num  31.1 48 32.2 28.9 41.1 ...
 $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
 $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
 $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
 $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...

2.4 look at the first 6 lines

head(dta2)
       State Expend Ratio Salary Frac Verbal Math  Sat
1    Alabama  4.405  17.2 31.144    8    491  538 1029
2     Alaska  8.963  17.6 47.951   47    445  489  934
3    Arizona  4.778  19.3 32.175   27    448  496  944
4   Arkansas  4.459  17.1 28.934    6    482  523 1005
5 California  4.992  24.0 41.078   45    417  485  902
6   Colorado  5.443  18.4 34.571   29    462  518  980

2.5 load data management and plotting package

library(tidyverse)

2.6 create a factor variable with 3 levels from Frac

dta2 <- mutate(dta2, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
                           labels = c("Low", "Medium", "High")))
head(dta2)
       State Expend Ratio Salary Frac Verbal Math  Sat  Fracf
1    Alabama  4.405  17.2 31.144    8    491  538 1029    Low
2     Alaska  8.963  17.6 47.951   47    445  489  934 Medium
3    Arizona  4.778  19.3 32.175   27    448  496  944 Medium
4   Arkansas  4.459  17.1 28.934    6    482  523 1005    Low
5 California  4.992  24.0 41.078   45    417  485  902 Medium
6   Colorado  5.443  18.4 34.571   29    462  518  980 Medium

2.7 plot

ggplot(data=dta2, aes(x=Salary, y=Sat, label=State, group=Fracf)) +
 stat_smooth(method="lm", 
             formula= y ~ x,
             se=F, 
             color="gray", 
             linetype=2, 
             size=rel(.5)) +
  stat_smooth(aes(x = Salary, y = Sat, group =F), 
              method = "lm", 
              formula = y ~ x, 
              se = F, 
              color = "purple", 
              linetype = 1, 
              size =  rel(.8))+
 geom_text(aes(color=Fracf), 
           check_overlap=TRUE, 
           show.legend=FALSE, 
           size=rel(2)) +
 labs(x="Salary ($1000)", 
      y="SAT Score") +
 theme_bw()

2.8 end

3 HW 3

Reproduce the plot on the NLSY 86 example by completing the R script using the data file in wide format.

3.1 input data

dta3 <- read.csv("~/data/nlsy86wide.csv")

3.2 inspect data structure

str(dta3)
'data.frame':   166 obs. of  23 variables:
 $ id        : int  23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
 $ sex       : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
 $ race      : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
 $ grade86   : int  0 0 0 0 1 0 0 0 0 0 ...
 $ grade88   : int  2 1 2 1 3 2 1 3 2 2 ...
 $ grade90   : int  3 3 5 2 4 3 3 4 5 4 ...
 $ grade92   : int  5 6 6 5 6 5 5 6 6 5 ...
 $ age86year : int  6 6 6 5 7 5 6 7 6 6 ...
 $ age88year : int  8 8 8 8 9 8 8 9 9 8 ...
 $ age90year : int  10 10 10 9 11 10 10 11 11 10 ...
 $ age92year : int  12 12 12 12 13 12 12 13 13 12 ...
 $ age86month: int  67 66 67 60 78 62 66 79 76 67 ...
 $ age88month: int  96 95 95 91 108 93 94 109 104 94 ...
 $ age90month: int  119 119 122 112 132 117 118 131 128 117 ...
 $ age92month: int  142 143 144 139 155 139 140 154 151 139 ...
 $ math86    : num  14.29 20.24 17.86 7.14 29.76 ...
 $ math88    : num  15.5 36.9 22.6 21.4 50 ...
 $ math90    : num  38.1 52.4 53.6 53.6 47.6 ...
 $ math92    : num  41.7 58.3 58.3 51.2 71.4 ...
 $ read86    : num  19.05 21.43 21.43 7.14 30.95 ...
 $ read88    : num  29.8 32.1 45.2 21.4 50 ...
 $ read90    : num  28.6 45.2 69 50 63.1 ...
 $ read92    : num  45.2 57.1 78.6 59.5 82.1 ...

3.3 examine first 6 lines

head(dta3)
     id    sex     race grade86 grade88 grade90 grade92 age86year
1 23901 Female Majority       0       2       3       5         6
2 25601 Female Majority       0       1       3       6         6
3 37401 Female Majority       0       2       5       6         6
4 40201   Male Majority       0       1       2       5         5
5 63501   Male Majority       1       3       4       6         7
6 70301   Male Majority       0       2       3       5         5
  age88year age90year age92year age86month age88month age90month
1         8        10        12         67         96        119
2         8        10        12         66         95        119
3         8        10        12         67         95        122
4         8         9        12         60         91        112
5         9        11        13         78        108        132
6         8        10        12         62         93        117
  age92month  math86 math88 math90 math92  read86 read88 read90 read92
1        142 14.2857 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2        143 20.2381 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3        144 17.8571 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4        139  7.1429 21.429 53.571 51.190  7.1429 21.429 50.000 59.524
5        155 29.7619 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6        139 14.2857 36.905 55.952 63.095 17.8571 46.429 64.286 96.429

3.4 data management

dta3_ <- dta3[c(1:3, 12:15, 20:23)]
head(dta3_)
     id    sex     race age86month age88month age90month age92month
1 23901 Female Majority         67         96        119        142
2 25601 Female Majority         66         95        119        143
3 37401 Female Majority         67         95        122        144
4 40201   Male Majority         60         91        112        139
5 63501   Male Majority         78        108        132        155
6 70301   Male Majority         62         93        117        139
   read86 read88 read90 read92
1 19.0476 29.762 28.571 45.238
2 21.4286 32.143 45.238 57.143
3 21.4286 45.238 69.048 78.571
4  7.1429 21.429 50.000 59.524
5 30.9524 50.000 63.095 82.143
6 17.8571 46.429 64.286 96.429
library(tidyverse)
dta3L_month <- dta3_[1:7] %>% 
  gather(time, month, age86month:age92month) %>%
  mutate(time = parse_number(time))

dta3L_read <- dta3_[c(1:3, 8:11)] %>% 
  gather(time, read, read86:read92) %>%
  mutate(time = parse_number(time))
str(dta3L_read)
'data.frame':   664 obs. of  5 variables:
 $ id  : int  23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
 $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
 $ race: Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
 $ time: num  86 86 86 86 86 86 86 86 86 86 ...
 $ read: num  19.05 21.43 21.43 7.14 30.95 ...
dta3L <- merge(dta3L_month, dta3L_read, by = c("id", "sex", "race", "time"))
str(dta3L)
'data.frame':   664 obs. of  6 variables:
 $ id   : int  1003201 1003201 1003201 1003201 1012901 1012901 1012901 1012901 1013201 1013201 ...
 $ sex  : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 1 1 ...
 $ race : Factor w/ 2 levels "Majority","Minority": 2 2 2 2 2 2 2 2 2 2 ...
 $ time : num  86 88 90 92 86 88 90 92 86 88 ...
 $ month: int  60 91 116 138 75 103 127 151 69 99 ...
 $ read : num  10.7 36.9 36.9 45.2 21.4 ...

3.5 plot

ggplot(data=dta3L, aes(x = month, y = read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(method ="lm", formula=y ~ x, se=F) +
 facet_grid(race ~ sex) +
 labs(x="Month", y="Reading score") +
 theme_bw()

3.6 end

4 HW 4

For the IQ and language example, plot 131 school-level regression lines of language score against verbal IQ (similar to the figure on the page). Compute the mean and standard deviation of the intercept and slope estimates. What is the correlation coefficient between the two parameter estimates?

Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ Column 4: Language achievement score Column 5: The number of 8th graders in the classroom Column 6: Socioeconomic status score

4.1 input data

dta4 <- read.table("~/data/verbalIQ.txt", header = T)
head(dta4)
  school pupil  viq language csize ses
1      1 17001 15.0       46    29  23
2      1 17002 14.5       45    29  10
3      1 17003  9.5       33    29  15
4      1 17004 11.0       46    29  23
5      1 17005  8.0       20    29  10
6      1 17006  9.5       30    29  10

4.2 Visualization

ggplot(data = dta4, aes(x = viq, y = language, group = school)) +
  stat_smooth(method ="lm", formula=y ~ x, se=F, color = "grey")+
  geom_point() +
  labs(x = "Verbal IQ",
       y = "Language Score") +
  theme_bw()

cor.test(dta4$viq, dta4$language)

    Pearson's product-moment correlation

data:  dta4$viq and dta4$language
t = 36.8, df = 2280, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.58341 0.63494
sample estimates:
    cor 
0.60982 
viq_avg <- with(dta4, tapply(viq, school, mean, na.rm = T))
language_avg <- with(dta4, tapply(language, school, mean, na.rm = T))
cor.test(viq_avg, language_avg)

    Pearson's product-moment correlation

data:  viq_avg and language_avg
t = 13.9, df = 129, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.69418 0.83440
sample estimates:
   cor 
0.7736