Problem statement

Reproduce the plot on the NLSY 86 example by completing the R script using the data file in wide format.

Introduction

The data are drawn from the National Longitudinal Survey of Youth (NLSY). The sample observations are from the 1986, 1988, 1990, and 1992 assessment periods. Children were selected to be in kindergarten, first, and second grade and to be of age 5, 6, or 7 at the first assessment (1986). Both reading and mathematical achievement scores are recorded. The former is a recognition subscore of the Peabody Individual Achievement Test (PIAT). This was scaled as the percentage of 84 items that were answered correctly. The same 84 items were administered at all four time points, providing a consistent scale over time. The data set is a subsample of 166 subjects with complete observations.

Source: Bollen, K.A. & Curran, P.J. (2006). Latent curve models. A structural equation perspective. p.59.

Column 1: Student ID Column 2: Gender, male or female Column 3: Race, minority or majority Column 4: Measurement occasions Column 5: Grade at which measurements were made, Kindergarten = 0, First grade = 1, Second grade = 2 Column 6: Age in years Column 7: Age in months Column 8: Math score Column 9: Reading score

Data management

# input data
dta <- read.csv("C:/Users/Ching-Fang Wu/Documents/lmm/nlsy86wide.csv", h=T)

# inspect data structure
str(dta)

'data.frame':   166 obs. of  23 variables:
 $ id        : int  23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
 $ sex       : chr  "Female" "Female" "Female" "Male" ...
 $ race      : chr  "Majority" "Majority" "Majority" "Majority" ...
 $ grade86   : int  0 0 0 0 1 0 0 0 0 0 ...
 $ grade88   : int  2 1 2 1 3 2 1 3 2 2 ...
 $ grade90   : int  3 3 5 2 4 3 3 4 5 4 ...
 $ grade92   : int  5 6 6 5 6 5 5 6 6 5 ...
 $ age86year : int  6 6 6 5 7 5 6 7 6 6 ...
 $ age88year : int  8 8 8 8 9 8 8 9 9 8 ...
 $ age90year : int  10 10 10 9 11 10 10 11 11 10 ...
 $ age92year : int  12 12 12 12 13 12 12 13 13 12 ...
 $ age86month: int  67 66 67 60 78 62 66 79 76 67 ...
 $ age88month: int  96 95 95 91 108 93 94 109 104 94 ...
 $ age90month: int  119 119 122 112 132 117 118 131 128 117 ...
 $ age92month: int  142 143 144 139 155 139 140 154 151 139 ...
 $ math86    : num  14.29 20.24 17.86 7.14 29.76 ...
 $ math88    : num  15.5 36.9 22.6 21.4 50 ...
 $ math90    : num  38.1 52.4 53.6 53.6 47.6 ...
 $ math92    : num  41.7 58.3 58.3 51.2 71.4 ...
 $ read86    : num  19.05 21.43 21.43 7.14 30.95 ...
 $ read88    : num  29.8 32.1 45.2 21.4 50 ...
 $ read90    : num  28.6 45.2 69 50 63.1 ...
 $ read92    : num  45.2 57.1 78.6 59.5 82.1 ...

# examine first 6 lines
knitr::kable(head(dta))

id	sex	race	grade86	grade88	grade90	grade92	age86year	age88year	age90year	age92year	age86month	age88month	age90month	age92month	math86	math88	math90	math92	read86	read88	read90	read92
23901	Female	Majority	0	2	3	5	6	8	10	12	67	96	119	142	14.2857	15.476	38.095	41.667	19.0476	29.762	28.571	45.238
25601	Female	Majority	0	1	3	6	6	8	10	12	66	95	119	143	20.2381	36.905	52.381	58.333	21.4286	32.143	45.238	57.143
37401	Female	Majority	0	2	5	6	6	8	10	12	67	95	122	144	17.8571	22.619	53.571	58.333	21.4286	45.238	69.048	78.571
40201	Male	Majority	0	1	2	5	5	8	9	12	60	91	112	139	7.1429	21.429	53.571	51.190	7.1429	21.429	50.000	59.524
63501	Male	Majority	1	3	4	6	7	9	11	13	78	108	132	155	29.7619	50.000	47.619	71.429	30.9524	50.000	63.095	82.143
70301	Male	Majority	0	2	3	5	5	8	10	12	62	93	117	139	14.2857	36.905	55.952	63.095	17.8571	46.429	64.286	96.429

library(dplyr)
#install.packages("tidyr")
library(tidyr)
# 寬資料轉換成長資料
long_grade <- dta %>% gather (key = grade_y, value = grade, grade86:grade92)
longgrade <- long_grade[,c("id","sex","race","grade_y","grade")]

long_ageyear <- dta %>% gather (key = age_y, value = ageyear, age86year:age92year)
longageyear <- long_ageyear[,c("id","sex","race","age_y","ageyear")]

long_agemonth <- dta %>% gather (key = age_m, value = agemonth, age86month:age92month)
longagemonth <- long_agemonth[,c("id","sex","race","age_m","agemonth")]

long_math <- dta %>% gather (key = math_y, value = math, math86:math92)
longmath <- long_math[,c("id","sex","race","math_y","math")]

long_read <- dta %>% gather (key = read_y, value = read, read86:read92)
longread <- long_read[,c("id","sex","race","read_y","read")]

longdta <- cbind ((longgrade[,c("id","sex","race","grade_y","grade")]),
                      (longageyear[,c("id","age_y","ageyear")]),
                      (longagemonth[,c("id","age_m","agemonth")]), 
                      (longmath[,c("id","math_y","math")]), 
                      (longread[,c("id","read_y","read")]),by= "id")
                  
longdtafinal<-longdta[,c("id","sex","race","ageyear","agemonth","math","read")]

head(longdtafinal)

     id    sex     race ageyear agemonth    math    read
1 23901 Female Majority       6       67 14.2857 19.0476
2 25601 Female Majority       6       66 20.2381 21.4286
3 37401 Female Majority       6       67 17.8571 21.4286
4 40201   Male Majority       5       60  7.1429  7.1429
5 63501   Male Majority       7       78 29.7619 30.9524
6 70301   Male Majority       5       62 14.2857 17.8571

Visualization

library(tidyverse)
# plot
ggplot(data=longdtafinal, aes(x=agemonth, y=read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(mapping = NULL,
             data = NULL,
             geom = "smooth",
             position = "identity",
             method ="lm", 
             formula= y ~ x,
             se=F, 
             fullrange = FALSE,
             level = 0.95,
             color="skyblue2", 
             linetype=1, 
             size=rel(.1)) +
  facet_grid(rows = vars(race),
             cols = vars(sex),
             scales = "free",
             space = "free",
             shrink = T,
             labeller = "label_value",
             as.table = T,
             switch = NULL,
             drop = T,
             margins = T, #F就變成原本的四格圖
             facets = NULL)  +
  labs(x="Month", y="Reading score") +
  theme_bw()

ggplot(data=longdtafinal, aes(x=agemonth, y=read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(mapping = NULL,
             data = NULL,
             geom = "smooth",
             position = "identity",
             method ="lm", 
             formula= y ~ x,
             se=F, 
             fullrange = FALSE,
             level = 0.95,
             color="skyblue2", 
             linetype=1, 
             size=rel(.1)) +
  facet_grid(rows = vars(race),
             cols = vars(sex),
             scales = "free",
             space = "free",
             shrink = T,
             labeller = "label_value",
             as.table = T,
             switch = NULL,
             drop = T,
             margins = F, 
             facets = NULL)  +
  labs(x="Month", y="Reading score") +
  theme_bw()

W2 Exercise 3：Individual growth curves by groups

Ching-Fang Wu

14 September 2020

Problem statement

Introduction

Data management

Visualization

The End