1 Homework Exercises 1

1.1 Data management

# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# invoke help document
?Gcsemv
# view first 6 lines
head(Gcsemv)
  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

1.2 Summary statistics

with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school , mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568

1.3 Visualization

library(tidyverse)
dta <- Gcsemv %>% 
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use="pairwise")) 
dtar <- dta[!duplicated(dta$school),"r_sch"]
# superimpose two plots
ggplot(data=Gcsemv, aes(x=course, y=written)) +
 geom_point(color="lightskyblue1") +
 geom_point(data=dtar, aes(x=course_schavg, y=written_schavg), color="steelblue") +labs(x="course score", 
      y="written score") +
 theme_bw()

## The end

2 Homework Exercises 2: SAT by states

2.1 input data

dta <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")

##assign variable names

names(dta) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
                "Sat")

2.2 check data structure

str(dta)
'data.frame':   50 obs. of  8 variables:
 $ State : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
 $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
 $ Salary: num  31.1 48 32.2 28.9 41.1 ...
 $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
 $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
 $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
 $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...

2.3 look at the first 6 lines

head(dta)
       State Expend Ratio Salary Frac Verbal Math  Sat
1    Alabama  4.405  17.2 31.144    8    491  538 1029
2     Alaska  8.963  17.6 47.951   47    445  489  934
3    Arizona  4.778  19.3 32.175   27    448  496  944
4   Arkansas  4.459  17.1 28.934    6    482  523 1005
5 California  4.992  24.0 41.078   45    417  485  902
6   Colorado  5.443  18.4 34.571   29    462  518  980

2.4 load data management and plotting package

library(tidyverse)

2.5 create a factor variable with 3 levels from Frac

dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
                           labels = c("Low", "Medium", "High")))

2.6 plot

ggplot(data=dta, aes(x=Salary, y=Sat, label=State)) +
   stat_smooth(method="lm", 
             formula= y ~ x,
             se=F, 
             color="black", 
             linetype=3, 
             size=rel(.5)) +
   stat_smooth(aes(group = Fracf),
               method="lm", 
               formula= y ~ x,
               se=F, 
               color="gray", 
               linetype=2, 
               size=rel(.5))+
   geom_text(aes(color=Fracf), 
           check_overlap=TRUE, 
           show.legend=FALSE, 
           size=rel(2)) +
   labs(x="Salary ($1000)", 
      y="SAT Score") +
   theme_bw()

2.7 end

3 Homework Exercises 3: “Individual growth curves by groups”

##“2020-09-20”

3.1 input data

dta <- read.csv("C:\\Users\\88696\\Desktop\\20200914\\homework3\\nlsy86wide.csv")

3.2 inspect data structure

str(dta)
'data.frame':   166 obs. of  23 variables:
 $ id        : int  23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
 $ sex       : chr  "Female" "Female" "Female" "Male" ...
 $ race      : chr  "Majority" "Majority" "Majority" "Majority" ...
 $ grade86   : int  0 0 0 0 1 0 0 0 0 0 ...
 $ grade88   : int  2 1 2 1 3 2 1 3 2 2 ...
 $ grade90   : int  3 3 5 2 4 3 3 4 5 4 ...
 $ grade92   : int  5 6 6 5 6 5 5 6 6 5 ...
 $ age86year : int  6 6 6 5 7 5 6 7 6 6 ...
 $ age88year : int  8 8 8 8 9 8 8 9 9 8 ...
 $ age90year : int  10 10 10 9 11 10 10 11 11 10 ...
 $ age92year : int  12 12 12 12 13 12 12 13 13 12 ...
 $ age86month: int  67 66 67 60 78 62 66 79 76 67 ...
 $ age88month: int  96 95 95 91 108 93 94 109 104 94 ...
 $ age90month: int  119 119 122 112 132 117 118 131 128 117 ...
 $ age92month: int  142 143 144 139 155 139 140 154 151 139 ...
 $ math86    : num  14.29 20.24 17.86 7.14 29.76 ...
 $ math88    : num  15.5 36.9 22.6 21.4 50 ...
 $ math90    : num  38.1 52.4 53.6 53.6 47.6 ...
 $ math92    : num  41.7 58.3 58.3 51.2 71.4 ...
 $ read86    : num  19.05 21.43 21.43 7.14 30.95 ...
 $ read88    : num  29.8 32.1 45.2 21.4 50 ...
 $ read90    : num  28.6 45.2 69 50 63.1 ...
 $ read92    : num  45.2 57.1 78.6 59.5 82.1 ...

3.3 examine first 6 lines

head(dta)
     id    sex     race grade86 grade88 grade90 grade92 age86year age88year
1 23901 Female Majority       0       2       3       5         6         8
2 25601 Female Majority       0       1       3       6         6         8
3 37401 Female Majority       0       2       5       6         6         8
4 40201   Male Majority       0       1       2       5         5         8
5 63501   Male Majority       1       3       4       6         7         9
6 70301   Male Majority       0       2       3       5         5         8
  age90year age92year age86month age88month age90month age92month  math86
1        10        12         67         96        119        142 14.2857
2        10        12         66         95        119        143 20.2381
3        10        12         67         95        122        144 17.8571
4         9        12         60         91        112        139  7.1429
5        11        13         78        108        132        155 29.7619
6        10        12         62         93        117        139 14.2857
  math88 math90 math92  read86 read88 read90 read92
1 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 21.429 53.571 51.190  7.1429 21.429 50.000 59.524
5 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 36.905 55.952 63.095 17.8571 46.429 64.286 96.429
library(dplyr)

long_grade <- dta %>% gather (key = grade_y, value = grade, grade86:grade92)
longgrade <- long_grade[,c("id","sex","race","grade_y","grade")]

long_ageyear <- dta %>% gather (key = age_y, value = ageyear, age86year:age92year)
longageyear <- long_ageyear[,c("id","sex","race","age_y","ageyear")]

long_agemonth <- dta %>% gather (key = age_m, value = agemonth, age86month:age92month)
longagemonth <- long_agemonth[,c("id","sex","race","age_m","agemonth")]

long_math <- dta %>% gather (key = math_y, value = math, math86:math92)
longmath <- long_math[,c("id","sex","race","math_y","math")]

long_read <- dta %>% gather (key = read_y, value = read, read86:read92)
longread <- long_read[,c("id","sex","race","read_y","read")]

longdta <- cbind ((longgrade[,c("id","sex","race","grade_y","grade")]),
                      (longageyear[,c("id","age_y","ageyear")]),
                      (longagemonth[,c("id","age_m","agemonth")]), 
                      (longmath[,c("id","math_y","math")]), 
                      (longread[,c("id","read_y","read")]),by= "id")

longdtafinal <- longdta[,c("id","sex","race","ageyear","agemonth","math","read")]
head(longdtafinal)
     id    sex     race ageyear agemonth    math    read
1 23901 Female Majority       6       67 14.2857 19.0476
2 25601 Female Majority       6       66 20.2381 21.4286
3 37401 Female Majority       6       67 17.8571 21.4286
4 40201   Male Majority       5       60  7.1429  7.1429
5 63501   Male Majority       7       78 29.7619 30.9524
6 70301   Male Majority       5       62 14.2857 17.8571

3.4 plot

library(tidyverse)
ggplot(data=longdtafinal, aes(x=agemonth, y=read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(mapping = NULL,
             data = NULL,
             geom = "smooth",
             position = "identity",
             method ="lm", 
             formula= y ~ x,
             se=F, 
             fullrange = FALSE,
             level = 0.95,
             color="skyblue2", 
             linetype=1, 
             size=rel(.1)) +
  facet_grid(rows = vars(race),
             cols = vars(sex),
             scales = "free",
             space = "free",
             shrink = T,
             labeller = "label_value",
             as.table = T,
             switch = NULL,
             drop = T,
             margins = T, #F就變成原本的四格圖
             facets = NULL)  +
  labs(x="Month", y="Reading score") +
  theme_bw()

ggplot(data=longdtafinal, aes(x=agemonth, y=read, group=id)) +
 geom_point(size=rel(.5)) +
 stat_smooth(mapping = NULL,
             data = NULL,
             geom = "smooth",
             position = "identity",
             method ="lm", 
             formula= y ~ x,
             se=F, 
             fullrange = FALSE,
             level = 0.95,
             color="skyblue2", 
             linetype=1, 
             size=rel(.1)) +
  facet_grid(rows = vars(race),
             cols = vars(sex),
             scales = "free",
             space = "free",
             shrink = T,
             labeller = "label_value",
             as.table = T,
             switch = NULL,
             drop = T,
             margins = F, 
             facets = NULL)  +
  labs(x="Month", y="Reading score") +
  theme_bw()

3.5 end

4 Homework Exercises 4:

author: “Chang-Chun Chen” date: “2020-09-20”

4.1 Input data

dta <- read.table("C:\\Users\\88696\\Desktop\\20200914\\homework4\\verbalIQ.txt",header=T)
head (dta)
  school pupil  viq language csize ses
1      1 17001 15.0       46    29  23
2      1 17002 14.5       45    29  10
3      1 17003  9.5       33    29  15
4      1 17004 11.0       46    29  23
5      1 17005  8.0       20    29  10
6      1 17006  9.5       30    29  10

4.2 plot 131 school-level regression lines of language score against verbal IQ

library(tidyverse)
ggplot(data=dta, aes(x=viq, y=language, group = school)) +
  geom_point() + 
  geom_smooth(method="lm", 
              formula= y ~ x,
              se=F, 
              color="gray", 
              linetype=2, 
              size=rel(.5)) +
    labs(x="Verbal IQ", 
       y="language") +
  theme_bw()

4.3 Compute the mean and standard deviation of the intercept and slope estimates.

mlang <- with(dta, tapply(language, school, mean))
mviq <- with(dta, tapply(viq, school, mean))
model=lm(formula = mlang ~ mviq, data = dta)
summary(model)

Call:
lm(formula = mlang ~ mviq, data = dta)

Residuals:
   Min     1Q Median     3Q    Max 
-8.416 -1.751  0.046  2.218  6.889 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   -7.269      3.431   -2.12    0.036
mviq           4.049      0.292   13.87   <2e-16

Residual standard error: 3.35 on 129 degrees of freedom
Multiple R-squared:  0.598, Adjusted R-squared:  0.595 
F-statistic:  192 on 1 and 129 DF,  p-value: <2e-16

4.4 Correlation coefficient between language score and verbal IQ

with(dta, cor(viq, language, method = "pearson"))
[1] 0.60982

4.5 Correlation coefficient visualization

library(tidyverse)
library(ggplot2)
library(nlme)
library(dplyr)
library(gapminder)

ggplot(data=dta, aes(x=viq, y=language)) +
  geom_point() + 
  geom_smooth(method="lm", 
              formula= y ~ x,
              se=F, 
              level = 0.95,
              color="red", 
              linetype=2, 
              size=rel(.5)) +
    labs(x="Verbal IQ", y="language") +
  theme_bw()

##The end