Exercise #01

Refer to the questions here.

Q1

library(MASS)
# count the sum of female
dta <- minn38
sum_sex <- aggregate(dta$f, list(dta$sex), sum)
sum_sex[1,2]

## [1] 7861

# count the sum of female to college
f.dta <- minn38[85:168,]
female_phs <- aggregate(f.dta$f, list(f.dta$phs), sum)
female_phs[1,2]

## [1] 2027

Q2

library("bitops")
library("RCurl")
library("psych")
#read in data
dta <- read.table(textConnection(getURL("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/verbalIQ.txt")), header = TRUE)

mydta <- dta[,-c(1,5)]
describe(mydta)

##          vars    n       mean        sd  median    trimmed       mad   min
## pupil       1 2287 1337225.18 705458.10 1417007 1341135.43 904383.03 17001
## viq         2 2287      11.83      2.07      12      11.88      1.48     4
## language    3 2287      40.93      9.00      42      41.51     10.38     9
## ses         4 2287      27.81     10.91      27      27.26     10.38    10
##              max   range  skew kurtosis       se
## pupil    2587010 2570009 -0.06    -1.18 14751.57
## viq           18      14 -0.23     0.67     0.04
## language      58      49 -0.52    -0.32     0.19
## ses           50      40  0.40    -0.75     0.23

#do correlation
lm(language ~ viq+ses, data = mydta)

## 
## Call:
## lm(formula = language ~ viq + ses, data = mydta)
## 
## Coefficients:
## (Intercept)          viq          ses  
##      8.3313       2.4054       0.1488

Q3

dta <- nlschools
dta_mean <- aggregate(dta[,1:2], by = list(dta$class), mean)
head(dta_mean)

##   Group.1     lang        IQ
## 1     180 36.40000 10.320000
## 2     280 23.71429  9.000000
## 3    1082 30.40000 10.500000
## 4    1280 30.86667  9.433333
## 5    1580 30.87500 11.500000
## 6    1680 41.50000 11.687500

Q4

data("Gcsemv", package = "mlmRev")
dta <-Gcsemv
complete_dta <- dta[complete.cases(dta), ]  #remove rows with nas

# correlation between written and course / or class in gender first
Gcs_m <- aggregate(cbind(written, course) ~ gender, data = complete_dta,
                   FUN = mean)
with(complete_dta, plot(written, course, col = "gray"))
abline(lm(course ~ written, data = complete_dta), col = "gray")
with(Gcs_m, points(course, written))
abline(lm(written ~ course, data = Gcs_m))

Q5

library(magrittr)
library(plyr)
library(lattice)
library(ggplot2)

data("Gcsemv", package = "mlmRev")
dta <-Gcsemv
complete_dta <- dta[complete.cases(dta), ]

# all correlation
dtaall_cor <- cor(complete_dta$written, complete_dta$course)

# find each school's correlation
each_cor <- ddply(complete_dta, "school", summarise, corr=cor(written, course))
each_cor %<>% na.omit(each_cor)

# draw histogram
ggplot(data=each_cor, aes(each_cor$corr)) +
  geom_histogram(fill = "skyblue") +
  labs(x = "Correlation coefficients", y = "count") +
  geom_vline(xintercept = dtaall_cor) +
  geom_vline(xintercept = mean(each_cor[,2]), linetype="dotted", col="red")

Q6

library(pacman)

## Warning: package 'pacman' was built under R version 3.4.2

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 3.4.2

# packages in use in this script
pacman::p_load(tidyverse, car, HSAUR3)

# make sure data is invoked
data(BtheB)

# data structure
str(BtheB)

## 'data.frame':    100 obs. of  8 variables:
##  $ drug     : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ length   : Factor w/ 2 levels "<6m",">6m": 2 2 1 2 2 1 1 2 1 2 ...
##  $ treatment: Factor w/ 2 levels "TAU","BtheB": 1 2 1 2 2 2 1 1 2 2 ...
##  $ bdi.pre  : num  29 32 25 21 26 7 17 20 18 20 ...
##  $ bdi.2m   : num  2 16 20 17 23 0 7 20 13 5 ...
##  $ bdi.3m   : num  2 24 NA 16 NA 0 7 21 14 5 ...
##  $ bdi.5m   : num  NA 17 NA 10 NA 0 3 19 20 8 ...
##  $ bdi.8m   : num  NA 20 NA 9 NA 0 7 13 11 12 ...

# first 6 lines
head(BtheB)

##   drug length treatment bdi.pre bdi.2m bdi.3m bdi.5m bdi.8m
## 1   No    >6m       TAU      29      2      2     NA     NA
## 2  Yes    >6m     BtheB      32     16     24     17     20
## 3  Yes    <6m       TAU      25     20     NA     NA     NA
## 4   No    >6m     BtheB      21     17     16     10      9
## 5  Yes    >6m     BtheB      26     23     NA     NA     NA
## 6  Yes    <6m     BtheB       7      0      0      0      0

# add ID to the original data and save a copy to dta
dta <- data.frame(ID = rownames(BtheB), BtheB)

# rename variables 3-7 to different names
names(dta)[5:9] <- paste0("month_", c(0, 2, 3, 5, 8))

# convert wide to long format
# add time variable and remove useless column
dtaL <- dta %>% gather(Test, Score, 5:9) %>%
  separate(Test, c("Var", "Month")) %>%
  mutate(Month = as.numeric(Month) , Group = treatment) %>%

  dplyr::select( -Var)

#
ot <- theme_set(theme_bw())

# panel plot by group
ggplot(data = dtaL , aes(x = Month, y = Score, group = ID)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ Group)

## Warning: Removed 120 rows containing missing values (geom_point).

## Warning: Removed 120 rows containing missing values (geom_path).

# means plus-minus CIs
ggplot(data = dtaL , aes(x = Month, y = Score, color = Group)) +
  stat_summary(fun.data = "mean_cl_boot", geom = "pointrange",
               position = position_dodge(.3))

## Warning: Removed 120 rows containing non-finite values (stat_summary).

Q7

library(WWGbook)

# load in data
dta <- autism

#turn into factor
sicdegp <- factor(dta$sicdegp)

#plot
g1 <- ggplot(data = dta , aes(x = age, y = vsae, group = childid)) +
      geom_point() +
      geom_line() +
      facet_wrap(~ sicdegp)
g2 <- g1 + labs(x = "Age (years) ", y= "VSAE score")
g2

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_path).