In-class exercise

library(tidyverse)
library(ggplot2)
library(lattice)
library(dplyr)

Q1.

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

## 用base graphic方式建一空圖(type='n')，畫出一個點
plot(women, type='n')
points(women[1,])

## 用xyplot畫women資料集，選取子集(row.name是1的)，點圖(type='p')
lattice::xyplot(weight ~ height, 
  data=women,
  subset=row.names(women)==1, type='p')

## 用ggplot畫women第一筆資料，橫軸height縱軸weight，點圖
ggplot(data=women[1,], aes(height, weight))+
  geom_point()

Q2.

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class. The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below.

Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

Column 1: School ID 
Column 2: Pupil ID 
Column 3: Verbal IQ score 
Column 4: The number of pupils in a class
Column 5: Language test score 
Column 6: Arithmetic test score

dta <- read.table("langMathDutch.txt",header = T)
head(dta)

dta %>% 
  mutate(iqvL = cut(IQV, ordered = TRUE, 
                    breaks = quantile(IQV, probs = c(0, 0.33, 0.67, 1)), 
                    labels = c("Low", "Middle", "High")), 
         clsS = cut(size, ordered = TRUE,
                    breaks = quantile(size, probs = c(0, 0.33, 0.67, 1)),
                    labels = c("Small", "Medium", "Large")),
         tmp = paste(clsS, iqvL, sep = ",")) %>% 
  .[complete.cases(.),] %>%
  qplot(lang, arith, data = ., geom = c("point", "smooth"), method = "lm", 
        facets =  ~ tmp, shape = I(18),
        xlab="Language score",ylab="Arithmentic score")

## Warning: Ignoring unknown parameters: method

## Warning: Ignoring unknown parameters: shape

## `geom_smooth()` using formula 'y ~ x'

Q3.

Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories; food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page. You might want to transform the dollar amounts to log base 10 unit first.

USPersonalExpenditure %>% as.data.frame() %>% 
  mutate(categories = row.names(.)) %>% 
  gather(key = age, value = dollars, 1:5) %>%
  mutate(dollars = log10(dollars)) %>%
  qplot(dollars, age, data = .) +
    geom_segment(aes(xend = 1, yend = age)) +
    geom_vline(xintercept = 1, colour = "grey50") +
    facet_wrap(~ categories, nrow = 1)

Q4.

A sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age. Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.

Data: autism{WWGbook}

Column 1: Age (in years) 
Column 2: Vineland Socialization Age Equivalent score 
Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) 
Column 4: Child ID

dta <- WWGbook::autism
head(dta)

dta$sicdegp <- factor(dta$sicdegp, levels = c(1,2,3), labels = c("Low", "Medium", "High"))
dta <- na.omit(dta)
ggplot(dta, aes(x = age, y = vsae)) +
  geom_line(aes(group = childid), alpha = 0.5) +
  geom_point(alpha = 0.5) +
  stat_smooth(method="lm", formula=y ~ poly(x, 2)) +
  facet_wrap(. ~ sicdegp) +
  labs(x="Age (in years, centered) ", y="VASE score") +
  theme_bw()

pd <- position_dodge(.3) # 使他們不要互相蓋住
p <- dta %>% group_by(age, sicdegp) %>%
  summarize(m_v=mean(vsae),
            se_v=sd(vsae)/sqrt(n())) %>%
  ggplot() +
  aes(age, m_v,
      group=sicdegp,
      shape=sicdegp) +
  geom_errorbar(aes(ymin=m_v - se_v,
                    ymax=m_v + se_v),
                width=.5, size=.3,
                position=pd) + # position 重疊可閃開
  geom_line(position=pd,
            linetype='dotted') +
  geom_point(position=pd,
             size=rel(3)) +
  labs(x="Age(in years - 2)", y="VASE score") +
  theme(legend.position=c(.1, .8)) +
  theme_bw()
p

Q5.

Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.

library(ggalluvial)

## Warning: package 'ggalluvial' was built under R version 3.6.2

dta <- read.csv("diabetes_mell.csv",header = T)
head(dta)

dta <- data.frame(with(dta[, c("race", "gender", "diabetes", "BMI")],
                       xtabs(~ race + gender + diabetes + BMI)))
head(dta)

ggplot(dta,
       aes(axis1=race,
           axis2=gender,
           axis3=diabetes,
           y=Freq)) +
  scale_x_discrete(limits=c("race",
                            "gender",
                            "diabetes"),
                   expand=c(.1, .05)) +
  labs(y='No. individuals') +
  ggtitle('Diabetes in overall population in US 2009-2010',
          subtitle = 'straitified by race, gender and diabetes mellitus') +
  geom_alluvium(aes(fill=BMI)) +
  geom_stratum() +
  geom_text(stat="stratum",
            infer.label=TRUE) +
  scale_fill_manual(values=c('gray40','tan1'))+
  theme_minimal() +
  theme(legend.position = 'bottom')

Q6.

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

## library the packages and open the R Documentation of ggplot2
# library(ggplot2)
# ?ggplot2

## install packages and library it
# install.packages("gapminder")
library(gapminder)

## get data and check the data structure
data(gapminder)
str(gapminder)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

## assign the data to the gap variable
gap <- gapminder

## plot empty
ggplot(data = gap, aes(x = lifeExp))

## plot histogram of lifeExp
ggplot(data = gap, aes(x = lifeExp)) + 
    geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## setting histogram parameters, title, xlab, ylab, and theme
ggplot(data = gap, aes(x = lifeExp)) + 
  geom_histogram(fill = "blue", color = "black", bins = 10) + 
  ggtitle("Life expectancy for the gap dataset") + 
  xlab("Life expectancy (years)") + 
  ylab("Frequency") + 
  theme_classic()

## boxplot with each factor labels different colors(fill)
ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + 
  geom_boxplot() + 
  ggtitle("Boxplots for lifeExp by continent") + 
  xlab("Continent") + 
  ylab("Life expectancy (years)") +
  theme_minimal() #+

  #guides(fill = FALSE) 

# What happens if you un-hashtage `guides(fill = FALSE)` and the plus sign in lines 68 and 69 above? 
# legend will disappear

## plot of lifeExp and gdpPercap with continent grouped(different shapes and colors), setting theme, title ,and lab. Text elements size and position adjusted
ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20),
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 5),
          axis.text.x = element_text(angle = 45, hjust = 1))

Exercises

Q1.

Fifty male and fifty female students fill out the same questionnaire in weekly intervals starting five weeks before an important examination to measure state anxiety. The research interests are: 1. whether there are gender difference in state anxiety 2. individual differences in state anxiety Explore the answers to both questions with plots involving confidence intervals or error bars for the means.

Source: Von Eye, A., & Schuster C. (1998). Regression Analysis for Social Sciences. San Diego: Academic Press.

Column 1: Anxiety score 5 weeks before exam for female 
Column 2: Anxiety score 4 weeks before exam for female 
Column 3: Anxiety score 3 weeks before exam for female 
Column 4: Anxiety score 2 weeks before exam for female 
Column 5: Anxiety score 1 weeks before exam for female 
Column 6: Anxiety score 5 weeks before exam for male 
Column 7: Anxiety score 4 weeks before exam for male 
Column 8: Anxiety score 3 weeks before exam for male 
Column 9: Anxiety score 2 weeks before exam for male 
Column 10: Anxiety score 1 weeks before exam for male

library(magrittr)

dta <- read.table("stateAnxiety.txt", header = T)
head(dta)

dta <- dta %>% 
  mutate(id = row.names(.)) %>% 
  gather(key = t, value = score, 1:10) %<>% 
  separate(t, c("gender", "week"), sep = 1) %>%
  mutate(id = paste(gender,id,sep = ""), 
         week = as.numeric(week),
         Week = as.factor(week))
head(dta)

library(hrbrthemes)

pd <- position_dodge(.3) # 使他們不要互相蓋住
p <- dta %>% group_by(week, gender) %>% 
  summarize(m_p=mean(score),
            se_p=sd(score)/sqrt(n())) %>%
  ggplot() +
  aes(week, m_p,
      group=gender,
      color=gender) +
  geom_errorbar(aes(ymin=m_p - se_p,
                    ymax=m_p + se_p),
                width=.2, size=.3,
                position=pd) + # position 重疊可閃開
  geom_line(position=pd,
            linetype='dotted') +
  geom_point(position=pd,
             size=rel(3)) +
  scale_shape(guide=guide_legend(title=NULL)) +
  labs(x="Weeks", y="Mean Score") +
  theme_ipsum() +
  theme(legend.position=c(.1, .8))
p

# another method 
# it seems different from last chunk result?
library(doBy)
source("summarySE.R")
source("normDataWithin.R")
source("summarySEwithin.R")

dta.cl <- summarySEwithin(dta, measurevar="score", betweenvars="gender", withinvars="week", idvar="id")

# score by week with error bars
qplot(week, score, data=dta.cl, geom="line", color=gender, group=gender) +
    geom_point() +
    geom_errorbar(aes(ymin = score - se, ymax = score + se), width=.1) +
     xlab("Week") +
     ylab("Score")

ggplot(data = dta, aes(Week, score)) + 
  geom_boxplot(aes(color = gender)) + 
  labs(x = "Week",
       y = "Score") +
  theme_bw()

ggplot(data=dta, aes(x=week, y=score, group=id)) +
  geom_line(lty=3) +
  stat_summary(aes(group=1), fun.y=mean, geom="line") +
  stat_summary(aes(group=1), fun.y=mean, geom="point") +
  facet_grid(. ~ gender) +
  scale_x_continuous(name="Week",) +
  scale_y_continuous(name="Score" ) +
  theme("aspect.ratio"=1)

## Warning: `fun.y` is deprecated. Use `fun` instead.

## Warning: `fun.y` is deprecated. Use `fun` instead.

Q2.

Use the markdown file to replicate the contents of Weissgerber, T.L., Milic, N.M., Winham, S.J., Garovic, V.D. (2015). Beyond Bar and Line Graphs: Time for a New Data Presentation Paradigm. PLOS Biology , 13.

https://rpubs.com/PL_W/c7e2

Q3.

The dataset consists of a sample of 14 primary school children between 8 and 12 years old. The children were asked to respond on 8 emotions and coping strategies scales for each of 6 situations: fail to fulfill assingments in class, not allowed to play with other children, forbidden to do something by the teacher, victim of bullying, too much school work, forbidden to do something by the mother. Plot the data in some meaningful ways. You may have to manipulate data into a different format first.

Column 1: Unpleasant (Annoy) 
Column 2: Sad 
Column 3: Afraid 
Column 4: Angry 
Column 5: Approach coping 
Column 6: Avoidant coping 
Column 7: Social support seeking 
Column 8: Emotional reaction, especially agression 
Column 9: Situation ID 
Column 10: Children ID

Source: Roeder, I., Boekaerts, M., & Kroonenberg, P. M. (2002). The stress and coping questionnaire for children (School version and Asthma version): Construction, factor structure, and psychometric properties. Psychological Reports, 91, 29-36.

dta <- read.table("coping.txt", header = T)
dtaL <- dta %>% gather(key="emotions", value="scale", 1:8)
head(dta)

head(dtaL)

# detach(plyr)
library(hrbrthemes)
library(ggcorrplot)

qplot(scale, data=dtaL, geom="density", facets = situation ~ emotions)

ggcorrplot(cor(dta[1:8]),
           hc.order=TRUE,
           type="lower",
           lab=T)+
  theme(legend.position="NONE")

pd <- position_dodge(.3)
dtaL %>% dplyr::group_by(sbj,emotions) %>% 
  dplyr::summarise(m = mean(scale), se = sd(scale)/sqrt(n())) %>%
  ggplot() +
  aes(emotions, m,group=sbj,color=sbj) +
  geom_errorbar(aes(ymin=m - se, ymax=m + se),
                width=.2, size=.3, position=pd) + # position 重疊可閃開
  geom_line(position=pd, linetype='dotted') +
  geom_point(position=pd, size=rel(3)) +
  scale_shape(guide=guide_legend(title=NULL)) +
  labs(x="Emotions", y="Response score") +
  theme_ipsum() +
  theme(legend.position='bottom')

dtaL %>% dplyr::group_by(sbj,situation) %>% 
  dplyr::summarise(m = mean(scale), se = sd(scale)/sqrt(n())) %>%
  ggplot() +
  aes(situation, m,group=sbj,color=sbj) +
  geom_errorbar(aes(ymin=m - se, ymax=m + se),
                width=.2, size=.3, position=pd) + # position 重疊可閃開
  geom_line(position=pd, linetype='dotted') +
  geom_point(position=pd, size=rel(3)) +
  scale_shape(guide=guide_legend(title=NULL)) +
  labs(x="Situations", y="Response score") +
  theme_ipsum() +
  theme(legend.position='bottom')

dtaL %>% dplyr::group_by(emotions,situation) %>% 
  dplyr::summarise(m = mean(scale),se = sd(scale)/sqrt(n())) %>%
  ggplot() +
  aes(emotions, m,group=situation,color=situation) +
  geom_errorbar(aes(ymin=m - se, ymax=m + se),
                width=.2, size=.3, position=pd) + # position 重疊可閃開
  geom_line(position=pd, linetype='dotted') +
  geom_point(position=pd, size=rel(3)) +
  scale_shape(guide=guide_legend(title=NULL)) +
  labs(x="Emotions", y="Response score") +
  theme_ipsum() +
  theme(legend.position='bottom')

Q4.

Revise the plot in the contrast detection example so that the plot is faceted by subject × spatial frequency condition.

library(pacman)
# tidyr stuff and psyphy package for model fitting
p_load(tidyverse, psyphy)

fls <- list.files(path = "./data", pattern = ".csv")
fL <- paste0("data/", fls)
dta <- lapply(fL, read_csv) %>% bind_rows()

## Parsed with column specification:
## cols(
##   subject = col_character(),
##   contrast = col_double(),
##   sf = col_double(),
##   target_side = col_character(),
##   response = col_character(),
##   unique_id = col_character()
## )
## Parsed with column specification:
## cols(
##   subject = col_character(),
##   contrast = col_double(),
##   sf = col_double(),
##   target_side = col_character(),
##   response = col_character(),
##   unique_id = col_character()
## )
## Parsed with column specification:
## cols(
##   subject = col_character(),
##   contrast = col_double(),
##   sf = col_double(),
##   target_side = col_character(),
##   response = col_character(),
##   unique_id = col_character()
## )
## Parsed with column specification:
## cols(
##   subject = col_character(),
##   contrast = col_double(),
##   sf = col_double(),
##   target_side = col_character(),
##   response = col_character(),
##   unique_id = col_character()
## )
## Parsed with column specification:
## cols(
##   subject = col_character(),
##   contrast = col_double(),
##   sf = col_double(),
##   target_side = col_character(),
##   response = col_character(),
##   unique_id = col_character()
## )

dta$sf <- factor(dta$sf)
dta$subject <- factor(dta$subject)
head(dta <- as.data.frame(dta))

# score the correct responses
dta$correct <- ifelse(dta$target_side == dta$response, 1, 0)

# bootstrapped CIs for proportion of correct responses
p <- ggplot(dta, aes(contrast, correct)) +
  stat_summary(fun.data = "mean_cl_boot", color = "dodgerblue") +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "Contrast in log unit", y = "Proportion of correct responses")

# Links for Binomial Family for m-alternative Forced-choice

# fit a detection model in which the chance of correct response is .5
m0 <- glm(correct ~ log10(contrast) + subject + sf, data = dta, 
          family = binomial(mafc.probit(2)))
m0

## 
## Call:  glm(formula = correct ~ log10(contrast) + subject + sf, family = binomial(mafc.probit(2)), 
##     data = dta)
## 
## Coefficients:
##        (Intercept)     log10(contrast)           subjectS2  
##            6.05604             4.50312             1.40361  
##          subjectS3           subjectS4           subjectS5  
##            0.61735             0.94523             2.02649  
## sf1.49534878122122  sf4.47213595499958  sf13.3748060995284  
##            2.03385             1.39973            -0.04848  
##               sf40  
##           -2.15747  
## 
## Degrees of Freedom: 6999 Total (i.e. Null);  6990 Residual
## Null Deviance:       7555 
## Residual Deviance: 5070  AIC: 5090

# generate predicted responses 
x1 <- rep(with(dta, seq(min(contrast), max(contrast), len = 500)),5)
x2 <- with(dta, sample(levels(subject), 2500, replace = T))
x3 <- with(dta, sample(levels(sf), 2500, replace = T))
xval <- data.frame(contrast = x1, subject = x2, sf = x3)
yval <- predict(m0, xval, type = "response")
m0_pred <- data.frame(xval, yval)

# augument to the observed plot
p <- p + 
  geom_line(data = m0_pred, aes(x = contrast, y = yval))+
  facet_grid(subject ~ sf)

print(p)

Q5.

Use the Cushings{MASS} data set to generate a plot similar to the following one:

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

dta <- MASS::Cushings
dta$Type <- factor(dta$Type, levels = c("a","b","c","u"), labels = c("Adenoma","Bilateral Hyperplasia","Carcinoma","Unknown"))
head(dta)

library(ggrepel)
library(ggthemes)

ggplot(dta, aes(Tetrahydrocortisone,Pregnanetriol,color=Type)) +
  geom_point() +
  geom_text_repel(data=dta %>% group_by(Type) %>% sample_n(., 1),
                   aes(label=Type))+
  labs(x="Tetrahydrocortisone (mg/24 hours)",
       y="Pregnanetriol (mg/24 hours)") +
  ggtitle("Cushing's syndrome") +
  theme(plot.title = element_text(hjust = 1, size = 12, face="bold")) +
  scale_color_manual(values=c('dodgerblue',
                              'indianred',
                              'forestgreen',
                              'goldenrod'))+
  theme(legend.position='NONE')

Class7 : Grammar of graphics

0424

In-class exercise

Q1.

Q2.

Q3.

Q4.

Q5.

Q6.

Exercises

Q1.

Q2.

Q3.

Q4.

Q5.