In-class exercise

library(tidyverse)
library(lattice)

Q1.

Render the R script for replicating figures in Chapter 4 of Lattice: Multivariate Data Visualization with R (Sarkar, D. 2008) to html document with comments at each code chunk indicated by ‘##’.

## VADeaths dataset
VADeaths
##       Rural Male Rural Female Urban Male Urban Female
## 50-54       11.7          8.7       15.4          8.4
## 55-59       18.1         11.7       24.3         13.6
## 60-64       26.9         20.3       37.0         19.3
## 65-69       41.0         30.9       54.6         35.1
## 70-74       66.0         54.3       71.1         50.0
## check data types
class(VADeaths)
## [1] "matrix"
## check the methods in dotplot 
methods("dotplot")
## [1] dotplot.array*   dotplot.default* dotplot.formula* dotplot.matrix* 
## [5] dotplot.numeric* dotplot.table*  
## see '?methods' for accessing help and source code
## plot dotplot, no grouping
dotplot(VADeaths, groups=FALSE)

dotplot(VADeaths, groups=FALSE, 
        layout=c(1, 4), # add layout(column1,row4)
        aspect=0.7, # 距離逼邊多遠
        origin=0, # line從0開始
        type=c("p", "h"), # "h":histogram-like vertical lines
        main="Death Rates in Virginia - 1940", 
        xlab="Rate (per 1000)")

## type="o"點線圖, auto.key圖例, space放哪一側
dotplot(VADeaths, type="o",
        auto.key=list(lines=TRUE, space="right"),
        main="Death Rates in Virginia - 1940",
        xlab="Rate (per 1000)")

## barchart
barchart(VADeaths, groups=FALSE,
         layout=c(1, 4), 
         aspect=0.1, 
         reference=FALSE, #這是什麼?
         main="Death Rates in Virginia - 1940",
         xlab="Rate (per 100)")

## load data from packages
data(postdoc, package="latticeExtra")

## prop.table, margin=1各組各自比例,=2考慮自己占總體的比例
barchart(prop.table(postdoc, margin=1), 
         xlab="Proportion",
         auto.key=list(adj=1)) # adj圖塊位置

## strip圖表標題文字,abbreviate縮寫,minlength最長字數限制
dotplot(prop.table(postdoc, margin=1), 
        groups=FALSE, 
        xlab="Proportion",
        par.strip.text=list(abbreviate=TRUE, minlength=10)) 

## prop table dotplot
dotplot(prop.table(postdoc, margin=1), 
        groups=FALSE, 
        index.cond=function(x, y) median(x),
        xlab="Proportion", 
        layout=c(1, 5), 
        aspect=0.15,
        scales=list(y=list(relation="free", rot=0, cex=0.3)), # ylab setting
        prepanel=function(x, y) {
            list(ylim=levels(reorder(y, x))) # set y axis limit
        },
        panel=function(x, y, ...) {
            panel.dotplot(x, reorder(y, x), ...) # reorder the dots
        })

## load data from packages
data(Chem97, package="mlmRev")
head(Chem97)
## table
gcsescore.tab <- xtabs(~ gcsescore + gender, Chem97)
head(gcsescore.tab)
##          gender
## gcsescore M F
##     0     1 0
##     1     2 0
##     1.5   0 1
##     2     1 0
##     2.083 1 0
##     2.428 1 1
## 變成dataframe
gcsescore.df <- as.data.frame(gcsescore.tab)
head(gcsescore.df)
## 把gcsescore轉成numeric
gcsescore.df$gcsescore <- as.numeric(as.character(gcsescore.df$gcsescore))
## xyplot
xyplot(Freq ~ gcsescore | gender, 
       data = gcsescore.df, 
       type="h", 
       layout=c(1, 2), 
       xlab="Average GCSE Score")

## table
score.tab <- xtabs(~score + gender, Chem97)
## 變成dataframe
score.df <- as.data.frame(score.tab)
## barchart, origin=0 bar從0開始畫
barchart(Freq ~ score | gender, score.df, origin=0)

Q2.

Create a new student-teacher ratio variable from the enrltot and teachers variables in the data set Caschool{Ecdat} to generate the following plot in which reading scores (readscr) for grade span assignment grspan equals “KK-08” in the data set are split into three levels: lower-third, middle-third, and upper-third:

data(Caschool, package="Ecdat")
head(Caschool)
Caschool %>% filter(grspan=="KK-08") %>% 
  mutate(STRatio = enrltot/teachers,
         readlLevel = cut(readscr, ordered=T, 
                          breaks=quantile(readscr, probs=c(0, .33, .67, 1)), 
                          labels=c("L", "M", "H"))) %>%
  xyplot(readscr ~ STRatio | readlLevel, data = ., type = c("p","r"), 
         xlab = "Student-Teacher ratio", ylab = "Reading score")

Q3.

The data set concerns student evaluation of instructor’s beauty and teaching quality for several courses at the University of Texas. The teaching evaluatons were done at the end of the semester, and the beauty judgments were made later, by six students who had not attended the classes and were not aware of the course evaluations.

Source: Hamermesh, D.S., & Parker, A.M. (2005). Beauty in the classroom: instructor’s pulchritude and putative pedagogical productivity.a Economics and Education Review, 24, 369-376. Reported in Gelman, A., & Hill, J. (2006). Data analysis using regression and hierarchical/multilevel models. p. 51.

Column 1: Course evaluation score 
Column 2: Beauty score 
Column 3: Gender of professor, 1 = Female, 0 = Male 
Column 4: Pofessor age in years 
Column 5: Minority status of professor, 1 = Minority, 0 = Others 
Column 6: Tenure status of professor, 1 = Tenured, 0 = No 
Column 7: Course ID 

Use the lattice package to produce the plot above. Hint: Use ‘reorder’ after obtaining regression coefficients to rearrange conditioning panels.

dta <- read.table("beautyCourseEval.txt", header = T)
head(dta)
# 參考 Jay Liao : https://rpubs.com/d920056/601759
xyplot(eval ~ beauty | factor(courseID), data = dta, type = c("p","r","g"), 
       index.cond = function(x,y) coef(lm(y~x))[2], layout = c(6,6),
       xlab = "Beauty judgement score", ylab = "Average course evaluation score")

# using lapply
d <- lapply(c(0:30), function(x) 
  coef(lm(eval ~ beauty, data = dta[dta$courseID == x,]))[2]) %>% 
  do.call(rbind, .) %>% 
  order(., decreasing=T)
d = d-1
xyplot(eval ~ beauty | factor(courseID, levels = c(d)), data = dta, 
       type = c("p","r","g"), layout = c(6,6),
       xlab = "Beauty judgement score", ylab = "Average course evaluation score")

# why it can't work? (using index.cond)
xyplot(eval ~ beauty | factor(courseID), data = dta, type = c("p","r","g"), 
        index.cond = list(c(d)), layout = c(6,6),
        xlab = "Beauty judgement score", ylab = "Average course evaluation score")

Q4.

A sample of 40 psychology students at a large southwestern university took four subtests (Vocabulary, Similarities, Block Design, and Picture Completion) of the Wechsler (1981) Adult Intelligence Scale-Revised. The researchers also used Magnetic Resonance Imaging (MRI) to determine the brain size of the subjects. Source: Willerman, L., Schultz, R., Rutledge, J.N., & Bigler, E. (1991), In Vivo Brain Size and Intelligence, Intelligence, 15, 223-228.

Column 1: Subject ID 
Column 2: Gender ID 
Column 3: Full scale IQ 
Column 4: Verbal IQ 
Column 5: Performance IQ
Column 6: Body weight in pounds 
Column 7: Height in inches 
Column 8: Totol pixel counts from 18 MRI scans 

Use appropriate lattice graphics to answer the following questions.

  1. Are there gender differences in the three IQ scores?
  2. Is the relationship between height and weight gender dependent?
  3. Is the relationship between IQ and brainsize (as measured by MRIcount) gender dependent?
dta <- read.table("brainsize.txt", header = T)
head(dta)
dtaL <- reshape(dta, varying=list(3:5), v.names="Score", timevar="IQtype",times=c("FSIQ", "VIQ", "PIQ"), direction="long")
head(dtaL)
histogram(~ Score | IQtype + Gender, data = dtaL)

xyplot(Height ~ Weight | Gender, type = c("p","r","g"), data = dta)

xyplot(MRICount ~ Score | IQtype + Gender, type = c("p","r","g"), data = dtaL)

Exercises

Q1.

Use trellis graphics to explore various ways to display the sample data from the National Longitudinal Survey of Youth.

dta <- read_csv("nlsy86long.csv")
## Parsed with column specification:
## cols(
##   id = col_double(),
##   sex = col_character(),
##   race = col_character(),
##   time = col_double(),
##   grade = col_double(),
##   year = col_double(),
##   month = col_double(),
##   math = col_double(),
##   read = col_double()
## )
head(dta)
dotplot(math ~ grade | sex, groups = race, data=dta, ylim = c(0,60),
          layout=c(1,2), cex=0.5, jitter.x=T, auto.key = list(columns = 2))

xyplot(read ~ math | factor(grade) + race, groups = sex, type = c("p","r","g"), 
       data = dta, pch = 19, cex = 0.5, auto.key = list(columns = 2))

splom( ~ cbind(math, read) | race, data=dta,
         pscale=0,type=c("r"))

histogram(~ read | race, data=dta,
          xlab="Read Score", type="density",
          layout=c(1, 2), border="white",
          panel=function(x, ...) {
                panel.histogram(x, ...)
                panel.mathdensity(dmath=dnorm, col="gray",
                      args=list(mean=mean(x, na.rm=T), sd=sd(x, na.rm=T)))})

Q2.

Eight different physical measurements of 30 French girls were recorded from 4 to 15 years old. Explore various ways to display the data using trellis graphics. Source: Sempe, M., et al. (1987). Multivariate and longitudinal data on growing children: Presentation of the French auxiological survey. In J. Janssen, et al. (1987). Data analysis. The Ins and Outs of solving real problems (pp. 3-6). New York: Plenum Press.

Column 1: Weight in grams 
Column 2: Height in mms 
Column 3: Head to butt length in mms 
Column 4: Head circumference in mms
Column 5: Chest circumference in mms
Column 6: Arm length in mms
Column 7: Calf length in mms
Column 8: Pelvis circumference in mms
Column 9: Age in years 
Column 10: Girl ID 
dta <- read.table("girlsGrowth.txt",header = T)
head(dta)
splom( ~ cbind(Wt,Ht,Hb,Hc,Cc,Arm,Calf,Pelvis), data=dta, pscale=0, type=c("r"))

xyplot(Ht ~ Wt | factor(age), type = c("p","r","g"), 
       data = dta, pch = 19, cex = 0.2)

Q3.

Your manager gave you a sales data on sevral products in a SAS format. Your task is to summarize and report the data in tables and graphs using the R lattice package. Source: Gupta, S. K. (2006). Data Management and Reporting Made Easy with SAS Learning Edition 2.0

Recode the region variable (1 to 4) by “Nothern”, “Southern”, “Eastern” and “Western”; the district variable (1 - 5) by “North East”, “South East”, “South West”, “North West”, “Central West”; the quarter variable (1-4) by “1st”, “2nd”, “3rd”, “4th”; and the month variable (1-12) by “Jan”, “Feb”, etc. Set negative sales values to zero.

pacman::p_load('sas7bdat')
dta <- read.sas7bdat("sales.sas7bdat")
head(dta)
dta$region <- factor(dta$region, levels = c(1:4),labels = c("Nothern", "Southern", "Eastern", "Western"))
dta$district <- factor(dta$district, levels = c(1:5),labels = c("North East", "South East", "South West", "North West", "Central West"))
dta$quarter <- factor(dta$quarter, levels = c(1:4),labels = c("1st", "2nd", "3rd", "4th"))
dta$month <- factor(dta$month, levels = c(1:12),labels = c("Jan","Feb","Mar","Apr","May","Jun","July","Aug","Sep","Oct","Nov","Dec"))
dta[dta$sales < 0,]$sales = 0
dta$year <- factor(dta$year)
dta$market <- factor(dta$market)
head(dta)
densityplot( ~ sales | category + quarter, data=dta)

bwplot(sales ~ product | quarter, data=dta)

bwplot(quantity ~ product | year, data=dta)

xyplot(sales ~ expense | market, groups = quarter, type = c("p","smooth","g"), 
       data = dta, pch = 19, cex = 0.3, auto.key=list(column=4))

xyplot(sales ~ quantity | region, groups = quarter, type = c("p","smooth","g"), 
       data = dta, pch = 19, cex = 0.3, auto.key=list(column=4))
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## FALSE, : pseudoinverse used at 127.45
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## FALSE, : neighborhood radius 222.55
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## FALSE, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## FALSE, : There are other near singularities as well. 22500

Q4.

Use the Lattice package to graphically explore the age and gender effects on reaction time reported in the Bassin data example.

dta <- read.table("timetrial.txt",header = T)
head(dta)
summary(dta)
##  Sex         Age            Trial1             Trial2        
##  F:43   Min.   : 2.00   Min.   :-0.41300   Min.   :-0.78200  
##  M:70   1st Qu.: 9.00   1st Qu.:-0.04700   1st Qu.:-0.04600  
##         Median :13.00   Median : 0.06500   Median : 0.03900  
##         Mean   :20.58   Mean   : 0.06535   Mean   : 0.04869  
##         3rd Qu.:35.00   3rd Qu.: 0.15100   3rd Qu.: 0.12400  
##         Max.   :52.00   Max.   : 0.70900   Max.   : 2.00400  
##      Trial3             Trial4             Trial5        
##  Min.   :-0.49900   Min.   :-0.52900   Min.   :-0.36000  
##  1st Qu.:-0.02500   1st Qu.:-0.02700   1st Qu.:-0.02500  
##  Median : 0.05200   Median : 0.03400   Median : 0.04400  
##  Mean   : 0.05727   Mean   : 0.04911   Mean   : 0.04714  
##  3rd Qu.: 0.11600   3rd Qu.: 0.12400   3rd Qu.: 0.11100  
##  Max.   : 0.89600   Max.   : 1.03700   Max.   : 1.14900
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
parcoord(dta[, 3:7], col=as.numeric(dta$Sex))

x <-dta %>% group_by(Sex,Age) %>% 
  summarize(t1 = mean(Trial1),
            t2 = mean(Trial2),
            t3 = mean(Trial3),
            t4 = mean(Trial4),
            t5 = mean(Trial5),)
parcoord(x[,3:7], col=as.numeric(x$Sex))

parcoord(x[,3:7], col=as.numeric(x$Age))

splom(~ dta[,3:7] | Sex, type = c("p","r"),
      data = dta, pch = '.', pscale = 0)

dtaL <- reshape(dta, varying=list(3:7), v.names="time", timevar="Trial", times=c("Trial1", "Trial2", "Trial3", "Trial4", "Trial5"), direction="long")
head(dtaL)
deleteOut <- dtaL %>% filter(time < 0.5 & time > -0.5)
xyplot(time ~ Age | Sex, groups = Trial, type = c("p","smooth","g"), 
       data = deleteOut, pch = 19, cex = 0.3, auto.key=list(column=3))