In-class exercises 1

Render the R script for replicating figures in Chapter 4 of Lattice: Multivariate Data Visualization with R (Sarkar, D. 2008) to html document with comments at each code chunk indicated by ‘##’.

##read the data
VADeaths
##       Rural Male Rural Female Urban Male Urban Female
## 50-54       11.7          8.7       15.4          8.4
## 55-59       18.1         11.7       24.3         13.6
## 60-64       26.9         20.3       37.0         19.3
## 65-69       41.0         30.9       54.6         35.1
## 70-74       66.0         54.3       71.1         50.0
##identify data type
class(VADeaths)
## [1] "matrix"
##View dotplot content
library(lattice)
methods("dotplot")
## [1] dotplot.array*   dotplot.default* dotplot.formula* dotplot.matrix* 
## [5] dotplot.numeric* dotplot.table*  
## see '?methods' for accessing help and source code
##plot with raw data
dotplot(VADeaths, groups=FALSE)

dotplot(VADeaths, groups=FALSE, 
        layout=c(1, 4), ##four columns and one row
        aspect=0.7, ##graphic width
        origin=0,
        type=c("p", "h"), ##dot and line
        main="Death Rates in Virginia - 1940", 
        xlab="Rate (per 1000)")

##legend on right side
dotplot(VADeaths, type="o",
        auto.key=list(lines=TRUE, space="right"),
        main="Death Rates in Virginia - 1940",
        xlab="Rate (per 1000)")

##bar chart
barchart(VADeaths, groups=FALSE,
         layout=c(1, 4), ##four columns and one row
         aspect=0.7, ##graphic width
         reference=FALSE, 
         main="Death Rates in Virginia - 1940",
         xlab="Rate (per 100)")

##read the postdoc
data(postdoc, package="latticeExtra")
##calculate the percentage of each group
barchart(prop.table(postdoc, margin=1), 
         xlab="Proportion",
         auto.key=list(adj=1)) ##list group name

##calculate the percentage of each group (dot chart)
dotplot(prop.table(postdoc, margin=1),groups=FALSE, 
        xlab="Proportion",
        par.strip.text=list(abbreviate=TRUE, minlength=10))##abbreviated text

## stack for long dotplot
dotplot(prop.table(postdoc, margin=1), groups=FALSE, 
        index.cond=function(x, y) median(x),
        xlab="Proportion", 
        layout=c(1, 5),##five columns and one row
        aspect=0.6,##graphic width
        scales=list(y=list(relation="free", rot=0)),
        prepanel=function(x, y) {
            list(ylim=levels(reorder(y, x)))
        },##row the dot chart
        panel=function(x, y, ...) {
            panel.dotplot(x, reorder(y, x), ...)
        })

##read data Chem97{mlmRev}
data(Chem97, package="mlmRev")

##使用xtabs函數將多個項度製成表格
gcsescore.tab <- xtabs(~ gcsescore + gender, Chem97)

##as.data.frame儲存成矩陣資料格式
gcsescore.df <- as.data.frame(gcsescore.tab)

##as.numeric將gcsescore儲存成數字形式,as.character轉換成文字的形式
gcsescore.df$gcsescore <- as.numeric(as.character(gcsescore.df$gcsescore))

##使用xyplot語法畫lattice散佈圖,指定x 與y 軸的資料,y值=Freq,X值gcsescore,Z值=gender
xyplot(Freq ~ gcsescore | gender, 
       data = gcsescore.df, 
       type="h", 
       layout=c(1, 2), 
       xlab="Average GCSE Score")

#使用xtabs函數將多個項度製成表格
score.tab <- xtabs(~score + gender, Chem97)

##as.data.frame儲存成矩陣資料格式
score.df <- as.data.frame(score.tab)

##製作barchart()長條圖
barchart(Freq ~ score | gender, score.df, origin=0)

In-class exercises 2

Create a new student-teacher ratio variable from the enrltot and teachers variables in the data set Caschool{Ecdat} to generate the following plot in which reading scores (readscr) for grade span assignment grspan equals “KK-08” in the data set are split into three levels: lower-third, middle-third, and upper-third:

#讀取資料
head(Ecdat::Caschool)
##   distcod  county                        district grspan enrltot teachers
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195    10.90
## 2   61499   Butte            Manzanita Elementary  KK-08     240    11.15
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550    82.90
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243    14.00
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335    71.50
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137     6.40
##   calwpct mealpct computer testscr   compstu  expnstu      str    avginc
## 1  0.5102  2.0408       67  690.80 0.3435898 6384.911 17.88991 22.690001
## 2 15.4167 47.9167      101  661.20 0.4208333 5099.381 21.52466  9.824000
## 3 55.0323 76.3226      169  643.60 0.1090323 5501.955 18.69723  8.978000
## 4 36.4754 77.0492       85  647.70 0.3497942 7101.831 17.35714  8.978000
## 5 33.1086 78.4270      171  640.85 0.1280899 5235.988 18.67133  9.080333
## 6 12.3188 86.9565       25  605.55 0.1824818 5580.147 21.40625 10.415000
##       elpct readscr mathscr
## 1  0.000000   691.6   690.0
## 2  4.583333   660.5   661.9
## 3 30.000002   636.3   650.9
## 4  0.000000   651.9   643.5
## 5 13.857677   641.8   639.9
## 6 12.408759   605.7   605.4
dta2 <- Ecdat::Caschool
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lattice)
#製作三等份的cutpoint
cutpoint1<- min(dta2$readscr)+((max(dta2$readscr)-min(dta2$readscr))/3)
cutpoint2<- max(dta2$readscr)-((max(dta2$readscr)-min(dta2$readscr))/3)
#產生新的STration 及readscr的分級結果變項
dta2 %>% 
  mutate(STratio= enrltot / teachers,
         rlevels= case_when(readscr < cutpoint1 ~"lower-third",
                           readscr >=cutpoint1 & readscr < cutpoint2 ~"middle-third",
                           readscr >= cutpoint2 ~"upper-third")) %>%
  group_by(grspan) %>% #分組
  xyplot(readscr~STratio | rlevels, type=c("g", "r", "p"), data=.,
         xlab = "Student-Teacher Ratio", ylab = "Reading score") 

In-class exercises 3

The data set concerns student evaluation of instructor’s beauty and teaching quality for several courses at the University of Texas. The teaching evaluatons were done at the end of the semester, and the beauty judgments were made later, by six students who had not attended the classes and were not aware of the course evaluations.

dta3 <- read.table("beautyCourseEval.txt", header = T)
head(dta3)
##   eval     beauty sex age minority tenure courseID
## 1  4.3  0.2015666   1  36        1      0        3
## 2  4.5 -0.8260813   0  59        0      1        0
## 3  3.7 -0.6603327   0  51        0      1        4
## 4  4.3 -0.7663125   1  40        0      1        2
## 5  4.4  1.4214450   1  31        0      0        0
## 6  4.2  0.5002196   0  62        0      1        0
dta3$courseID<-as.character(dta3$courseID)
##??
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
m<-lmList(eval~beauty| courseID, data = dta3)
m1<-data.frame(courseID=rownames(coef(m)),coef(m),check.names=FALSE)
m2<-m1[with(m1,order(beauty, decreasing=F)),]
m.order<-list(m2$courseID)

xyplot(eval~beauty|courseID, data=dta3, type=c("p", "g", "r"), layout=c(6,6), index.cond=m.order, xlab="Beauty judgment score", ylab="Average course evaluation score")

In-class exercises 4

A sample of 40 psychology students at a large southwestern university took four subtests (Vocabulary, Similarities, Block Design, and Picture Completion) of the Wechsler (1981) Adult Intelligence Scale-Revised. The researchers also used Magnetic Resonance Imaging (MRI) to determine the brain size of the subjects.

dta4 <- read.table("brainsize.txt", header = T)
head(dta4)
##   Sbj Gender FSIQ VIQ PIQ Weight Height MRICount
## 1   1 Female  133 132 124    118   64.5   816932
## 2   2   Male  140 150 124     NA   72.5  1001121
## 3   3   Male  139 123 150    143   73.3  1038437
## 4   4   Male  133 129 128    172   68.8   965353
## 5   5 Female  137 132 134    147   65.0   951545
## 6   6 Female   99  90 110    146   69.0   928799
str(dta4)
## 'data.frame':    40 obs. of  8 variables:
##  $ Sbj     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender  : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 1 1 2 2 ...
##  $ FSIQ    : int  133 140 139 133 137 99 138 92 89 133 ...
##  $ VIQ     : int  132 150 123 129 132 90 136 90 93 114 ...
##  $ PIQ     : int  124 124 150 128 134 110 131 98 84 147 ...
##  $ Weight  : int  118 NA 143 172 147 146 138 175 134 172 ...
##  $ Height  : num  64.5 72.5 73.3 68.8 65 69 64.5 66 66.3 68.8 ...
##  $ MRICount: int  816932 1001121 1038437 965353 951545 928799 991305 854258 904858 955466 ...
  1. Are there gender differences in the three IQ scores?
library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
dta41<-melt(dta4, id=c("Sbj", "Gender", "Weight", "Height", "MRICount"))
dta41 %>% bwplot(value~Gender | variable,
                data=.,
                par.settings=standard.theme(color=FALSE), 
                    xlab="Gender", ylab="IQ score")

lapply(dta4[3:5], function(x) 

  t.test(x ~ dta4$Gender, na.action = na.pass))
## $FSIQ
## 
##  Welch Two Sample t-test
## 
## data:  x by dta4$Gender
## t = -0.40267, df = 37.892, p-value = 0.6895
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.68639  12.48639
## sample estimates:
## mean in group Female   mean in group Male 
##                111.9                115.0 
## 
## 
## $VIQ
## 
##  Welch Two Sample t-test
## 
## data:  x by dta4$Gender
## t = -0.77262, df = 36.973, p-value = 0.4447
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -21.010922   9.410922
## sample estimates:
## mean in group Female   mean in group Male 
##               109.45               115.25 
## 
## 
## $PIQ
## 
##  Welch Two Sample t-test
## 
## data:  x by dta4$Gender
## t = -0.1598, df = 37.815, p-value = 0.8739
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -15.72079  13.42079
## sample estimates:
## mean in group Female   mean in group Male 
##               110.45               111.60

-There are NO gender differences in the three IQ scores.

  1. Is the relationship between height and weight gender dependent?
xyplot(Weight ~ Height, groups=Gender,
       data=dta4, type=c('p', 'g', 'r'),
       auto.key=list(column=2), 
       xlab="Height", ylab="Weight")

lapply(dta4[6:7], function(x) 
  summary(lm(x~Gender, data=dta4, na.action=na.omit)))
## $Weight
## 
## Call:
## lm(formula = x ~ Gender, data = dta4, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.444 -15.383   3.678  13.306  37.800 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  137.200      4.132  33.203  < 2e-16 ***
## GenderMale    29.244      6.004   4.871 2.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.48 on 36 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3972, Adjusted R-squared:  0.3805 
## F-statistic: 23.73 on 1 and 36 DF,  p-value: 2.227e-05
## 
## 
## $Height
## 
## Call:
## lm(formula = x ~ Gender, data = dta4, na.action = na.omit)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.132 -2.432  0.235  2.152  5.568 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  65.7650     0.6298  104.42  < 2e-16 ***
## GenderMale    5.6666     0.9023    6.28 2.62e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.816 on 37 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.516,  Adjusted R-squared:  0.5029 
## F-statistic: 39.44 on 1 and 37 DF,  p-value: 2.624e-07
  1. Is the relationship between IQ and brainsize (as measured by MRIcount) gender dependent?
xyplot(MRICount~value|variable, 
       groups=Gender, 
       data=dta41,
       auto.key=list(column=2),
       xlab="IQ", ylab="MRICount", type=c('p', 'g', 'r'), layout=c(3,1))

lapply(dta4[3:5], function(x) 
  summary(lm(MRICount+x~Gender, data=dta4)))
## $FSIQ
## 
## Call:
## lm(formula = MRICount + x ~ Gender, data = dta4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74893 -34603  -7286  19996 128676 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   862766      12502  69.008  < 2e-16 ***
## GenderMale     92204      17681   5.215 6.77e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55910 on 38 degrees of freedom
## Multiple R-squared:  0.4171, Adjusted R-squared:  0.4018 
## F-statistic: 27.19 on 1 and 38 DF,  p-value: 6.774e-06
## 
## 
## $VIQ
## 
## Call:
## lm(formula = MRICount + x ~ Gender, data = dta4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74888 -34605  -7294  20001 128677 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   862764      12502  69.010  < 2e-16 ***
## GenderMale     92207      17680   5.215 6.77e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55910 on 38 degrees of freedom
## Multiple R-squared:  0.4172, Adjusted R-squared:  0.4018 
## F-statistic:  27.2 on 1 and 38 DF,  p-value: 6.767e-06
## 
## 
## $PIQ
## 
## Call:
## lm(formula = MRICount + x ~ Gender, data = dta4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74894 -34596  -7278  19994 128671 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   862765      12503  69.007  < 2e-16 ***
## GenderMale     92202      17681   5.215 6.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55910 on 38 degrees of freedom
## Multiple R-squared:  0.4171, Adjusted R-squared:  0.4018 
## F-statistic: 27.19 on 1 and 38 DF,  p-value: 6.778e-06

yes, R-squared =0.4171, p<0.001