Q1

The R script illustrates how to implement ‘small multiples’ in base graphics given the 4 different diets of the ChickWeight{datasets} example.

Adapt the script to produce a plot of 5 panels in which each panel shows a histogram of IQ for each of 5 classes with over 30 pupils in the nlschools{MASS} dataset.

ChickWeight{datasets} example

#input data
dta <- ChickWeight
head(dta)

##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1

str(dta)

## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time"
##   ..$ y: chr "Body weight"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(days)"
##   ..$ y: chr "(gm)"

typeof(dta)

## [1] "list"

#split()依據4個level分割資料
dta_diet <- split(dta, dta$Diet) #依據Diet分割資料
#head(dta_diet) #看一下分割後的資料

#同時畫好幾張圖時使用par()分割畫面
par(mfrow=c(2, 2), mar=c(4, 4, 0, 0)) 
## mfrow=c(2, 2)建立2X2的空間
## mar=c(4, 4, 0, 0)=c(bottom, left, top, right)設定圖與四個邊界的距離

# scatter plot
lapply(dta_diet, function(x) {
                  plot(x$weight ~ x$Time, 
                       xlab="Time (day)", 
                       ylab="Weight (gm)")
                  legend('topleft', #在左上角做圖例
                         paste("Diet", x$Diet[1], sep="="),#標註diet的level
                         bty='n')}) #圖例去掉邊框

## $`1`
## $`1`$rect
## $`1`$rect$w
## [1] 5.821281
## 
## $`1`$rect$h
## [1] 52.72942
## 
## $`1`$rect$left
## [1] -0.84
## 
## $`1`$rect$top
## [1] 315.8
## 
## 
## $`1`$text
## $`1`$text$x
## [1] 1.151298
## 
## $`1`$text$y
## [1] 289.4353
## 
## 
## 
## $`2`
## $`2`$rect
## $`2`$rect$w
## [1] 5.821281
## 
## $`2`$rect$h
## [1] 57.02589
## 
## $`2`$rect$left
## [1] -0.84
## 
## $`2`$rect$top
## [1] 342.68
## 
## 
## $`2`$text
## $`2`$text$x
## [1] 1.151298
## 
## $`2`$text$y
## [1] 314.1671
## 
## 
## 
## $`3`
## $`3`$rect
## $`3`$rect$w
## [1] 5.821281
## 
## $`3`$rect$h
## [1] 65.22824
## 
## $`3`$rect$left
## [1] -0.84
## 
## $`3`$rect$top
## [1] 386.36
## 
## 
## $`3`$text
## $`3`$text$x
## [1] 1.151298
## 
## $`3`$text$y
## [1] 353.7459
## 
## 
## 
## $`4`
## $`4`$rect
## $`4`$rect$w
## [1] 5.821281
## 
## $`4`$rect$h
## [1] 55.26824
## 
## $`4`$rect$left
## [1] -0.84
## 
## $`4`$rect$top
## [1] 333.32
## 
## 
## $`4`$text
## $`4`$text$x
## [1] 1.151298
## 
## $`4`$text$y
## [1] 305.6859

nlschools{MASS} dataset

# input data
dta<-MASS::nlschools
#?nlschools

head(dta)

##   lang   IQ class GS SES COMB
## 1   46 15.0   180 29  23    0
## 2   45 14.5   180 29  10    0
## 3   33  9.5   180 29  15    0
## 4   46 11.0   180 29  23    0
## 5   20  8.0   180 29  10    0
## 6   30  9.5   180 29  10    0

str(dta)

## 'data.frame':    2287 obs. of  6 variables:
##  $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
##  $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

# rename
pacman::p_load(dplyr, magrittr)
dta %<>%
 dplyr::rename(classID=class, ClassSize=GS)

# split by classID
dta_class <- split(dta, dta$classID)

# find the classes with over 30 pupils
pupils30 <- lapply(dta_class, function(x) nrow(x[])>30)
which(pupils30 == TRUE)

##  5480 15580 15980 16180 18380 
##    26    78    80    82    91

#一直以為"要篩選出有30個以上的學生的班級"，要使用GS>=30，結果不是，感到相當困惑
#pacman::p_load(magrittr)
#dta %>% dplyr::filter(GS >= '30')
#dtaGS <- dta[dta$GS >= 30, ]

ClassID=5480的學生人數為26 ?!

# set plot
par(mfrow=c(1, 5), mar=c(4, 4, 1, 1))

# histogram
lapply(dta_class[c("15580","15980","16180","18380")], 
       function(x) {
         hist(x$IQ, 
              xlab = "IQ",
              ylab = "Number of student",
              main = paste("class", x$class[1], sep=" "))
         }
       )|> 
  invisible()

Q2

Deaths per 100,000 from male suicides for 5 age groups and 15 countries are given in the table below. The data set is available as suicides2{HSAUR3}. Construct a side-by-side box plot of the data across different age groups and comment briefly.

Source: Everitt, B.S., & Hothorn, T. (2009). A Handbook of Statistical Analyses Using R. 2nd ed. p. 15.

#input data
dta2<-HSAUR3::suicides2
str(dta2)

## 'data.frame':    15 obs. of  5 variables:
##  $ A25.34: num  22 9 22 29 16 28 48 7 8 26 ...
##  $ A35.44: num  27 19 19 40 25 35 65 8 11 29 ...
##  $ A45.54: num  31 10 21 52 36 41 84 11 18 36 ...
##  $ A55.64: num  34 14 31 53 47 49 81 18 20 32 ...
##  $ A65.74: num  24 27 49 69 56 52 107 27 28 28 ...

head(dta2)

##         A25.34 A35.44 A45.54 A55.64 A65.74
## Canada      22     27     31     34     24
## Israel       9     19     10     14     27
## Japan       22     19     21     31     49
## Austria     29     40     52     53     69
## France      16     25     36     47     56
## Germany     28     35     41     49     52

#side-by-side box plot作法參考上課講義p.17
#boxplot(x),x=數值向量或者列表

#rename
names(dta2)<- c("25-34", "35-44", "45-54", "55-64","65-74")
boxplot(dta2)

data.frame_dta2<-as.data.frame(dta2)
str(data.frame_dta2) #結果和dta2一樣

## 'data.frame':    15 obs. of  5 variables:
##  $ 25-34: num  22 9 22 29 16 28 48 7 8 26 ...
##  $ 35-44: num  27 19 19 40 25 35 65 8 11 29 ...
##  $ 45-54: num  31 10 21 52 36 41 84 11 18 36 ...
##  $ 55-64: num  34 14 31 53 47 49 81 18 20 32 ...
##  $ 65-74: num  24 27 49 69 56 52 107 27 28 28 ...

head(data.frame_dta2)

##         25-34 35-44 45-54 55-64 65-74
## Canada     22    27    31    34    24
## Israel      9    19    10    14    27
## Japan      22    19    21    31    49
## Austria    29    40    52    53    69
## France     16    25    36    47    56
## Germany    28    35    41    49    52

#rename
#使用 names() 函數將dataframe的「變數名稱」以向量的型態輸出
names(data.frame_dta2)<- c("Age25-34", "Age35-44", "Age45-54", "Age55-64","Age65-74")

##Side-By-Side vertical Boxplots
boxplot(data.frame_dta2)

#試試as.data.frame.table
data.frame.table_dta2<-as.data.frame.table(dta2) 
str(data.frame.table_dta2)

## 'data.frame':    75 obs. of  7 variables:
##  $ Var1      : Factor w/ 15 levels "Canada","Israel",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Var2      : Factor w/ 5 levels "25-34","35-44",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq.25.34: num  22 9 22 29 16 28 48 7 8 26 ...
##  $ Freq.35.44: num  27 19 19 40 25 35 65 8 11 29 ...
##  $ Freq.45.54: num  31 10 21 52 36 41 84 11 18 36 ...
##  $ Freq.55.64: num  34 14 31 53 47 49 81 18 20 32 ...
##  $ Freq.65.74: num  24 27 49 69 56 52 107 27 28 28 ...

head(data.frame.table_dta2)

##      Var1  Var2 Freq.25.34 Freq.35.44 Freq.45.54 Freq.55.64 Freq.65.74
## 1  Canada 25-34         22         27         31         34         24
## 2  Israel 25-34          9         19         10         14         27
## 3   Japan 25-34         22         19         21         31         49
## 4 Austria 25-34         29         40         52         53         69
## 5  France 25-34         16         25         36         47         56
## 6 Germany 25-34         28         35         41         49         52

#rename
names(data.frame.table_dta2)<- c("Country","Age_group","Age25-34", "Age35-44", "Age45-54", "Age55-64","Age65-74")

#Side-By-Side Horizontal Boxplots
boxplot(data.frame.table_dta2[,3:7],
        horizontal=T,   #T=水平放置box-plot
        varwidth=T,     #T=box的寬度與樣本量的平方根成比例
        col = colors(), #也可以指定喜歡的顏色col="Khaki"
        cex.axis=.8,    #放大座標標記
        pch = 4,        #離群點的type
        lwd = 1.2,      #線的寬度
        main="Distribution of Male Suicides", 
        xlab="Deaths per 100,000 from male suicides", 
        ylab="Age groups")

abline(v=seq(10, 100, 10),lty=3, #畫垂直輔助線
       col='gray')

#顏色更換參考：https://htmlcolorcodes.com/colors/

結論：每10萬名男性平均自殺人數有隨著年齡增加的趨勢。

#講義p.10是用plot()來畫boxplot
#好奇轉成data.frame.table是否也能有相同效果，結果不行....
plot(data.frame.table_dta2[,3:7])

Q3

Use the dataset to replicate the plot below: Source: Wainer, H. (2001). Uneducated Guesses. Using Evidence to Uncover Misguided Education Policies.

# input data
dta3<- read.table("C:/Users/Ching-Fang Wu/Documents/dataM/sat_gpa.txt", h=T)

head(dta3)

##           College SAT_No GPA_No SAT_Yes GPA_Yes
## 1         Barnard   1210   3.08    1317    3.30
## 2    Northwestern   1243   3.10    1333    3.24
## 3         Bowdoin   1200   2.85    1312    3.12
## 4           Colby   1220   2.90    1280    3.04
## 5 Carnegie Mellon   1237   2.70    1308    2.94
## 6    Georgia Tech   1233   2.62    1287    2.80

str(dta3)

## 'data.frame':    6 obs. of  5 variables:
##  $ College: chr  "Barnard" "Northwestern" "Bowdoin" "Colby" ...
##  $ SAT_No : int  1210 1243 1200 1220 1237 1233
##  $ GPA_No : num  3.08 3.1 2.85 2.9 2.7 2.62
##  $ SAT_Yes: int  1317 1333 1312 1280 1308 1287
##  $ GPA_Yes: num  3.3 3.24 3.12 3.04 2.94 2.8

# Segments作法參考講義p.36-37
#第一步畫起始點
with(dta3, plot(SAT_No, GPA_No))

#第二步畫終點
with(dta3, plot(SAT_Yes, GPA_Yes))

#第三步將兩點連線
with(dta3, segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes))

接下來進行優化，並調整X軸Y軸尺度。

#第一步畫起始點(實心點)
with(dta3, plot(SAT_No, GPA_No, 
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=19, #16、19實心點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#第一步畫起始點
with(dta3, plot(SAT_No, GPA_No, 
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=19, #16、19實心點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#第二步畫終點(空心點)
with(dta3, plot(SAT_Yes, GPA_Yes,
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=1))

with(dta3, plot(SAT_No, GPA_No, 
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=19, #16、19實心點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

with(dta3, plot(SAT_Yes, GPA_Yes,
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=1, #空心點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

不懂為何點圖沒有疊加上去，而是變成兩張圖?

#第一步先畫框
with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #p:畫點,l:畫線,n:不畫任何點與線
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#第一步先畫框
with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #p:畫點,l:畫線,n:不畫任何點與線
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#第二步畫實心點和空心點
with(dta3, plot(SAT_No, GPA_No, 
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=19))

#畫空心點
with(dta3, plot(SAT_Yes, GPA_Yes,
                  type="p", #p:畫點,l:畫線,n:不畫任何點與線
                  pch=1)) #空心點

我不懂…為何還是無法在同一張圖上!?

以下循著昶濬及婉禎的作法練習操作一次。

par(pty="s",mgp=c(2,1,0))
par(mar=c(3,3,0,0)+0.2)
par(oma=c(3,0,3,0))


with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #不要呈現點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#先畫線
#先畫"Bowdoin"，不然"Colby"會被蓋掉
with(dta3[c(3),], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=2,  col="grey"))
with(dta3[-3,], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=1,  col="black"))

par(pty="s",mgp=c(2,1,0))
par(mar=c(3,3,0,0)+0.2)
par(oma=c(3,0,3,0))


with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #不要呈現點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#先畫線
#先畫"Bowdoin"，不然"Colby"會被蓋掉
with(dta3[c(3),], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=2,  col="grey"))
with(dta3[-3,], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=1,  col="black"))

#再畫點
with(dta3, points(SAT_No, GPA_No, pch=21, bg="black", cex=1.5))
with(dta3, points(SAT_Yes, GPA_Yes, pch=21, bg="white", cex=1.5))

par(pty="s",mgp=c(2,1,0))
par(mar=c(3,3,0,0)+0.2)
par(oma=c(3,0,3,0))


with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #不要呈現點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#先畫線
#先畫"Bowdoin"，不然"Colby"會被蓋掉
with(dta3[c(3),], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=2,  col="grey"))
with(dta3[-3,], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=1,  col="black"))

#再畫點
with(dta3, points(SAT_No, GPA_No, pch=21, bg="black", cex=1.5))
with(dta3, points(SAT_Yes, GPA_Yes, pch=21, bg="white", cex=1.5))

# Add text at coordinates
## dta3[-3,]不要畫"Bowdoin", cex為text size
with(dta3[-3,], text(SAT_Yes+44, GPA_Yes, labels=College, cex=.8))
# 透過font讓Bowdoin變粗(1=plain, 2=bold, 3=italic, 4=bold-italic)
with(dta3, text(1312+44, 3.12, labels="Bowdoin", font = 2, cex=.8))

par(pty="s",mgp=c(2,1,0))
par(mar=c(3,3,0,0)+0.2)
par(oma=c(3,0,3,0))


with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #不要呈現點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#先畫線
#先畫"Bowdoin"，不然"Colby"會被蓋掉
with(dta3[c(3),], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=2,  col="grey"))
with(dta3[-3,], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=1,  col="black"))

#再畫點
with(dta3, points(SAT_No, GPA_No, pch=21, bg="black", cex=1.5))
with(dta3, points(SAT_Yes, GPA_Yes, pch=21, bg="white", cex=1.5))

# Add text at coordinates
## dta3[-3,]不要畫"Bowdoin", cex為text size
with(dta3[-3,], text(SAT_Yes+44, GPA_Yes, labels=College, cex=.8))
# 透過font讓Bowdoin變粗(1=plain, 2=bold, 3=italic, 4=bold-italic)
with(dta3, text(1312+44, 3.12, labels="Bowdoin", font = 2, cex=.8))

#legend
legend("topleft", c("Submitted SAT Scores", "Did NOT Submit SAT Scores"),
       pch=c(21,19),
       bg=c("white","black"),
       pt.cex=1, bty="n", cex = .8)

par(pty="s",mgp=c(2,1,0))
par(mar=c(3,3,0,0)+0.2)
par(oma=c(3,0,3,0))


with(dta3, plot(SAT_No, GPA_No, 
                  type="n", #不要呈現點
                  xlim=c(1150, 1400), 
                  ylim=c(2.6,3.4),
                  xlab="SAT (V+M)",
                  ylab="First Year GPA"),
                  cex.axis=1, cex.lab=1)

#先畫線
#先畫"Bowdoin"，不然"Colby"會被蓋掉
with(dta3[c(3),], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=2,  col="grey"))
with(dta3[-3,], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=1,  col="black"))

#再畫點
with(dta3, points(SAT_No, GPA_No, pch=21, bg="black", cex=1.5))
with(dta3, points(SAT_Yes, GPA_Yes, pch=21, bg="white", cex=1.5))

# Add text at coordinates
## dta3[-3,]不要畫"Bowdoin", cex為text size
with(dta3[-3,], text(SAT_Yes+44, GPA_Yes, labels=College, cex=.8))
# 透過font讓Bowdoin變粗(1=plain, 2=bold, 3=italic, 4=bold-italic)
with(dta3, text(1312+44, 3.12, labels="Bowdoin", font = 2, cex=.8))

#legend
legend("topleft", c("Submitted SAT Scores", "Did NOT Submit SAT Scores"),
       pch=c(21,19),
       bg=c("white","black"),
       pt.cex=1, bty="n", cex = .8)

# axis
axis(1, seq(1150,1400, length=16), labels=F)
axis(2, seq(2.6,3.4, length=21), labels=F)

# annotation
mtext("Figure 1.4 The mean SAT coupled with the mean first-year GPA for the class of \n 1999 at six schools  shown for those who submitted SAT scores for admis- \n sion and those who did not", side=1, cex=.8, adj = 0, line=2, outer = T)

DataM L6 in-class exercise

Ching-Fang Wu

Sun Nov 21 15:09:40 2021

Q1

ChickWeight{datasets} example

nlschools{MASS} dataset

Q2

Q3