Exercise 1

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

使用 woman 資料繪圖,先不要繪製任何圖,將第一列資料以點畫上

plot(women, type='n')
points(women[1,])

第一筆資料為 (58, 115) ,在圖上位置正確。

women[1,]
##   height weight
## 1     58    115

使用 lattice 繪製身高與體重的 xyplot ,選取 women 資料裡第一列的子集資料以點繪製

lattice::xyplot(weight ~ height, 
  data=women,
  subset=row.names(women)==1, type='p')

使用 ggplot2 繪製 women 的第一列資料,x 軸為身高, y 軸為體重,使用點圖呈現

library(ggplot2)
ggplot(data=women[1,], aes(height, weight))+
  geom_point()

Exercise 2

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.

The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below.

Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

讀取資料

dta2 <- read.table("C:/Users/TheorEco Lab/Desktop/108-2/DataManagement/0420/langMathDutch.txt", header = T)
str(dta2)
## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score

將 class size 和 verbal IQ 依指示訂 levels

class size -> Small, Medium, Large

quantile(dta2$size, probs=seq(from=0, to=1, by=.33))
##  0% 33% 66% 99% 
##   5  20  27  37
dta2$sizef <- with(dta2, cut(size, ordered=T, breaks=c(0, 19, 27, 50), labels=c("Small", "Medium", "Large")))
with(dta2, table(sizef))
## sizef
##  Small Medium  Large 
##    749    837    701

Verbal IQ score -> Low, Middle, High

quantile(dta2$IQV, probs = seq(from=0, to=1, by=.33))
##   0%  33%  66%  99% 
##  4.0 11.0 12.5 17.0
dta2$IQVf <- with(dta2, cut(IQV, order=T, breaks = c(0, 11, 12.5, 20), labels = c("Low", "Middle", "High")))
with(dta2, table(IQVf))
## IQVf
##    Low Middle   High 
##    825    758    704

ggplot 繪圖

library(ggplot2)

建立底層

p20 <- ggplot(data=dta2, 
             aes(x=lang, y=arith))
p20

加上資料點圖

p21 <- p20 +
  geom_point()
p21

加上迴歸線和信賴區間

p22 <- p21 +
  stat_smooth(data=dta2, 
              formula=y ~ x,
              method='lm', 
              se=TRUE)
p22

加上座標軸標籤

p23 <- p22 +
  labs(x="Language score", 
       y="Arithmetic score")
p23

分組

p24 <- p23 + 
  facet_wrap(. ~ sizef + IQVf, labeller=labeller(.multi_line=F))
p24

這裡參考吳同學提供的 labeller=labeller(.multi_line=F) 的用法,這裡facet_wrap 可以做的調整,而這裡是有關 labeller 的功能。

調整座標軸

p25 <- p24 +
  scale_x_continuous(limits=c(8, 60), 
                    breaks=seq(10, 50, 10)) +
  scale_y_continuous(limits=c(0, 35), 
                    breaks=seq(5, 30, 5))
p25

Exercise 3

Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories; food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page (as below). You might want to transform the dollar amounts to log base 10 unit first.

查看資料結構

head(datasets::USPersonalExpenditure)
##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

重整資料

將單一類別不同年份的花費融合成年份變數和對應的值

library(reshape2)
dta3 <- melt(datasets::USPersonalExpenditure[,1:5], id=col(datasets::USPersonalExpenditure, as.factor = TRUE))
head(dta3)
##                  Var1 Var2  value
## 1    Food and Tobacco 1940 22.200
## 2 Household Operation 1940 10.500
## 3  Medical and Health 1940  3.530
## 4       Personal Care 1940  1.040
## 5   Private Education 1940  0.341
## 6    Food and Tobacco 1945 44.500

命名變數

names(dta3) <- c("Category", "Year", "Expend")
dta3$Year <- as.factor(dta3$Year)
head(dta3)
##              Category Year Expend
## 1    Food and Tobacco 1940 22.200
## 2 Household Operation 1940 10.500
## 3  Medical and Health 1940  3.530
## 4       Personal Care 1940  1.040
## 5   Private Education 1940  0.341
## 6    Food and Tobacco 1945 44.500

新增取 log 欄位

dta3$log_expend <- log10(dta3$Expend)
head(dta3)
##              Category Year Expend  log_expend
## 1    Food and Tobacco 1940 22.200  1.34635297
## 2 Household Operation 1940 10.500  1.02118930
## 3  Medical and Health 1940  3.530  0.54777471
## 4       Personal Care 1940  1.040  0.01703334
## 5   Private Education 1940  0.341 -0.46724562
## 6    Food and Tobacco 1945 44.500  1.64836001

使用 ggplot 繪製 lolipop plot

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
p30 <- ggplot(dta3, 
              aes(x = log_expend, y = reorder(Category, desc(Category)))) +
        # 這裡刻意讓 y 軸的類別排序按原順序由上到下
  geom_point() +
  labs(x = "log of Expenditures (in billions of dollars originally", 
       y = "Category")

p30

p31 <- p30 +
  facet_grid(. ~ Year)

p31

p32 <- p31 + 
  geom_segment(aes(xend = mean(log_expend), 
                   yend = Category))
p32

p33 <- p32 +
  geom_vline(aes(xintercept=mean(log_expend)))

p33

p34 <- p33 +
  theme(axis.text.x = element_text(color = "black", size = rel(.7)), 
        axis.text.y = element_text(color = "black", size = rel(.9)))

p34

Exercise 4

A sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.

Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.

Data: autism{WWGbook}

Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID

Replicate the two plots above using ggplot2.

讀取資料

# install.packages("WWGbook")
dta4 <- na.omit(WWGbook::autism)
str(dta4)
## 'data.frame':    610 obs. of  4 variables:
##  $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
##  $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
##  $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
##  $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
##  - attr(*, "na.action")= 'omit' Named int  507 553
##   ..- attr(*, "names")= chr  "507" "553"
dta4$sicd <- factor(dta4$sicdegp, levels = c(1, 2, 3), labels = c("L", "M", "H"))
str(dta4)
## 'data.frame':    610 obs. of  5 variables:
##  $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
##  $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
##  $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
##  $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
##  $ sicd   : Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
##  - attr(*, "na.action")= 'omit' Named int  507 553
##   ..- attr(*, "names")= chr  "507" "553"

Plot 1 Layer 1: 建畫布,以點線圖放入座標軸資料,加入迴歸線

p40 <- ggplot(dta4, aes(x = scale(age, center = TRUE, scale = FALSE), y = vsae)) +
  geom_line(aes(group = childid)) + 
  geom_point(size = rel(2), alpha = .5) +
   stat_smooth(data=dta4, 
              formula=y ~ x,
              method='lm', 
              se=TRUE)

p40

這裡使用 dcale(x, center=..., scale=...) 來中心化年齡資料,以年齡資料的平均數為中心 0,但不採用 se 重建尺度 (scale = FALSE)。

Plot 1 Layer 2: 分組,座標軸命名

p41 <- p40 +
  facet_grid(. ~ sicd) +
  labs(x = "Age (in years, centered)", 
       y = "VSAE score")

p41

Plot 1 Layer 3: 調整座標軸和主題

p42 <- p41 +
  scale_x_continuous(breaks = seq(-2.5, 5, 2.5)) +
  theme_bw()

p42

Plot 2

dta4$age_2 <- dta4$age-2
pd <- position_dodge(.3)
p43 <- dta4 %>% group_by(sicd, age_2) %>%
  summarize(m=mean(vsae), 
            se=sd(vsae)/sqrt(n())) %>%
  
  ggplot() + 
  aes(age_2, m, 
      group=sicd, 
      shape=sicd) +
  geom_errorbar(aes(ymin=m - se,
                    ymax=m + se),
                width=.2, size=.3, 
                position = pd) +
  geom_line(aes(linetype = sicd), position = pd) +
  geom_point(position = pd, 
             size=rel(3)) +
  scale_shape_manual(values = c(1, 2, 16), name="Group") + 
  labs(x="Age (in years - 2)", y="VSAE score")
  

p43

p44 <- p43 +
  scale_linetype_discrete(name = "Group") +
  theme_bw() +
  theme(legend.background = element_rect(linetype = "solid", color = "black"),
        legend.position=c(.09, .8), 
        legend.key = element_rect(color = "gray"), 
        ) 

p44

Exercise 5

Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.

讀取資料

dta5 <- read.csv("C:/Users/TheorEco Lab/Desktop/108-2/DataManagement/0420/diabetes_mell.csv", header = T)
head(dta5,3)
##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender  race diabetes           BMI
## 1 51624        1        3      2  32.22  Males White       No    Overweight
## 2 51626        1        4      2  22.00  Males Black       No Normal weight
## 3 51627        1        4      2  18.22  Males Black       No Normal weight
str(dta5)
## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...

整理資料

dta5m <- data.frame(with(dta5[, c("race", "gender", "diabetes", "BMI")], 
                          xtabs(~ race + gender + diabetes + BMI)))
head(dta5m, 3)
##       race  gender diabetes           BMI Freq
## 1    Black Females       No Normal weight  347
## 2 Hispanic Females       No Normal weight  712
## 3    White Females       No Normal weight  998
str(dta5m)
## 'data.frame':    24 obs. of  5 variables:
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 1 2 3 1 2 3 1 2 3 1 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 1 1 1 2 2 2 1 1 1 2 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : int  347 712 998 429 706 873 6 11 12 15 ...
dta5m$race <- factor(dta5m$race, levels(dta5m$race)[c(1, 3, 2)])
levels(dta5m$race)[1:3] <- c("Black", "White", "Hispanic")
str(dta5m)
## 'data.frame':    24 obs. of  5 variables:
##  $ race    : Factor w/ 3 levels "Black","White",..: 1 3 2 1 3 2 1 3 2 1 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 1 1 1 2 2 2 1 1 1 2 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : int  347 712 998 429 706 873 6 11 12 15 ...
library(ggalluvial)
p5 <- ggplot(dta5m, 
       aes(axis1 = race,
           axis2 = gender, 
           axis3 = diabetes, 
           y = Freq)) +
  scale_x_discrete(limits=c("race", 
                            "gender", 
                            "diabetes"), 
                   expand=c(.1, .05)) +
  labs(y = "No. individuals") +
  geom_alluvium(aes(fill=BMI), reverse = FALSE) +
  geom_stratum(reverse = FALSE) + 
  geom_text(stat="stratum", 
            infer.label=TRUE, reverse = FALSE) +
  scale_fill_manual(values=c('orange','darkgrey'))+
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010", 
          subtitle = "stratified by race, gender and diabetes mellitus")

p5

Exercise 6

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

載入 ggplot2 package,從 Help 調出 ggplot2 的說明

library(ggplot2)
?ggplot2
## starting httpd help server ... done

安裝並載入 gapminder package

#install.packages("gapminder")
library(gapminder)

載入 gapminder 裡的 data,查看資料結構

data(gapminder)
str(gapminder)
## tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

gapminder data 另存成 gap

gap <- gapminder

使用 ggplot 繪製資料 gap 的 layer 1, x 軸為 lifeExp

ggplot(data = gap, aes(x = lifeExp))

在 layer 1 上加上 histogram

ggplot(data = gap, aes(x = lifeExp)) + 
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

將 histogram 的底設定為藍色,邊框為黑色, binwidth 設定為 10,並加上圖表標題、座標軸名稱,最後使用主題 classic

ggplot(data = gap, aes(x = lifeExp)) + 
  geom_histogram(fill = "blue", color = "black", bins = 10) + 
  ggtitle("Life expectancy for the gap dataset") + 
  xlab("Life expectancy (years)") + 
  ylab("Frequency") + 
  theme_classic() 

以 boxplot 呈現 gap 資料的 continent 和 lifeExp 的關係

ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + 
  geom_boxplot() + 
  ggtitle("Boxplots for lifeExp by continent") + 
  xlab("Continent") + 
  ylab("Life expectancy (years)") +
  theme_minimal() # + 

  # guides(fill = FALSE) 

What happens if you un-hashtage guides(fill = FALSE) and the plus sign in lines 68 and 69 above?

將 hashtage 移除,圖例標籤被移除了

ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + 
  geom_boxplot() + 
  ggtitle("Boxplots for lifeExp by continent") + 
  xlab("Continent") + 
  ylab("Life expectancy (years)") +
  theme_minimal()  + 
  guides(fill = FALSE) 

依據 gap 資料裡的 continent 分組,繪製 lifeExp 與 gdpPercap 的點圖

ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20),
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 5),
          axis.text.x = element_text(angle = 45, hjust = 1)) 

In lines the ggplot code above, what are the arguments inside of our second “theme” argument doing?

第二組 theme 所表現的是整張圖的設計,包含:

圖例位置:top

ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top") 

調整圖表標題的位置與大小

ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20)) 

調整圖例大小

ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20),
          legend.title = element_text(size = 10)) 

調整圖例中文字的大小

ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20),
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 5)) 

調整 x 軸的標籤,傾斜 45 度

ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) + 
    theme_classic() +
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20),
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 5),
          axis.text.x = element_text(angle = 45, hjust = 1)) 

The End

Let it all hang out in informatics graphics.