20211129_Grammar of graphics

In-class1

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

# Load the built-in `women` data set (height and weight).
data("women")

# Base graphics: open an empty canvas (type = "n") scaled to the whole data
# set, then draw only the first observation on it.
plot(women, type = "n")
points(women[1, , drop = FALSE])

# lattice: scatterplot of weight against height, restricted through `subset`
# to the observation whose row name is 1 and drawn as points (type = "p").
lattice::xyplot(
  weight ~ height,
  data = women,
  subset = row.names(women) == 1,
  type = "p"
)

# ggplot2: plot components are chained with `+`; aes() maps variables to
# aesthetics and geom_point() draws the observations as points.
library(ggplot2)
ggplot(women[1, ], mapping = aes(x = height, y = weight)) +
  geom_point()

# Base graphics draw as a side effect: the value assigned to w1 is what
# plot() returns, not the figure itself.
w1 <- plot(women, type = "n")

# Printing w1 shows that nothing was stored (NULL).
w1

## NULL

# lattice returns the plot as an object, so it can be stored in w2 ...
w2 <- lattice::xyplot(x = weight ~ height, data = women)

# ... and the figure is drawn when that object is printed.
w2

# The base-graphics call stored nothing in w1, so its class is "NULL".
class(w1)

## [1] "NULL"

# w2 holds a lattice plot object; its class is "trellis".
class(w2)

## [1] "trellis"

# List the S3 methods that are available for objects of class "trellis".
 methods(class='trellis')

## [1] [          dim        dimnames   dimnames<- plot       print      summary   
## [8] t          update    
## see '?methods' for accessing help and source code

#總結:以上了解三種畫圖型態的比較，base、lattice、ggplot2，個人喜歡ggplot2，用+來代表後面所要增加的細部工作，覺得很直覺使用上很方便。

In-class2

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are the score on a language test and the score on an arithmetic test. The research interest is in how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.

The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below.

Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

Data

Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score

Data Wrangle

# Read the whitespace-delimited language/arithmetic data; the first line of
# the file holds the column names. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
dat <- read.table("langMathDutch.txt", header = TRUE)

# Preview the first rows.
head(dat)

##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13

# Show the structure of dat: column names, types and leading values.
str(dat)

## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

# Summary statistics for every column of dat.
summary(dat)

##      school        pupil              IQV             size           lang      
##  Min.   :  1   Min.   :  17001   Min.   : 4.00   Min.   : 5.0   Min.   : 9.00  
##  1st Qu.: 67   1st Qu.: 677013   1st Qu.:10.50   1st Qu.:17.0   1st Qu.:35.00  
##  Median :141   Median :1417007   Median :12.00   Median :24.0   Median :42.00  
##  Mean   :133   Mean   :1337225   Mean   :11.83   Mean   :23.1   Mean   :40.93  
##  3rd Qu.:195   3rd Qu.:1957204   3rd Qu.:13.00   3rd Qu.:28.0   3rd Qu.:48.00  
##  Max.   :258   Max.   :2587010   Max.   :18.00   Max.   :37.0   Max.   :58.00  
##      arith      
##  Min.   : 2.00  
##  1st Qu.:14.00  
##  Median :20.00  
##  Mean   :19.44  
##  3rd Qu.:25.00  
##  Max.   :30.00

# Cut points for splitting verbal IQ into three roughly equal groups: the
# 33rd and 66th percentiles (the 99th percentile is printed alongside).
quantile(dat$IQV, probs = c(0.33, 0.66, 0.99))

##  33%  66%  99% 
## 11.0 12.5 17.0

# Bin verbal IQ into the ordered three-level factor IQVf. cut() uses
# right-closed intervals by default: (0, 11] = Low, (11, 12.5] = Middle,
# (12.5, 18] = High.
# `ordered_result = TRUE` is spelled out in full; the original relied on
# partial argument matching (`order`) and on the reassignable shorthand `T`.
dat$IQVf <- cut(
  dat$IQV,
  breaks = c(0, 11, 12.5, 18),
  labels = c("Low", "Middle", "High"),
  ordered_result = TRUE
)

# Cut points for splitting class size into three roughly equal groups: the
# 33rd and 66th percentiles (the 99th percentile is printed alongside).
quantile(dat$size, probs = c(0.33, 0.66, 0.99))

## 33% 66% 99% 
##  20  27  37

# Bin class size into the ordered three-level factor sizef. cut() uses
# right-closed intervals by default: (0, 20] = Small, (20, 27] = Middle,
# (27, 50] = High.
# `ordered_result = TRUE` is spelled out in full; the original relied on
# partial argument matching (`order`) and on the reassignable shorthand `T`.
# NOTE(review): the exercise text names the levels small/medium/large; the
# labels are left unchanged here so they keep matching the rendered output.
dat$sizef <- cut(
  dat$size,
  breaks = c(0, 20, 27, 50),
  labels = c("Small", "Middle", "High"),
  ordered_result = TRUE
)

# Check the data again, now including the two new factor columns.
head(dat)

##   school pupil  IQV size lang arith IQVf sizef
## 1      1 17001 15.0   29   46    24 High  High
## 2      1 17002 14.5   29   45    19 High  High
## 3      1 17003  9.5   29   33    24  Low  High
## 4      1 17004 11.0   29   46    26  Low  High
## 5      1 17005  8.0   29   20     9  Low  High
## 6      1 17006  9.5   29   30    13  Low  High

Plot

# facet_wrap() draws one panel per sizef x IQVf combination and wraps the
# panels into rows; each panel shows the points plus a smoother.
ggplot(dat, aes(x = lang, y = arith)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(. ~ sizef + IQVf) +
  labs(x = "Language score", y = "Arithmetic score")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# facet_grid() can facet by two or more variables, arranged by row or by
# column. Both variables sit on the right-hand side of the formula here, so
# all panels are laid out horizontally as columns.
ggplot(dat, aes(x = lang, y = arith)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ sizef + IQVf) +
  labs(x = "Language score", y = "Arithmetic score")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##總結：facet的這個部分總共有facet_grid, facet_null, 及facet_wrap三個形態，常用facet_grid。

In-class3

A sample of 158 children with autism spectrum disorder was recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.

Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.

Data: autism{WWGbook}

Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID

library(ggplot2)
library(magrittr)
library(dplyr)

## 
## 載入套件：'dplyr'

## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag

## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the autism data shipped with the WWGbook package and work on a copy
# named dat3.
data("autism", package = "WWGbook")
dat3 <- autism
head(dat3)

##   age vsae sicdegp childid
## 1   2    6       3       1
## 2   3    7       3       1
## 3   5   18       3       1
## 4   9   25       3       1
## 5  13   27       3       1
## 6   2   17       3       3

# Show the structure of dat3: column names, types and leading values.
str(dat3)

## 'data.frame':    612 obs. of  4 variables:
##  $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
##  $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
##  $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
##  $ childid: int  1 1 1 1 1 3 3 3 3 3 ...

# Recode sicdegp (1, 2, 3) as the labelled factor sicdegpf (L, M, H).
dat3$sicdegpf <- factor(
  dat3$sicdegp,
  levels = c(1, 2, 3),
  labels = c("L", "M", "H")
)

head(dat3)

##   age vsae sicdegp childid sicdegpf
## 1   2    6       3       1        H
## 2   3    7       3       1        H
## 3   5   18       3       1        H
## 4   9   25       3       1        H
## 5  13   27       3       1        H
## 6   2   17       3       3        H

# Individual VSAE trajectories against mean-centred age, one panel per SICD
# group, with a linear fit per panel.
# TRUE/FALSE are spelled out; `T` and `F` are reassignable variables.
ggplot(dat3, aes(x = scale(age, center = TRUE, scale = FALSE),
                 y = vsae)) +
  geom_line(aes(group = childid)) +  # one line per child
  geom_point(alpha = .5) +           # alpha sets the point transparency
  # geom_smooth() (or stat_smooth()) with method = "lm" adds the linear
  # regression line; se = TRUE draws its confidence band.
  geom_smooth(method = "lm", se = TRUE) +
  facet_grid(. ~ sicdegpf) +
  labs(x = "Age (in years, centered)", y = "VSAE score") +
  theme_minimal()                    # built-in minimalist theme

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_point).

# Shift age so that 2 years maps to 0.
dat3$Age2 <- dat3$age - 2

# Shared horizontal offset so that the groups do not overplot each other.
pd <- position_dodge(width = .3)

# Per-group mean VSAE and its standard error at each age, computed on
# complete rows only (na.omit), then drawn as dodged points joined by dotted
# lines with +/- 1 SE error bars. %>% comes from magrittr, summarize() and
# n() from dplyr.
p <- dat3 %>%
  na.omit() %>%
  group_by(sicdegpf, Age2) %>%
  summarize(
    m_p = mean(vsae),
    se_p = sd(vsae) / sqrt(n()),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = Age2, y = m_p, group = sicdegpf, shape = sicdegpf)) +
  geom_errorbar(
    aes(ymin = m_p - se_p, ymax = m_p + se_p),
    width = .2,
    size = .3,
    position = pd
  ) +
  geom_line(position = pd, linetype = "dotted") +
  geom_point(position = pd, size = rel(3)) +  # larger value, larger points
  scale_shape(guide = guide_legend(title = "Group")) +
  labs(x = "Age (in years - 2)", y = "VSAE score") +
  theme(legend.position = c(.1, .9))
p

In-class4

Use the diabetes dataset to generate a plot similar to the one below and interpret the plot.

# ggalluvial supplies the alluvial-plot geoms used for this exercise.
pacman::p_load(ggalluvial)

# Read the diabetes data; TRUE is spelled out because `T` is reassignable.
dat4 <- read.csv("diabetes_mell.csv", header = TRUE)
head(dat4)

##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight

# Show the structure of dat4: column names, types and leading values.
str(dat4)

## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : chr  "Males" "Males" "Males" "Females" ...
##  $ race    : chr  "White" "Black" "Black" "Black" ...
##  $ diabetes: chr  "No" "No" "No" "Yes" ...
##  $ BMI     : chr  "Overweight" "Normal weight" "Normal weight" "Overweight" ...

# Tidy the data: cross-tabulate gender x race x diabetes x BMI and flatten
# the table into a data frame with one row per cell and its count in `Freq`.
dat4_v1 <- data.frame(
  xtabs(
    ~ gender + race + diabetes + BMI,
    data = dat4[, c("gender", "race", "diabetes", "BMI")]
  )
)

head(dat4_v1)

##    gender     race diabetes           BMI Freq
## 1 Females    Black       No Normal weight  347
## 2   Males    Black       No Normal weight  429
## 3 Females Hispanic       No Normal weight  712
## 4   Males Hispanic       No Normal weight  706
## 5 Females    White       No Normal weight  998
## 6   Males    White       No Normal weight  873

# Alluvial plot: the three axes are race, diabetes status and BMI class; the
# flow widths are the cell counts and the flows are coloured by gender.
ggplot(dat4_v1, aes(y = Freq, axis1 = race, axis2 = diabetes, axis3 = BMI)) +
  geom_alluvium(aes(fill = gender)) +  # fill the flows by gender
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(
    limits = c("race", "diabetes", "BMI"),
    expand = c(.1, .5)
  ) +
  scale_fill_manual(values = c("darkgreen", "hotpink3")) +  # chosen colours
  labs(x = "Health and Behavior between Races", y = "Count") +
  ggtitle("Subjects stratified by race, diabetes, BMI, and gender") +
  theme_minimal()

Exercises 1

Fifty male and fifty female students fill out the same questionnaire in weekly intervals starting five weeks before an important examination to measure state anxiety.

The research interests are: 1. whether there are gender difference in state anxiety 2. individual differences in state anxiety. Explore the answers to both questions with plots involving confidence intervals or error bars for the means.

Source: Von Eye, A., & Schuster C. (1998). Regression Analysis for Social Sciences. San Diego: Academic Press.

# Load (installing if needed) the packages used for this exercise.
pacman::p_load(ggplot2, readr, tidyr)

# Read the state-anxiety scores; the first line holds the column names.
# TRUE is spelled out because `T` is reassignable.
Edat1 <- read.table("stateAnxiety.txt", header = TRUE)
head(Edat1) |> knitr::kable()

f1	f2	f3	f4	f5	m1	m2	m3	m4	m5
13	17	18	20	24	6	14	22	20	24
26	31	33	38	42	4	11	14	12	23
13	17	24	29	32	17	25	26	29	38
22	24	26	27	29	19	22	26	30	34
18	19	19	22	30	12	21	21	23	24
32	31	30	31	32	11	16	20	19	22

# Show the structure of Edat1: column names, types and leading values.
str(Edat1)

## 'data.frame':    50 obs. of  10 variables:
##  $ f1: int  13 26 13 22 18 32 16 18 14 20 ...
##  $ f2: int  17 31 17 24 19 31 16 22 17 19 ...
##  $ f3: int  18 33 24 26 19 30 21 25 23 23 ...
##  $ f4: int  20 38 29 27 22 31 27 29 21 25 ...
##  $ f5: int  24 42 32 29 30 32 30 35 25 28 ...
##  $ m1: int  6 4 17 19 12 11 14 9 12 11 ...
##  $ m2: int  14 11 25 22 21 16 23 18 16 13 ...
##  $ m3: int  22 14 26 26 21 20 26 20 23 17 ...
##  $ m4: int  20 12 29 30 23 19 29 20 26 14 ...
##  $ m5: int  24 23 38 34 24 22 33 24 32 20 ...

# Reshape to one row per subject and week. Column names such as "f3" / "m3"
# hold the gender (letter) and the week (digit). For every input row
# pivot_longer() emits the f columns followed by the m columns, so each
# consecutive run of five rows is one subject and gets one ID.
Edat1_long <- Edat1 |>
  pivot_longer(
    cols = starts_with(c("f", "m")),
    names_to = "Week",
    values_to = "Score"
  ) |>
  mutate(
    Gender = gsub("[0-9]", "", Week),  # dplyr::mutate: drop digits -> f / m
    Week = parse_number(Week),         # keep the numeric week
    ID = rep(1:100, each = 5)
  )

# Show the first 11 rows as a plain data frame.
head(as.data.frame(Edat1_long), 11)

##    Week Score Gender ID
## 1     1    13      f  1
## 2     2    17      f  1
## 3     3    18      f  1
## 4     4    20      f  1
## 5     5    24      f  1
## 6     1     6      m  2
## 7     2    14      m  2
## 8     3    22      m  2
## 9     4    20      m  2
## 10    5    24      m  2
## 11    1    26      f  3

# Alternative (Rui-Ning's approach): stack() concatenates the columns into a
# single `values` vector and records the source column name in `ind`.
# `<-` replaces `=` for assignment, matching the rest of the file.
long <- stack(Edat1)
head(long)

##   values ind
## 1     13  f1
## 2     26  f1
## 3     13  f1
## 4     22  f1
## 5     18  f1
## 6     32  f1

# Report the underlying storage type of the reshaped table.
typeof(Edat1_long)

## [1] "list"

# Individual anxiety trajectories, one panel per gender. Author's reading of
# the plot: female students show higher anxiety than male students.
ggplot(Edat1_long, aes(x = Week, y = Score, group = ID)) +
  geom_line(aes(color = Gender)) +
  geom_point() +
  facet_wrap(vars(Gender)) +  # one panel per level of a single variable
  labs(x = "weeks before an important examination", y = "Anxiety score") +
  theme_minimal()

# Gender differences: one boxplot of the scores per week, one panel per
# gender.
ggplot(Edat1_long, aes(x = Week, y = Score, color = Gender)) +
  geom_boxplot(aes(group = Week)) +
  facet_wrap(~ Gender)

# Individual differences within each gender: one boxplot per subject, built
# from that subject's five weekly scores.
# The original mapped x = Week with group = ID. Week is numeric, so every
# subject's box was placed at the same x position and the boxes were drawn
# on top of each other. Putting the subject ID on the x axis gives each
# subject its own box; scales = "free_x" keeps only that gender's subjects
# in each panel.
ggplot(Edat1_long, aes(x = factor(ID), y = Score, color = Gender)) +
  geom_boxplot() +
  facet_wrap(~ Gender, scales = "free_x") +
  labs(x = "Subject ID", y = "Anxiety score")

Exercises 2

Use the markdown file to replicate the contents of Weissgerber, T.L., Milic, N.M., Winham, S.J., Garovic, V.D. (2015). Beyond Bar and Line Graphs: Time for a New Data Presentation Paradigm. PLOS Biology , 13. The two data sets are here provided: journal.pbio.1002128.s002.XLS and journal.pbio.1002128.s003.XLS . You can also download everything in a zip file from this location.

#Interpreting the plots

The short horizontal black lines in the plots show the median values. The mean should not be shown for data that are analyzed non-parametrically, as these data do not meet the distributional assumptions required to calculate the mean. If your data meet the assumptions required for parametric testing the graphs can be changed to show the mean instead of the median. To use the mean with the code below, simply replace “median” with “mean” where it occurs in the chunks below. Open circles show measurements for each participant or observation.

## Registered S3 method overwritten by 'printr':
##   method                from     
##   knit_print.data.frame rmarkdown

#Independent data, points not jittered (no overlapping points)

Use these next few chunks of code to create scatterplots for independent data in two to five groups, when there are no overlapping points within any group. Independent data means that the variable of interest is measured one time in each subject, and subjects are not related to each other. If your data do not meet these criteria, see the sections further below for paired or non-independent data. Overlapping points means that two subjects have values that are so close that they will overlap on the graph and you will not be able to see both points clearly. If your data have overlapping points, use the code in the “Points Jittered” section below.

First step is to read in the data from Excel spreadsheet (which needs to be in the same folder as this source file, or else you need to specify the full path to the file on your computer), then extract the specific rows and columns that contain the data to be plotted. The table below shows the format of the data in the Excel sheet. This chunk below can be adapted to read data from any Excel file, just change the file name and adapt the subsetting line so that it captures the relevant rows and columns in the Excel file.

library(readxl)
# Read the first sheet of the PLOS Biology supplementary workbook and keep
# only the cell range holding groups 1-5 ('No overlapping points' sheet).
independent_data <-
  read_excel("journal.pbio.1002128.s002.XLSX", sheet = 1)[15:30, 2:6]
# The first retained row holds the column headings: promote it to names ...
names(independent_data) <- independent_data[1, ]
# ... and then drop it from the data.
independent_data <- independent_data[-1, ]
kable(independent_data)

Group 1	Group 2	Group 3	Group 4	Group 5
5	7	9	42	2
3	3	7	2	0
6	9	10	5	3
8	10	12	55	5
10	33	14	9	7
13	15	17	12	10
1	18	20	15	13
4	6	40	3	1
18	20	22	NA	15
4	30	35	NA	1
7	NA	42	NA	4
9	NA	13	NA	6
14	NA	NA	NA	11
15	NA	NA	NA	12
17	NA	NA	NA	14

An important step is reshaping the data from their current wide format to a more tidy long format. Long formats are most useful for plotting and statistical analysis in R. Here’s what the data look like in the long format:

# Wide -> long: stack the five group columns into `group` (column name) and
# `value` (measurement).
library(tidyr)
independent_data_long <- gather(
  independent_data,
  key = "group",
  value = "value",
  `Group 1`:`Group 5`,
  convert = TRUE
)
kable(independent_data_long)

group	value
Group 1	5
Group 1	3
Group 1	6
Group 1	8
Group 1	10
Group 1	13
Group 1	1
Group 1	4
Group 1	18
Group 1	4
Group 1	7
Group 1	9
Group 1	14
Group 1	15
Group 1	17
Group 2	7
Group 2	3
Group 2	9
Group 2	10
Group 2	33
Group 2	15
Group 2	18
Group 2	6
Group 2	20
Group 2	30
Group 2	NA
Group 2	NA
Group 2	NA
Group 2	NA
Group 2	NA
Group 3	9
Group 3	7
Group 3	10
Group 3	12
Group 3	14
Group 3	17
Group 3	20
Group 3	40
Group 3	22
Group 3	35
Group 3	42
Group 3	13
Group 3	NA
Group 3	NA
Group 3	NA
Group 4	42
Group 4	2
Group 4	5
Group 4	55
Group 4	9
Group 4	12
Group 4	15
Group 4	3
Group 4	NA
Group 4	NA
Group 4	NA
Group 4	NA
Group 4	NA
Group 4	NA
Group 4	NA
Group 5	2
Group 5	0
Group 5	3
Group 5	5
Group 5	7
Group 5	10
Group 5	13
Group 5	1
Group 5	15
Group 5	1
Group 5	4
Group 5	6
Group 5	11
Group 5	12
Group 5	14

Now we are ready to plot, starting with subsetting just groups 1 and 2 from the long data frame. Open circles show measurements for each participant or observation.

# plot
library(ggplot2)
library(dplyr)
# Keep only the rows belonging to groups 1 and 2.
independent_data_long_groups_1_and_2 <- independent_data_long %>%
  filter(group %in% c("Group 1", "Group 2"))
# Open circles: one per observation. Crossbar: the group median (its lower,
# middle and upper values are all the median, so it collapses to one line).
# as.numeric() coerces `value` in case it was read in as text.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
ggplot(independent_data_long_groups_1_and_2, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

Plotting groups 1, 2, and 3, the only thing that changes is the subsetting method:

# Keep only the rows belonging to groups 1, 2 and 3.
independent_data_long_groups_1_2_3 <- independent_data_long %>%
  filter(group %in% c("Group 1", "Group 2", "Group 3"))
# Open circles: one per observation. Crossbar: the group median.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
ggplot(independent_data_long_groups_1_2_3, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

Plotting groups 1 to 4:

# Keep only the rows belonging to groups 1, 2, 3 and 4.
independent_data_long_groups_1_2_3_4 <- independent_data_long %>%
  filter(group %in% c("Group 1", "Group 2", "Group 3", "Group 4"))
# Open circles: one per observation. Crossbar: the group median.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
ggplot(independent_data_long_groups_1_2_3_4, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

And finally plotting all five groups, no subsetting required:

# All five groups: open circles for the observations, crossbar at the group
# median. fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which
# were deprecated in ggplot2 3.3.0.
ggplot(independent_data_long, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

#Independent data, points jittered

Use the code chunks below to create scatterplots for independent data in two to five groups, when there are overlapping points in at least one group. Independent data means that the variable of interest is measured one time in each subject, and subjects are not related to each other. If your data do not meet these criteria, see the code chunks below for paired or non-independent data. Overlapping points means that two subjects have values that are so close that they will overlap on the plot, and you will not be able to see both points clearly. Adjust the width and height values in the position_jitter function to refine the jitter settings for the points so that they do not overlap. Note that I’ve set a jitter height value, which is not part of the Excel plots, but I think is quite effective at separating points.

First we get the data from the Excel spreadsheet:

library(readxl)
# Read the second sheet of the PLOS Biology supplementary workbook (the
# 'points jittered' sheet) and keep rows 16-115 of columns 2-3.
independent_data_j <-
  read_excel("journal.pbio.1002128.s002.XLSX", sheet = 2)[16:115, 2:3]
# The sheet carries no group numbers: label consecutive runs of 20 rows 1-5.
independent_data_j$Groups <- rep(c(1, 2, 3, 4, 5), each = 20)
# Final column names (the helper column `Groups` becomes `Group`).
names(independent_data_j) <- c("Subject ID", "Measurement", "Group")

The data are already in a nice tidy long format, with Group Name in one column and Measurement Values in another column, so we don’t need to reshape them. We can go directly to plotting them, first two groups, then three, then four, then all five groups. Once again the only thing that varies is how we subset the original data.

# plot
library(ggplot2)
library(dplyr)
# Keep only groups 1 and 2.
independent_data_j_groups_1_and_2 <- independent_data_j %>%
  filter(Group %in% 1:2)
# Jittered open circles: one per observation. Crossbar: the group median.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
ggplot(independent_data_j_groups_1_and_2, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

# Keep only groups 1, 2 and 3.
independent_data_j_groups_1_2_3 <- independent_data_j %>%
  filter(Group %in% 1:3)
# Jittered open circles: one per observation. Crossbar: the group median.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
ggplot(independent_data_j_groups_1_2_3, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

# Keep only groups 1, 2, 3 and 4.
independent_data_j_groups_1_2_3_4 <- independent_data_j %>%
  filter(Group %in% 1:4)
# Jittered open circles: one per observation. Crossbar: the group median.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
ggplot(independent_data_j_groups_1_2_3_4, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

# All five groups: jittered open circles for the observations, crossbar at
# the group median. fun / fun.min / fun.max replace fun.y / fun.ymin /
# fun.ymax, which were deprecated in ggplot2 3.3.0.
ggplot(independent_data_j, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)

#Paired or Non-independent Data: 1 Group, 2 Conditions

Use these chunks to create scatterplots for paired or matched data (2 conditions) in one group of subjects. Paired data are when you measure the variable of interest more than one time in each participant. Matched data are when participants in groups 1 and 2 are matched for important characteristics. If your data are independent, please see the chunks above for Independent Data.

The short horizontal black line in the “Difference in Measurement” graph shows the median difference. Medians for each condition are not shown, and should not be calculated. Unlike means, medians are not additive. The median difference does not equal the difference in the medians. Open circles and black lines connecting the circles show paired measurements for each participant or observation.

library(readxl)
# Read the first sheet of the paired-data supplementary workbook and keep
# rows 12-23 of columns 1-3 (subject ID plus the two conditions).
One_group_two_conditions <-
  read_excel("journal.pbio.1002128.s003.XLS", sheet = 1)[12:23, 1:3]
# Assign column names.
names(One_group_two_conditions) <- c("Subject ID", "Condition 1 Name",  "Condition 2 Name")
# Within-subject change: condition 2 minus condition 1.
One_group_two_conditions$difference <- with(
  One_group_two_conditions,
  as.numeric(`Condition 2 Name`) - as.numeric(`Condition 1 Name`)
)

The data in the Excel sheet are in an untidy wide format, so let’s convert them to a tidy long format:

# Wide -> long: stack the two condition columns into `group` (condition
# name) and `value` (measurement); `Subject ID` and `difference` are kept as
# identifier columns and repeated on every row.
library(tidyr)
One_group_two_conditions_long <- gather(
  One_group_two_conditions,
  key = "group",
  value = "value",
  `Condition 1 Name`:`Condition 2 Name`,
  -`Subject ID`,
  -difference,
  convert = TRUE
)

Now we can plot:

# plot
library(ggplot2)
library(gridExtra)

# Left panel: paired measurements; each subject's two conditions are drawn
# as open circles joined by a line.
g1 <- ggplot(One_group_two_conditions_long, aes(group, as.numeric(value), group = `Subject ID`)) +
  geom_point(shape = 1, size = 4) +
  geom_line() +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Right panel: within-subject differences with a crossbar at their median.
# The long table repeats each subject's difference once per condition, so
# every point is drawn twice at the same position; the median is unaffected.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
g2 <- ggplot(One_group_two_conditions_long, aes(x = 1, y = difference)) +
  geom_point(shape = 1, size = 4) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.001, size = 1) +
  xlab("") +
  ylab("Difference in Measurement (units)") +
  theme_minimal(base_size = 16) +
  scale_x_continuous(breaks = NULL) +
  coord_fixed(ratio = 0.0005)

# combine the two plots
grid.arrange(g1, g2, ncol = 2)

#Paired or Non-independent Data: 2 Groups, 2 Conditions

First we read in the data from the spreadsheet:

library(readxl)
# read in data from PLOS Biology article supplementary materials
Two_groups_two_conditions <- read_excel("journal.pbio.1002128.s003.XLS", sheet = 2)
# keep rows 12-41 of columns 2-5 of the second sheet (the measurements)
Two_groups_two_conditions <- Two_groups_two_conditions[12:41, 2:5]
# Name the four measurement columns BEFORE adding the group column. The
# original added `group` first and then assigned four names to five columns,
# which left the fifth column without a name (a names/columns length
# mismatch).
names(Two_groups_two_conditions) <- c("Condition 1 Name A", "Condition 2 Name A",
                                      "Condition 1 Name B", "Condition 2 Name B")
# assign group names: first 15 rows are group 1, last 15 rows are group 2;
# `group` stays the fifth column, as the positional code further down expects
Two_groups_two_conditions$group <- c(rep("Group 1 Name", 15), rep("Group 2 Name", 15))

The data in the Excel sheet are in an unusual structure, so a few steps for reshaping into a tidy form are needed. Here’s how we can tidy them and how they look after being tidied:

# convert to simple long form
# Column 1 becomes condition 1 for all 30 rows: rows 1-15 keep their own
# column 1 values, rows 16-30 take their values from column 3.
Two_groups_two_conditions[,1] <-  unlist(c(Two_groups_two_conditions[1:15,1], Two_groups_two_conditions[16:30,3]))
# Column 2 becomes condition 2 in the same way, taking rows 16-30 from
# column 4.
Two_groups_two_conditions[,2] <-  unlist(c(Two_groups_two_conditions[1:15,2], Two_groups_two_conditions[16:30,4]))
# drop unneeded columns: keep the two merged condition columns and column 5
# (the group label)
Two_groups_two_conditions <- Two_groups_two_conditions[,c(1:2, 5)]
# assign column names
names(Two_groups_two_conditions) <- c("Condition 1", "Condition 2", "Group")
# number the subjects 1-30 in row order
Two_groups_two_conditions$`Subject ID` <- 1:30
# compute differences (condition 2 minus condition 1) per subject
Two_groups_two_conditions$difference <- as.numeric(Two_groups_two_conditions$`Condition 2`)  - as.numeric(Two_groups_two_conditions$`Condition 1`)

# Wide -> long once more: stack the two condition columns into `condition`
# (column name) and `value` (measurement); the other columns are repeated.
library(tidyr)
Two_groups_two_conditions_long <- gather(
  Two_groups_two_conditions,
  key = "condition",
  value = "value",
  c(`Condition 1`, `Condition 2`),
  convert = TRUE
)
kable(Two_groups_two_conditions_long)

Group	Subject ID	difference	condition	value
Group 1 Name	1	8	Condition 1	5
Group 1 Name	2	4	Condition 1	1
Group 1 Name	3	5	Condition 1	7
Group 1 Name	4	2	Condition 1	9
Group 1 Name	5	7	Condition 1	2
Group 1 Name	6	-1	Condition 1	6
Group 1 Name	7	1	Condition 1	4
Group 1 Name	8	3	Condition 1	11
Group 1 Name	9	-2	Condition 1	14
Group 1 Name	10	6	Condition 1	13
Group 1 Name	11	NA	Condition 1	NA
Group 1 Name	12	NA	Condition 1	NA
Group 1 Name	13	NA	Condition 1	NA
Group 1 Name	14	NA	Condition 1	NA
Group 1 Name	15	NA	Condition 1	NA
Group 2 Name	16	-2	Condition 1	20
Group 2 Name	17	-4	Condition 1	13
Group 2 Name	18	1	Condition 1	15
Group 2 Name	19	5	Condition 1	8
Group 2 Name	20	2	Condition 1	3
Group 2 Name	21	1	Condition 1	7
Group 2 Name	22	-7	Condition 1	14
Group 2 Name	23	0	Condition 1	12
Group 2 Name	24	3	Condition 1	11
Group 2 Name	25	1	Condition 1	9
Group 2 Name	26	NA	Condition 1	NA
Group 2 Name	27	NA	Condition 1	NA
Group 2 Name	28	NA	Condition 1	NA
Group 2 Name	29	NA	Condition 1	NA
Group 2 Name	30	NA	Condition 1	NA
Group 1 Name	1	8	Condition 2	13
Group 1 Name	2	4	Condition 2	5
Group 1 Name	3	5	Condition 2	12
Group 1 Name	4	2	Condition 2	11
Group 1 Name	5	7	Condition 2	9
Group 1 Name	6	-1	Condition 2	5
Group 1 Name	7	1	Condition 2	5
Group 1 Name	8	3	Condition 2	14
Group 1 Name	9	-2	Condition 2	12
Group 1 Name	10	6	Condition 2	19
Group 1 Name	11	NA	Condition 2	NA
Group 1 Name	12	NA	Condition 2	NA
Group 1 Name	13	NA	Condition 2	NA
Group 1 Name	14	NA	Condition 2	NA
Group 1 Name	15	NA	Condition 2	NA
Group 2 Name	16	-2	Condition 2	18
Group 2 Name	17	-4	Condition 2	9
Group 2 Name	18	1	Condition 2	16
Group 2 Name	19	5	Condition 2	13
Group 2 Name	20	2	Condition 2	5
Group 2 Name	21	1	Condition 2	8
Group 2 Name	22	-7	Condition 2	7
Group 2 Name	23	0	Condition 2	12
Group 2 Name	24	3	Condition 2	14
Group 2 Name	25	1	Condition 2	10
Group 2 Name	26	NA	Condition 2	NA
Group 2 Name	27	NA	Condition 2	NA
Group 2 Name	28	NA	Condition 2	NA
Group 2 Name	29	NA	Condition 2	NA
Group 2 Name	30	NA	Condition 2	NA

Now we can plot:

# plot
library(ggplot2)
# Left panel: paired measurements per subject (open circles joined by a
# line), one facet per group.
g1 <- ggplot(Two_groups_two_conditions_long, aes(condition, as.numeric(value), group = `Subject ID`)) +
  geom_point(size = 4, shape = 1) +
  geom_line() +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16) +
  facet_grid(~Group) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Right panel: within-subject differences by group, with a crossbar at each
# group's median difference. The long table repeats each subject's
# difference once per condition, so every point is drawn twice at the same
# position; the medians are unaffected.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which were
# deprecated in ggplot2 3.3.0.
g2 <- ggplot(Two_groups_two_conditions_long, aes(Group, difference)) +
  geom_point(size = 4, shape = 1) +
  stat_summary(fun = median, fun.min = median, fun.max = median,
               geom = "crossbar", width = 0.3) +
  xlab("") +
  ylab("Difference in Measurement (units)") +
  theme_minimal(base_size = 16) +
  coord_fixed(ratio = 0.15)

# combine the two plots
grid.arrange(g1, g2, ncol = 2)

Here’s a summary of the computational environment that these plots were produced in. If you want to reproduce the plots here you’ll need a similar version of R and the packages listed below:

# Print the R version, platform, locale and attached/loaded package versions.
sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Chinese (Traditional)_Taiwan.950 
## [2] LC_CTYPE=Chinese (Traditional)_Taiwan.950   
## [3] LC_MONETARY=Chinese (Traditional)_Taiwan.950
## [4] LC_NUMERIC=C                                
## [5] LC_TIME=Chinese (Traditional)_Taiwan.950    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3     readxl_1.3.1      printr_0.2        knitr_1.36       
##  [5] tidyr_1.1.4       readr_2.0.1       ggalluvial_0.12.3 dplyr_1.0.7      
##  [9] magrittr_2.0.1    ggplot2_3.3.5    
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1 xfun_0.26        purrr_0.3.4      splines_4.1.2   
##  [5] lattice_0.20-45  colorspace_2.0-2 vctrs_0.3.8      generics_0.1.0  
##  [9] htmltools_0.5.2  yaml_2.2.1       mgcv_1.8-38      utf8_1.2.2      
## [13] rlang_0.4.11     jquerylib_0.1.4  pillar_1.6.3     glue_1.4.2      
## [17] withr_2.4.2      DBI_1.1.1        lifecycle_1.0.1  stringr_1.4.0   
## [21] munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0 evaluate_0.14   
## [25] labeling_0.4.2   tzdb_0.1.2       fastmap_1.1.0    fansi_0.5.0     
## [29] highr_0.9        Rcpp_1.0.7       scales_1.1.1     farver_2.1.0    
## [33] hms_1.1.1        digest_0.6.28    stringi_1.7.4    grid_4.1.2      
## [37] tools_4.1.2      tibble_3.1.5     pacman_0.5.1     crayon_1.4.1    
## [41] pkgconfig_2.0.3  ellipsis_0.3.2   Matrix_1.3-4     assertthat_0.2.1
## [45] rmarkdown_2.11   R6_2.5.1         nlme_3.1-153     compiler_4.1.2

Exercises 3

The dataset consists of a sample of 14 primary school children between 8 and 12 years old. The children were asked to respond on 8 emotion and coping-strategy scales for each of 6 situations: failing to fulfill assignments in class, not being allowed to play with other children, being forbidden to do something by the teacher, being a victim of bullying, having too much school work, and being forbidden to do something by the mother. Plot the data in some meaningful ways. You may have to manipulate the data into a different format first.

Data

Column 1: Unpleasant (Annoy) Column 2: Sad Column 3: Afraid Column 4: Angry Column 5: Approach coping Column 6: Avoidant coping Column 7: Social support seeking Column 8: Emotional reaction, especially aggression Column 9: Situation ID Column 10: Children ID Source: Roeder, I., Boekaerts, M., & Kroonenberg, P. M. (2002). The stress and coping questionnaire for children (School version and Asthma version): Construction, factor structure, and psychometric properties. Psychological Reports, 91, 29-36.

# Read the coping data; the first line of the file holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
Edat3 <- read.table("coping.txt", header = TRUE)
head(Edat3)

annoy	sad	afraid	angry	approach	avoid	support	agressive	situation	sbj
4	2	2	2	1.00	2.00	1.00	2.50	Fail	S2
4	4	4	2	4.00	3.00	1.25	1.50	NoPart	S2
2	2	2	2	2.67	3.00	1.00	2.33	TeacNo	S2
4	3	4	4	4.00	1.50	3.25	1.00	Bully	S2
4	2	1	1	1.00	2.75	1.25	1.50	Work	S2
4	3	1	4	2.33	2.50	1.00	3.67	MomNo	S2

# Show the dimensions and the type of each column.
str(Edat3)

## 'data.frame':    84 obs. of  10 variables:
##  $ annoy    : int  4 4 2 4 4 4 3 3 3 4 ...
##  $ sad      : int  2 4 2 3 2 3 2 1 1 4 ...
##  $ afraid   : int  2 4 2 4 1 1 2 1 1 2 ...
##  $ angry    : int  2 2 2 4 1 4 2 2 2 1 ...
##  $ approach : num  1 4 2.67 4 1 2.33 2 1.33 1 1.67 ...
##  $ avoid    : num  2 3 3 1.5 2.75 2.5 1 4 1 4 ...
##  $ support  : num  1 1.25 1 3.25 1.25 1 1.5 2.75 1.33 3.5 ...
##  $ agressive: num  2.5 1.5 2.33 1 1.5 3.67 1 2 1.67 2.5 ...
##  $ situation: chr  "Fail" "NoPart" "TeacNo" "Bully" ...
##  $ sbj      : chr  "S2" "S2" "S2" "S2" ...

# Per-column summary statistics.
summary(Edat3)

annoy	sad	afraid	angry	approach	avoid	support	agressive	situation	sbj
Min. :1.000	Min. :1.00	Min. :1.000	Min. :1.000	Min. :1.000	Min. :1.000	Min. :0.000	Min. :1.000	Length:84	Length:84
1st Qu.:2.000	1st Qu.:1.00	1st Qu.:1.000	1st Qu.:1.000	1st Qu.:1.670	1st Qu.:1.750	1st Qu.:1.330	1st Qu.:1.000	Class :character	Class :character
Median :3.000	Median :2.00	Median :1.000	Median :2.000	Median :2.000	Median :2.500	Median :2.000	Median :1.500	Mode :character	Mode :character
Mean :2.762	Mean :1.81	Mean :1.405	Mean :2.131	Mean :2.236	Mean :2.398	Mean :1.959	Mean :1.542	NA	NA
3rd Qu.:4.000	3rd Qu.:2.00	3rd Qu.:2.000	3rd Qu.:2.250	3rd Qu.:3.000	3rd Qu.:3.000	3rd Qu.:2.373	3rd Qu.:1.670	NA	NA
Max. :4.000	Max. :4.00	Max. :4.000	Max. :9.000	Max. :4.000	Max. :4.000	Max. :4.000	Max. :4.000	NA	NA

# Reshape the data from wide to long with reshape(): the eight rating columns
# are stacked into a single `score` column and `Emotion` records which scale
# each score came from.
emotion_vars <- c("annoy", "sad", "afraid", "angry",
                  "approach", "avoid", "support", "agressive")

Edat3_long <- reshape(Edat3,
  direction = "long",
  varying = emotion_vars,
  v.names = "score",
  timevar = "Emotion",
  times = emotion_vars,
  # One sequential row name per long-format row, sized from the data rather
  # than a hard-coded 1:1000 that silently assumes at most 1000 rows.
  new.row.names = seq_len(nrow(Edat3) * length(emotion_vars)))

# Preview the first 11 rows of the long-format data.
head(as.data.frame(Edat3_long), 11)

situation	sbj	Emotion	score	id
Fail	S2	annoy	4	1
NoPart	S2	annoy	4	2
TeacNo	S2	annoy	2	3
Bully	S2	annoy	4	4
Work	S2	annoy	4	5
MomNo	S2	annoy	4	6
Fail	S17	annoy	3	7
NoPart	S17	annoy	3	8
TeacNo	S17	annoy	3	9
Bully	S17	annoy	4	10
Work	S17	annoy	4	11

# Scores on each scale, one panel per situation, panels arranged in 3 rows.
ggplot(Edat3_long, aes(Emotion, score, group = Emotion)) +
  geom_point(aes(colour = Emotion)) +
  # facet_wrap() (used earlier) splits the plot into panels by one variable
  facet_wrap(vars(situation), nrow = 3) +
  xlab("Emotion") +
  ylab("Score") +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 30, size = 8, hjust = 1.5))

# Same plot with all situation panels in a single row, to compare the
# distributions side by side.
ggplot(Edat3_long, aes(Emotion, score, group = Emotion)) +
  geom_point(aes(colour = Emotion)) +
  facet_wrap(vars(situation), nrow = 1) +
  xlab("Emotion") +
  ylab("Score") +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1.5))

# Work in progress (to be revisited): mean "annoy" rating per situation with
# +/- 1 standard-error bars.
# NOTE(review): the original chunk could not run (curly quotes, a comment that
# swallowed the rest of the statement, `annoy` looked up in the long-format
# data, no x aesthetic). This reconstruction summarises the wide data by
# situation; confirm that this is the intended plot.
library(dplyr)

pd <- position_dodge(.3)

p <- na.omit(Edat3) %>%
  group_by(situation) %>%
  summarize(m_p = mean(annoy),
            se_p = sd(annoy) / sqrt(n()),
            .groups = "drop") %>%
  ggplot() +
  # group = 1 puts every situation in one group so geom_line() can connect
  # the means
  aes(situation, m_p, group = 1) +
  geom_errorbar(aes(ymin = m_p - se_p, ymax = m_p + se_p),
                width = .2, size = .5, position = pd) +
  geom_line(position = pd, linetype = "dotted") +
  # a larger size makes the points more prominent
  geom_point(aes(shape = situation), position = pd, size = rel(3)) +
  scale_shape(guide = guide_legend(title = "Situation")) +
  labs(x = "Situation", y = "Unpleasant (Annoy)") +
  theme(legend.position = c(.1, .5))
p

Exercises 4

Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories; food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page. You might want to transform the dollar amounts to log base 10 unit first.

# Load the US personal expenditure matrix and show it as a table.
data("USPersonalExpenditure", package = "datasets")
Edat4 <- USPersonalExpenditure
knitr::kable(head(Edat4))

	1940	1945	1950	1955	1960
Food and Tobacco	22.200	44.500	59.60	73.2	86.80
Household Operation	10.500	15.500	29.00	36.5	46.20
Medical and Health	3.530	5.760	9.71	14.0	21.10
Personal Care	1.040	1.980	2.45	3.4	5.40
Private Education	0.341	0.974	1.80	2.6	3.64

# Show the structure of the object: a numeric matrix with dimnames.
str(Edat4)

##  num [1:5, 1:5] 22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "Food and Tobacco" "Household Operation" "Medical and Health" "Personal Care" ...
##   ..$ : chr [1:5] "1940" "1945" "1950" "1955" ...

# Convert to log base 10 units, rounded to four decimal places.
log10_Edat4 <- Edat4 |>
  log10() |>
  round(digits = 4)
log10_Edat4

	1940	1945	1950	1955	1960
Food and Tobacco	1.3464	1.6484	1.7752	1.8645	1.9385
Household Operation	1.0212	1.1903	1.4624	1.5623	1.6646
Medical and Health	0.5478	0.7604	0.9872	1.1461	1.3243
Personal Care	0.0170	0.2967	0.3892	0.5315	0.7324
Private Education	-0.4672	-0.0114	0.2553	0.4150	0.5611

library(reshape2)

# Melt the matrix into long format: one row per item/year combination.
# melt() is taken from reshape2, the package loaded above, instead of the
# older `reshape` package that the chunk never loads.
log_Edat4_long <- reshape2::melt(log10_Edat4)
names(log_Edat4_long) <- c("Items", "Year", "US_Dollar")

# Preview the first 10 rows of the long-format data.
head(as.data.frame(log_Edat4_long), 10)

Items	Year	US_Dollar
Food and Tobacco	1940	1.3464
Household Operation	1940	1.0212
Medical and Health	1940	0.5478
Personal Care	1940	0.0170
Private Education	1940	-0.4672
Food and Tobacco	1945	1.6484
Household Operation	1945	1.1903
Medical and Health	1945	0.7604
Personal Care	1945	0.2967
Private Education	1945	-0.0114

# Plot the log10 values against year, one panel per expenditure item.
ggplot(log_Edat4_long,
       aes(Year, US_Dollar, group = Items, colour = Items)) +
  geom_line() +
  geom_point() +
  facet_wrap(vars(Items), nrow = 1) +
  xlab("Year") +
  ylab("US Dollar") +
  theme(axis.text.x = element_text(angle = 45, size = 10, hjust = 1))

# Show all items together in a single panel.
ggplot(log_Edat4_long,
       aes(Year, US_Dollar, group = Items, colour = Items)) +
  geom_line() +
  geom_point() +
  xlab("Year") +
  ylab("US Dollar") +
  theme(axis.text.x = element_text(size = 12, hjust = 1.5))

# Lollipop plot: a stem from zero to each value, one panel per year.
ggplot(log_Edat4_long, aes(US_Dollar, Items)) +
  geom_vline(xintercept = 0) +
  geom_segment(aes(xend = 0, yend = Items)) +
  geom_point(aes(colour = Items)) +
  facet_wrap(vars(Year), nrow = 1) +
  xlab("US_Dollar") +
  ylab("Items") +
  theme(axis.text.x = element_text(angle = 30, size = 8, hjust = 1))

Exercises 5

Use the Cushings{MASS} data set to generate a plot

# ggrepel is a package that keeps text labels from overlapping.
library("ggrepel")

data("Cushings", package = "MASS")
Edat5 <- Cushings
head(Edat5)

	Tetrahydrocortisone	Pregnanetriol	Type
a1	3.1	11.70	a
a2	3.0	1.30	a
a3	1.9	0.10	a
a4	3.8	0.04	a
a5	4.1	1.10	a
a6	1.9	0.40	a

# Show the dimensions and the type of each column.
str(Edat5)

## 'data.frame':    27 obs. of  3 variables:
##  $ Tetrahydrocortisone: num  3.1 3 1.9 3.8 4.1 1.9 8.3 3.8 3.9 7.8 ...
##  $ Pregnanetriol      : num  11.7 1.3 0.1 0.04 1.1 0.4 1 0.2 0.6 1.2 ...
##  $ Type               : Factor w/ 4 levels "a","b","c","u": 1 1 1 1 1 1 2 2 2 2 ...

# Recode Type into a factor with descriptive labels, with the levels ordered
# u, b, c, a.
Edat5[["Typef"]] <- factor(
  Edat5[["Type"]],
  levels = c("u", "b", "c", "a"),
  labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")
)

head(Edat5)

	Tetrahydrocortisone	Pregnanetriol	Type	Typef
a1	3.1	11.70	a	Adenoma
a2	3.0	1.30	a	Adenoma
a3	1.9	0.10	a	Adenoma
a4	3.8	0.04	a	Adenoma
a5	4.1	1.10	a	Adenoma
a6	1.9	0.40	a	Adenoma

# Set up the label column first. Start with NA in every row so the indexed
# assignment below does not depend on the largest index being equal to
# nrow(Edat5), then label one row of each type.
Edat5$text <- NA_character_
Edat5$text[c(1, 13, 21, 27)] <- c("Adenoma", "Bilateral Hyperplasia", "Carcinoma", "Unknown")

# Draw the plot using the label column created beforehand.
# `fill` uses Typef (not Type) so the point fill and the label colour share
# the same level order and therefore the same default colour for each type.
ggplot(Edat5, aes(Tetrahydrocortisone, Pregnanetriol, fill = Typef)) +
  # point shape 21 is a circle that can take a fill colour
  geom_point(pch = 21, size = rel(2)) +
  # only one row per type has a label; drop the NA labels without a warning
  geom_text_repel(aes(label = text, color = Typef), na.rm = TRUE) +
  # set the breaks on the x and y axes
  scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 10, 12)) +
  scale_x_continuous(limits = c(0, 60), breaks = c(0, 10, 20, 30, 40, 50, 60)) +
  # plot and axis titles
  labs(x = "Tetrahydrocortisone (mg/24 hours)",
       y = "Pregnanetriol (mg/24 hours)",
       title = "Cushing's syndrome") +
  # right-align the title; "none" is the documented value that hides the
  # legend (an empty string is not a valid legend position)
  theme(plot.title = element_text(hjust = 1),
        legend.position = "none")

# Label every observation with its row name, using ggrepel for the labels.
ggplot(Edat5, aes(Tetrahydrocortisone, Pregnanetriol, label = rownames(Edat5))) +
  geom_point(aes(color = Typef)) +
  geom_text_repel() +  # keeps the labels from overlapping
  labs(x = "Tetrahydrocortisone (mg/24 hours)",
       y = "Pregnanetriol (mg/24 hours)",
       title = "Cushing's syndrome") +
  theme(plot.title = element_text(hjust = 1))

20211129_Grammar of graphics_SCL

Shang Chi Lee

2021/11/29