Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.
# Load the built-in `women` data set (the height and weight columns are
# used below).
data(women)
## Base graphics: draw an empty canvas (type = "n") scaled to the whole data
## set, then add only the first row as a point.
plot(women, type = "n")
points(women[1, ])
## lattice: draw the same single point with xyplot(); `subset` keeps only the
## row whose row name is 1 and type = "p" asks for points.
lattice::xyplot(weight ~ height,
  data = women,
  subset = row.names(women) == 1, type = "p"
)
## ggplot2: components are chained with `+`; aes() maps columns to the x and y
## positions and geom_point() draws the observations as points.
library(ggplot2)
ggplot(data = women[1, ], aes(height, weight)) +
  geom_point()
## Base graphics draw as a side effect; try to keep the result in w1.
w1 <- plot(women, type = "n")
## Nothing was stored in w1.
w1
## NULL
## A lattice plot is returned as an object, so it can be stored in w2 ...
w2 <- lattice::xyplot(weight ~ height, data = women)
## ... and drawn later by printing it.
w2
## The base plot left w1 empty.
class(w1)
## [1] "NULL"
## w2 is an object of class "trellis".
class(w2)
## [1] "trellis"
## List the methods available for trellis objects.
methods(class = "trellis")
## [1] [ dim dimnames dimnames<- plot print summary
## [8] t update
## see '?methods' for accessing help and source code
#總結:以上了解三種畫圖型態的比較,base、lattice、ggplot2,個人喜歡ggplot2,用+來代表後面所要增加的細部工作,覺得很直覺使用上很方便。
The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are the scores on a language test and on an arithmetic test. The research interest is in how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.
The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below.
Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.
Data
Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score
# Read the language/arithmetic test data (text file with a header row) into
# dat and preview the first rows.
dat <- read.table("langMathDutch.txt", header = TRUE)
head(dat)
## school pupil IQV size lang arith
## 1 1 17001 15.0 29 46 24
## 2 1 17002 14.5 29 45 19
## 3 1 17003 9.5 29 33 24
## 4 1 17004 11.0 29 46 26
## 5 1 17005 8.0 29 20 9
## 6 1 17006 9.5 29 30 13
# Show the structure of dat: dimensions, column types and leading values.
str(dat)
## 'data.frame': 2287 obs. of 6 variables:
## $ school: int 1 1 1 1 1 1 1 1 1 1 ...
## $ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ IQV : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ size : int 29 29 29 29 29 29 29 29 29 29 ...
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ arith : int 24 19 24 26 9 13 13 30 23 22 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(dat)
## school pupil IQV size lang
## Min. : 1 Min. : 17001 Min. : 4.00 Min. : 5.0 Min. : 9.00
## 1st Qu.: 67 1st Qu.: 677013 1st Qu.:10.50 1st Qu.:17.0 1st Qu.:35.00
## Median :141 Median :1417007 Median :12.00 Median :24.0 Median :42.00
## Mean :133 Mean :1337225 Mean :11.83 Mean :23.1 Mean :40.93
## 3rd Qu.:195 3rd Qu.:1957204 3rd Qu.:13.00 3rd Qu.:28.0 3rd Qu.:48.00
## Max. :258 Max. :2587010 Max. :18.00 Max. :37.0 Max. :58.00
## arith
## Min. : 2.00
## 1st Qu.:14.00
## Median :20.00
## Mean :19.44
## 3rd Qu.:25.00
## Max. :30.00
## Look at the 33%, 66% and 99% quantiles of IQV to choose cut points that
## split it into three roughly equal groups.
quantile(dat$IQV, c(0.33, 0.66, 0.99))
## 33% 66% 99%
## 11.0 12.5 17.0
## Store the three-level ordered factor as the new variable IQVf.
dat$IQVf <- with(dat, cut(IQV,
  ordered_result = TRUE, breaks = c(0, 11, 12.5, 18),
  labels = c("Low", "Middle", "High")
))
## Same approach for class size: inspect the 33%, 66% and 99% quantiles.
quantile(dat$size, c(0.33, 0.66, 0.99))
## 33% 66% 99%
## 20 27 37
## Store the three-level ordered factor as the new variable sizef.
dat$sizef <- with(dat, cut(size,
  ordered_result = TRUE, breaks = c(0, 20, 27, 50),
  labels = c("Small", "Middle", "High")
))
## Check the data again.
head(dat)
## school pupil IQV size lang arith IQVf sizef
## 1 1 17001 15.0 29 46 24 High High
## 2 1 17002 14.5 29 45 19 High High
## 3 1 17003 9.5 29 33 24 Low High
## 4 1 17004 11.0 29 46 26 Low High
## 5 1 17005 8.0 29 20 9 Low High
## 6 1 17006 9.5 29 30 13 Low High
# Arithmetic vs. language score with a smoother, drawn in one facet_wrap()
# panel per sizef/IQVf combination.
ggplot(dat, aes(x = lang, y = arith)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Language score", y = "Arithmetic score") +
  facet_wrap(. ~ sizef + IQVf)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# facet_grid() can split by two or more categorical variables, along rows or
# columns. Here both variables are on the right-hand side of the formula, so
# the panels are laid out horizontally.
ggplot(dat, aes(x = lang, y = arith)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Language score", y = "Arithmetic score") +
  facet_grid(. ~ sizef + IQVf)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##總結:facet的這個部分總共有facet_grid, facet_null, 及facet_wrap三個形態,常用facet_grid。
A sample of 158 children with autism spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.
Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.
Data: autism{WWGbook}
Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID
library(ggplot2)
library(magrittr)
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
# Load the autism data from the WWGbook package, keep a working copy in dat3
# and preview the first rows.
data(autism, package = "WWGbook")
dat3 <- autism
head(dat3)
## age vsae sicdegp childid
## 1 2 6 3 1
## 2 3 7 3 1
## 3 5 18 3 1
## 4 9 25 3 1
## 5 13 27 3 1
## 6 2 17 3 3
# Show the structure of dat3: dimensions, column types and leading values.
str(dat3)
## 'data.frame': 612 obs. of 4 variables:
## $ age : int 2 3 5 9 13 2 3 5 9 13 ...
## $ vsae : int 6 7 18 25 27 17 18 12 18 24 ...
## $ sicdegp: int 3 3 3 3 3 3 3 3 3 3 ...
## $ childid: int 1 1 1 1 1 3 3 3 3 3 ...
## Recode sicdegp (1, 2, 3) as the labelled factor sicdegpf (L, M, H).
dat3$sicdegpf <- factor(dat3$sicdegp, levels = c(1, 2, 3), labels = c("L", "M", "H"))
head(dat3)
## age vsae sicdegp childid sicdegpf
## 1 2 6 3 1 H
## 2 3 7 3 1 H
## 3 5 18 3 1 H
## 4 9 25 3 1 H
## 5 13 27 3 1 H
## 6 2 17 3 3 H
# VSAE against mean-centred age: one line per child plus a linear fit, in one
# panel per sicdegpf level.
ggplot(dat3, aes(
  x = scale(age, center = TRUE, scale = FALSE), # centre age, keep its unit
  y = vsae
)) +
  geom_line(aes(group = childid)) + # one trajectory per child
  geom_point(alpha = .5) + # alpha sets the point transparency
  # geom_smooth() (or stat_smooth()) with method = "lm" overlays a linear
  # regression line; se = TRUE adds its confidence band.
  geom_smooth(method = "lm", se = TRUE) +
  facet_grid(. ~ sicdegpf) +
  labs(x = "Age (in years, centered)", y = "VSAE score") +
  theme_minimal() # built-in minimal theme
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
# Shift age so that age 2 becomes 0.
dat3$Age2 <- dat3$age - 2
# Dodge the groups horizontally so their error bars do not sit on top of
# each other.
pd <- position_dodge(.3)
# Mean VSAE and its standard error for every group/age combination, drawn as
# points joined by dotted lines with +/- 1 SE error bars.
p <- na.omit(dat3) %>% # na.omit() drops rows containing NA; %>% is from magrittr
  group_by(sicdegpf, Age2) %>%
  summarize( # from dplyr
    m_p = mean(vsae),
    se_p = sd(vsae) / sqrt(n()),
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(Age2, m_p,
    group = sicdegpf,
    shape = sicdegpf
  ) +
  geom_errorbar(aes(ymin = m_p - se_p, ymax = m_p + se_p),
    width = .2, size = .3,
    position = pd
  ) +
  geom_line(position = pd, linetype = "dotted") +
  geom_point(position = pd, size = rel(3)) + # larger value draws bigger points
  scale_shape(guide = guide_legend(title = "Group")) +
  labs(x = "Age (in years - 2)", y = "VSAE score") +
  theme(legend.position = c(.1, .9))
p
Use the diabetes dataset to generate a plot similar to the one below and interpret the plot.
# Load the alluvial-plot package ggalluvial through pacman.
pacman::p_load(ggalluvial)
# Read the diabetes data and preview the first rows.
dat4 <- read.csv("diabetes_mell.csv", header = TRUE)
head(dat4)
## SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
## 1 51624 1 3 2 32.22 Males White No Overweight
## 2 51626 1 4 2 22.00 Males Black No Normal weight
## 3 51627 1 4 2 18.22 Males Black No Normal weight
## 4 51628 2 4 1 42.39 Females Black Yes Overweight
## 5 51629 1 1 2 32.61 Males Hispanic No Overweight
## 6 51630 2 3 2 30.57 Females White No Overweight
# Show the structure of dat4: dimensions, column types and leading values.
str(dat4)
## 'data.frame': 8706 obs. of 9 variables:
## $ SEQN : int 51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
## $ RIAGENDR: int 1 1 1 2 1 2 1 1 1 1 ...
## $ RIDRETH1: int 3 4 4 4 1 3 2 3 1 3 ...
## $ DIQ010 : int 2 2 2 1 2 2 2 2 2 1 ...
## $ BMXBMI : num 32.2 22 18.2 42.4 32.6 ...
## $ gender : chr "Males" "Males" "Males" "Females" ...
## $ race : chr "White" "Black" "Black" "Black" ...
## $ diabetes: chr "No" "No" "No" "Yes" ...
## $ BMI : chr "Overweight" "Normal weight" "Normal weight" "Overweight" ...
## Tidy the data: cross-tabulate gender, race, diabetes and BMI, then turn the
## table into a data frame with one row per combination and its count (Freq).
dat4_v1 <- data.frame(with(
  dat4[, c("gender", "race", "diabetes", "BMI")],
  xtabs(~ gender + race + diabetes + BMI)
))
head(dat4_v1)
## gender race diabetes BMI Freq
## 1 Females Black No Normal weight 347
## 2 Males Black No Normal weight 429
## 3 Females Hispanic No Normal weight 712
## 4 Males Hispanic No Normal weight 706
## 5 Females White No Normal weight 998
## 6 Males White No Normal weight 873
# Alluvial plot: race, diabetes and BMI are the three axes, flows are sized by
# Freq and filled by gender.
ggplot(
  dat4_v1,
  aes(axis1 = race, axis2 = diabetes, axis3 = BMI, y = Freq)
) +
  geom_alluvium(aes(fill = gender)) + # fill the flows by gender
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(
    limits = c("race", "diabetes", "BMI"),
    expand = c(.1, .5)
  ) +
  scale_fill_manual(values = c("darkgreen", "hotpink3")) + # explicit colours
  labs(
    x = "Health and Behavior between Races",
    y = "Count",
    title = "Subjects stratified by race, diabetes, BMI, and gender"
  ) +
  theme_minimal()
Fifty male and fifty female students fill out the same questionnaire in weekly intervals starting five weeks before an important examination to measure state anxiety.
The research interests are: 1. whether there is a gender difference in state anxiety; 2. individual differences in state anxiety. Explore the answers to both questions with plots involving confidence intervals or error bars for the means.
Source: Von Eye, A., & Schuster C. (1998). Regression Analysis for Social Sciences. San Diego: Academic Press.
# Load the plotting, reading and reshaping packages through pacman.
pacman::p_load(ggplot2, readr, tidyr)
# Read the state-anxiety data (text file with a header row) and show the first
# rows as a table.
Edat1 <- read.table("stateAnxiety.txt", header = TRUE)
head(Edat1) |> knitr::kable()
f1 | f2 | f3 | f4 | f5 | m1 | m2 | m3 | m4 | m5 |
---|---|---|---|---|---|---|---|---|---|
13 | 17 | 18 | 20 | 24 | 6 | 14 | 22 | 20 | 24 |
26 | 31 | 33 | 38 | 42 | 4 | 11 | 14 | 12 | 23 |
13 | 17 | 24 | 29 | 32 | 17 | 25 | 26 | 29 | 38 |
22 | 24 | 26 | 27 | 29 | 19 | 22 | 26 | 30 | 34 |
18 | 19 | 19 | 22 | 30 | 12 | 21 | 21 | 23 | 24 |
32 | 31 | 30 | 31 | 32 | 11 | 16 | 20 | 19 | 22 |
# Show the structure of Edat1: dimensions, column types and leading values.
str(Edat1)
## 'data.frame': 50 obs. of 10 variables:
## $ f1: int 13 26 13 22 18 32 16 18 14 20 ...
## $ f2: int 17 31 17 24 19 31 16 22 17 19 ...
## $ f3: int 18 33 24 26 19 30 21 25 23 23 ...
## $ f4: int 20 38 29 27 22 31 27 29 21 25 ...
## $ f5: int 24 42 32 29 30 32 30 35 25 28 ...
## $ m1: int 6 4 17 19 12 11 14 9 12 11 ...
## $ m2: int 14 11 25 22 21 16 23 18 16 13 ...
## $ m3: int 22 14 26 26 21 20 26 20 23 17 ...
## $ m4: int 20 12 29 30 23 19 29 20 26 14 ...
## $ m5: int 24 23 38 34 24 22 33 24 32 20 ...
# Reshape from wide (columns f1..f5, m1..m5) to long: the column name goes to
# Week and the value to Score.
Edat1_long <- Edat1 %>%
  pivot_longer(
    cols = starts_with(c("f", "m")),
    names_to = "Week",
    values_to = "Score"
  ) |>
  # mutate() is from dplyr: derive Gender, a numeric Week and an ID column.
  mutate(
    Gender = gsub("[0-9]", "", Week), # letter part of the column name
    Week = parse_number(Week), # digit part of the column name
    ID = rep(1:100, each = 5) # each run of 5 consecutive rows shares an ID
  )
# Preview the first 11 rows as a plain data frame.
Edat1_long |>
  as.data.frame() |>
  head(11)
## Week Score Gender ID
## 1 1 13 f 1
## 2 2 17 f 1
## 3 3 18 f 1
## 4 4 20 f 1
## 5 5 24 f 1
## 6 1 6 m 2
## 7 2 14 m 2
## 8 3 22 m 2
## 9 4 20 m 2
## 10 5 24 m 2
## 11 1 26 f 3
## Ruining's approach: stack() combines the separate columns into a single
## `values` column, with `ind` recording the column each value came from.
long <- stack(Edat1)
head(long)
## values ind
## 1 13 f1
## 2 26 f1
## 3 13 f1
## 4 22 f1
## 5 18 f1
## 6 32 f1
# Report the underlying storage type of the long data.
typeof(Edat1_long)
## [1] "list"
## Individual anxiety trajectories with both genders side by side. (Author's
## reading of the plot: females are more anxious than males.)
ggplot(Edat1_long, aes(Week, Score, group = ID)) +
  geom_line(aes(color = Gender)) +
  geom_point() +
  labs(x = "weeks before an important examination", y = "Anxiety score") +
  theme_minimal() +
  facet_wrap(~Gender) # as used earlier: facet by a single variable
## Gender differences: boxes grouped by week, one panel per gender.
ggplot(Edat1_long, aes(Week, Score, group = Week, color = Gender)) +
  geom_boxplot() +
  facet_wrap(~Gender)
## Individual differences within each gender: boxes grouped by subject ID.
ggplot(Edat1_long, aes(Week, Score, group = ID, color = Gender)) +
  geom_boxplot() +
  facet_wrap(~Gender)
Use the markdown file to replicate the contents of Weissgerber, T.L., Milic, N.M., Winham, S.J., Garovic, V.D. (2015). Beyond Bar and Line Graphs: Time for a New Data Presentation Paradigm. PLOS Biology , 13. The two data sets are here provided: journal.pbio.1002128.s002.XLS and journal.pbio.1002128.s003.XLS . You can also download everything in a zip file from this location.
#Interpreting the plots
The short horizontal black lines in the plots show the median values. The mean should not be shown for data that are analyzed non-parametrically, as these data do not meet the distributional assumptions required to calculate the mean. If your data meet the assumptions required for parametric testing the graphs can be changed to show the mean instead of the median. To use the mean with the code below, simply replace “median” with “mean” where it occurs in the chunks below. Open circles show measurements for each participant or observation.
## Registered S3 method overwritten by 'printr':
## method from
## knit_print.data.frame rmarkdown
#Independent data, points not jittered (no overlapping points)
Use these next few chunks of code to create scatterplots for independent data in two to five groups, when there are no overlapping points within any group. Independent data means that the variable of interest is measured one time in each subject, and subjects are not related to each other. If your data do not meet these criteria, see the sections further below for paired or non-independent data. Overlapping points means that two subjects have values that are so close that they will overlap on the graph and you will not be able to see both points clearly. If your data have overlapping points, use the code in the “Points Jittered” section below.
First step is to read in the data from Excel spreadsheet (which needs to be in the same folder as this source file, or else you need to specify the full path to the file on your computer), then extract the specific rows and columns that contain the data to be plotted. The table below shows the format of the data in the Excel sheet. This chunk below can be adapted to read data from any Excel file, just change the file name and adapt the subsetting line so that it captures the relevant rows and columns in the Excel file.
library(readxl)
# read in data from PLOS Biology article supplementary materials
independent_data <- read_excel("journal.pbio.1002128.s002.XLSX", sheet = 1)
# subset just groups 1-5 from the 'No overlapping points' sheet
independent_data <- independent_data[15:30, 2:6]
# assign column names (the first row of the subset holds them)
names(independent_data) <- independent_data[1, ]
# remove first row with column names
independent_data <- independent_data[-1, ]
# show the resulting wide table
kable(independent_data)
Group 1 | Group 2 | Group 3 | Group 4 | Group 5 |
---|---|---|---|---|
5 | 7 | 9 | 42 | 2 |
3 | 3 | 7 | 2 | 0 |
6 | 9 | 10 | 5 | 3 |
8 | 10 | 12 | 55 | 5 |
10 | 33 | 14 | 9 | 7 |
13 | 15 | 17 | 12 | 10 |
1 | 18 | 20 | 15 | 13 |
4 | 6 | 40 | 3 | 1 |
18 | 20 | 22 | NA | 15 |
4 | 30 | 35 | NA | 1 |
7 | NA | 42 | NA | 4 |
9 | NA | 13 | NA | 6 |
14 | NA | NA | NA | 11 |
15 | NA | NA | NA | 12 |
17 | NA | NA | NA | 14 |
An important step is reshaping the data from their current wide format to a more tidy long format. Long formats are most useful for plotting and statistical analysis in R. Here’s what the data look like in the long format:
# reshape for plotting: stack the five group columns into a `group` column
# (former column name) and a `value` column
library(tidyr)
independent_data_long <- gather(independent_data, group, value, `Group 1`:`Group 5`, convert = TRUE)
kable(independent_data_long)
group | value |
---|---|
Group 1 | 5 |
Group 1 | 3 |
Group 1 | 6 |
Group 1 | 8 |
Group 1 | 10 |
Group 1 | 13 |
Group 1 | 1 |
Group 1 | 4 |
Group 1 | 18 |
Group 1 | 4 |
Group 1 | 7 |
Group 1 | 9 |
Group 1 | 14 |
Group 1 | 15 |
Group 1 | 17 |
Group 2 | 7 |
Group 2 | 3 |
Group 2 | 9 |
Group 2 | 10 |
Group 2 | 33 |
Group 2 | 15 |
Group 2 | 18 |
Group 2 | 6 |
Group 2 | 20 |
Group 2 | 30 |
Group 2 | NA |
Group 2 | NA |
Group 2 | NA |
Group 2 | NA |
Group 2 | NA |
Group 3 | 9 |
Group 3 | 7 |
Group 3 | 10 |
Group 3 | 12 |
Group 3 | 14 |
Group 3 | 17 |
Group 3 | 20 |
Group 3 | 40 |
Group 3 | 22 |
Group 3 | 35 |
Group 3 | 42 |
Group 3 | 13 |
Group 3 | NA |
Group 3 | NA |
Group 3 | NA |
Group 4 | 42 |
Group 4 | 2 |
Group 4 | 5 |
Group 4 | 55 |
Group 4 | 9 |
Group 4 | 12 |
Group 4 | 15 |
Group 4 | 3 |
Group 4 | NA |
Group 4 | NA |
Group 4 | NA |
Group 4 | NA |
Group 4 | NA |
Group 4 | NA |
Group 4 | NA |
Group 5 | 2 |
Group 5 | 0 |
Group 5 | 3 |
Group 5 | 5 |
Group 5 | 7 |
Group 5 | 10 |
Group 5 | 13 |
Group 5 | 1 |
Group 5 | 15 |
Group 5 | 1 |
Group 5 | 4 |
Group 5 | 6 |
Group 5 | 11 |
Group 5 | 12 |
Group 5 | 14 |
Now we are ready to plot, starting with subsetting just groups 1 and 2 from the long data frame. Open circles show measurements for each participant or observation.
# plot
library(ggplot2)
library(dplyr)
# subset groups 1 & 2
independent_data_long_groups_1_and_2 <- independent_data_long %>%
  filter(group %in% c("Group 1", "Group 2"))
# plot: open circles are the individual values, the crossbar marks the median
ggplot(independent_data_long_groups_1_and_2, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
Plotting groups 1, 2, and 3, the only thing that changes is the subsetting method:
# subset groups 1, 2 & 3
independent_data_long_groups_1_2_3 <- independent_data_long %>%
  filter(group %in% c("Group 1", "Group 2", "Group 3"))
# plot: open circles are the individual values, the crossbar marks the median
ggplot(independent_data_long_groups_1_2_3, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
Plotting groups 1 to 4:
# groups 1, 2, 3, & 4
independent_data_long_groups_1_2_3_4 <- independent_data_long %>%
  filter(group %in% c("Group 1", "Group 2", "Group 3", "Group 4"))
# plot: open circles are the individual values, the crossbar marks the median
ggplot(independent_data_long_groups_1_2_3_4, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
And finally plotting all five groups, no subsetting required:
# all five groups: open circles are the individual values, the crossbar marks
# the median
ggplot(independent_data_long, aes(group, as.numeric(value))) +
  geom_point(shape = 1, size = 4) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
#Independent data, points jittered
Use the code chunks below to create scatterplots for independent data in two to five groups, when there are overlapping points in at least one group. Independent data means that the variable of interest is measured one time in each subject, and subjects are not related to each other. If your data do not meet these criteria, see the code chunks below for paired or non-independent data. Overlapping points means that two subjects have values that are so close that they will overlap on the plot, and you will not be able to see both points clearly. Adjust the `width` and `height` values in the `position_jitter` function to refine the jitter settings for the points so that they do not overlap. Note that I’ve set a jitter `height` value, which is not part of the Excel plots, but I think is quite effective at separating points.
First we get the data from the Excel spreadsheet:
library(readxl)
# read in data from PLOS Biology article supplementary materials
independent_data_j <- read_excel("journal.pbio.1002128.s002.XLSX", sheet = 2)
# subset data from the 'points jittered' sheet
independent_data_j <- independent_data_j[16:115, 2:3]
# group numbers are not given in the spreadsheet, so we'll add them
# (five consecutive blocks of 20 rows)
independent_data_j$Groups <- c(rep(1, 20), rep(2, 20), rep(3, 20), rep(4, 20), rep(5, 20))
# assign column names
names(independent_data_j) <- c("Subject ID", "Measurement", "Group")
The data are already in a nice tidy long format, with Group Name in one column and Measurement Values in another column, so we don’t need to reshape them. We can go directly to plotting them, first two groups, then three, then four, then all five groups. Once again the only thing that varies is how we subset the original data.
# plot
library(ggplot2)
library(dplyr)
# groups 1 & 2
independent_data_j_groups_1_and_2 <- independent_data_j %>%
  filter(Group %in% 1:2)
# plot: jittered open circles are the individual values, the crossbar marks
# the median
ggplot(independent_data_j_groups_1_and_2, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
# groups 1, 2 & 3
independent_data_j_groups_1_2_3 <- independent_data_j %>%
  filter(Group %in% 1:3)
# plot: jittered open circles are the individual values, the crossbar marks
# the median
ggplot(independent_data_j_groups_1_2_3, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
# groups 1, 2, 3, & 4
independent_data_j_groups_1_2_3_4 <- independent_data_j %>%
  filter(Group %in% 1:4)
# plot: jittered open circles are the individual values, the crossbar marks
# the median
ggplot(independent_data_j_groups_1_2_3_4, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
# all five groups: jittered open circles are the individual values, the
# crossbar marks the median
ggplot(independent_data_j, aes(as.factor(Group), as.numeric(Measurement))) +
  geom_jitter(shape = 1, size = 4, position = position_jitter(width = 0.2, height = 0.2)) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.2, size = 1) +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16)
#Paired or Non-independent Data: 1 Group, 2 Conditions
Use these chunks to create scatterplots for paired or matched data (2 conditions) in one group of subjects. Paired data are when you measure the variable of interest more than one time in each participant. Matched data are when participants in groups 1 and 2 are matched for important characteristics. If your data are independent, please see the chunks above for Independent Data.
The short horizontal black line in the “Difference in Measurement” graph shows the median difference. Medians for each condition are not shown, and should not be calculated. Unlike means, medians are not additive. The median difference does not equal the difference in the medians. Open circles and black lines connecting the circles show paired measurements for each participant or observation.
library(readxl)
# read in data from PLOS Biology article supplementary materials
One_group_two_conditions <- read_excel("journal.pbio.1002128.s003.XLS", sheet = 1)
# subset the rows and columns that hold the data on the first sheet
One_group_two_conditions <- One_group_two_conditions[12:23, 1:3]
# assign column names
names(One_group_two_conditions) <- c("Subject ID", "Condition 1 Name", "Condition 2 Name")
# per-row difference: condition 2 minus condition 1
One_group_two_conditions$difference <- as.numeric(One_group_two_conditions$`Condition 2 Name`) - as.numeric(One_group_two_conditions$`Condition 1 Name`)
The data in the Excel sheet are in an untidy wide format, so let’s convert them to a tidy long format:
# reshape for plotting: stack the two condition columns into `group` (former
# column name) and `value`, keeping `Subject ID` and `difference` as they are
library(tidyr)
One_group_two_conditions_long <- gather(One_group_two_conditions, group, value, `Condition 1 Name`:`Condition 2 Name`, -`Subject ID`, -difference, convert = TRUE)
Now we can plot:
# plot
library(ggplot2)
library(gridExtra)
# paired measurements: one line per subject connecting the two conditions
g1 <- ggplot(One_group_two_conditions_long, aes(group, as.numeric(value), group = `Subject ID`)) +
  geom_point(shape = 1, size = 4) +
  geom_line() +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# differences: individual differences with a crossbar at their median
g2 <- ggplot(One_group_two_conditions_long, aes(x = 1, y = difference)) +
  geom_point(shape = 1, size = 4) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.001, size = 1) +
  xlab("") +
  ylab("Difference in Measurement (units)") +
  theme_minimal(base_size = 16) +
  scale_x_continuous(breaks = NULL) +
  coord_fixed(ratio = 0.0005)
# combine the two plots
grid.arrange(g1, g2, ncol = 2)
#Paired or Non-independent Data: 2 Groups, 2 Conditions
First we read in the data from the spreadsheet:
library(readxl)
# read in data from PLOS Biology article supplementary materials
Two_groups_two_conditions <- read_excel("journal.pbio.1002128.s003.XLS", sheet = 2)
# subset the rows and columns that hold the data on the second sheet
Two_groups_two_conditions <- Two_groups_two_conditions[12:41, 2:5]
# assign group names (first 15 rows are group 1, last 15 rows are group 2)
Two_groups_two_conditions$group <- c(rep("Group 1 Name", 15), rep("Group 2 Name", 15))
# name all five columns; the `group` column just added keeps its name instead
# of being left without one
names(Two_groups_two_conditions) <- c("Condition 1 Name A", "Condition 2 Name A", "Condition 1 Name B", "Condition 2 Name B", "group")
The data in the Excel sheet are in an unusual structure, so a few steps for reshaping into a tidy form are needed. Here’s how we can tidy them and how they look after being tidied:
# convert to simple long form: put rows 16-30 of columns 3 and 4 underneath
# rows 1-15 of columns 1 and 2
Two_groups_two_conditions[, 1] <- unlist(c(Two_groups_two_conditions[1:15, 1], Two_groups_two_conditions[16:30, 3]))
Two_groups_two_conditions[, 2] <- unlist(c(Two_groups_two_conditions[1:15, 2], Two_groups_two_conditions[16:30, 4]))
# drop unneeded columns
Two_groups_two_conditions <- Two_groups_two_conditions[, c(1:2, 5)]
# assign column names
names(Two_groups_two_conditions) <- c("Condition 1", "Condition 2", "Group")
Two_groups_two_conditions$`Subject ID` <- 1:30
# compute differences (condition 2 minus condition 1)
Two_groups_two_conditions$difference <- as.numeric(Two_groups_two_conditions$`Condition 2`) - as.numeric(Two_groups_two_conditions$`Condition 1`)
# convert to long again: stack the two condition columns into `condition`
# (former column name) and `value`
library(tidyr)
Two_groups_two_conditions_long <- gather(Two_groups_two_conditions, condition, value, c(`Condition 1`, `Condition 2`), convert = TRUE)
kable(Two_groups_two_conditions_long)
Group | Subject ID | difference | condition | value |
---|---|---|---|---|
Group 1 Name | 1 | 8 | Condition 1 | 5 |
Group 1 Name | 2 | 4 | Condition 1 | 1 |
Group 1 Name | 3 | 5 | Condition 1 | 7 |
Group 1 Name | 4 | 2 | Condition 1 | 9 |
Group 1 Name | 5 | 7 | Condition 1 | 2 |
Group 1 Name | 6 | -1 | Condition 1 | 6 |
Group 1 Name | 7 | 1 | Condition 1 | 4 |
Group 1 Name | 8 | 3 | Condition 1 | 11 |
Group 1 Name | 9 | -2 | Condition 1 | 14 |
Group 1 Name | 10 | 6 | Condition 1 | 13 |
Group 1 Name | 11 | NA | Condition 1 | NA |
Group 1 Name | 12 | NA | Condition 1 | NA |
Group 1 Name | 13 | NA | Condition 1 | NA |
Group 1 Name | 14 | NA | Condition 1 | NA |
Group 1 Name | 15 | NA | Condition 1 | NA |
Group 2 Name | 16 | -2 | Condition 1 | 20 |
Group 2 Name | 17 | -4 | Condition 1 | 13 |
Group 2 Name | 18 | 1 | Condition 1 | 15 |
Group 2 Name | 19 | 5 | Condition 1 | 8 |
Group 2 Name | 20 | 2 | Condition 1 | 3 |
Group 2 Name | 21 | 1 | Condition 1 | 7 |
Group 2 Name | 22 | -7 | Condition 1 | 14 |
Group 2 Name | 23 | 0 | Condition 1 | 12 |
Group 2 Name | 24 | 3 | Condition 1 | 11 |
Group 2 Name | 25 | 1 | Condition 1 | 9 |
Group 2 Name | 26 | NA | Condition 1 | NA |
Group 2 Name | 27 | NA | Condition 1 | NA |
Group 2 Name | 28 | NA | Condition 1 | NA |
Group 2 Name | 29 | NA | Condition 1 | NA |
Group 2 Name | 30 | NA | Condition 1 | NA |
Group 1 Name | 1 | 8 | Condition 2 | 13 |
Group 1 Name | 2 | 4 | Condition 2 | 5 |
Group 1 Name | 3 | 5 | Condition 2 | 12 |
Group 1 Name | 4 | 2 | Condition 2 | 11 |
Group 1 Name | 5 | 7 | Condition 2 | 9 |
Group 1 Name | 6 | -1 | Condition 2 | 5 |
Group 1 Name | 7 | 1 | Condition 2 | 5 |
Group 1 Name | 8 | 3 | Condition 2 | 14 |
Group 1 Name | 9 | -2 | Condition 2 | 12 |
Group 1 Name | 10 | 6 | Condition 2 | 19 |
Group 1 Name | 11 | NA | Condition 2 | NA |
Group 1 Name | 12 | NA | Condition 2 | NA |
Group 1 Name | 13 | NA | Condition 2 | NA |
Group 1 Name | 14 | NA | Condition 2 | NA |
Group 1 Name | 15 | NA | Condition 2 | NA |
Group 2 Name | 16 | -2 | Condition 2 | 18 |
Group 2 Name | 17 | -4 | Condition 2 | 9 |
Group 2 Name | 18 | 1 | Condition 2 | 16 |
Group 2 Name | 19 | 5 | Condition 2 | 13 |
Group 2 Name | 20 | 2 | Condition 2 | 5 |
Group 2 Name | 21 | 1 | Condition 2 | 8 |
Group 2 Name | 22 | -7 | Condition 2 | 7 |
Group 2 Name | 23 | 0 | Condition 2 | 12 |
Group 2 Name | 24 | 3 | Condition 2 | 14 |
Group 2 Name | 25 | 1 | Condition 2 | 10 |
Group 2 Name | 26 | NA | Condition 2 | NA |
Group 2 Name | 27 | NA | Condition 2 | NA |
Group 2 Name | 28 | NA | Condition 2 | NA |
Group 2 Name | 29 | NA | Condition 2 | NA |
Group 2 Name | 30 | NA | Condition 2 | NA |
Now we can plot:
# plot
library(ggplot2)
# paired measurements: one line per subject, one panel per group
g1 <- ggplot(Two_groups_two_conditions_long, aes(condition, as.numeric(value), group = `Subject ID`)) +
  geom_point(size = 4, shape = 1) +
  geom_line() +
  xlab("") +
  ylab("Measurement (units)") +
  theme_minimal(base_size = 16) +
  facet_grid(~Group) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# difference: individual differences per group with a crossbar at the median
g2 <- ggplot(Two_groups_two_conditions_long, aes(Group, difference)) +
  geom_point(size = 4, shape = 1) +
  # fun/fun.min/fun.max replace the deprecated fun.y/fun.ymin/fun.ymax
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", width = 0.3) +
  xlab("") +
  ylab("Difference in Measurement (units)") +
  theme_minimal(base_size = 16) +
  coord_fixed(ratio = 0.15)
# combine the two plots
grid.arrange(g1, g2, ncol = 2)
Here’s a summary of the computational environment that these plots were produced in. If you want to reproduce the plots here you’ll need a similar version of R and the packages listed below:
# Print the R version, platform, locale and attached/loaded packages.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Chinese (Traditional)_Taiwan.950
## [2] LC_CTYPE=Chinese (Traditional)_Taiwan.950
## [3] LC_MONETARY=Chinese (Traditional)_Taiwan.950
## [4] LC_NUMERIC=C
## [5] LC_TIME=Chinese (Traditional)_Taiwan.950
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 readxl_1.3.1 printr_0.2 knitr_1.36
## [5] tidyr_1.1.4 readr_2.0.1 ggalluvial_0.12.3 dplyr_1.0.7
## [9] magrittr_2.0.1 ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.1 xfun_0.26 purrr_0.3.4 splines_4.1.2
## [5] lattice_0.20-45 colorspace_2.0-2 vctrs_0.3.8 generics_0.1.0
## [9] htmltools_0.5.2 yaml_2.2.1 mgcv_1.8-38 utf8_1.2.2
## [13] rlang_0.4.11 jquerylib_0.1.4 pillar_1.6.3 glue_1.4.2
## [17] withr_2.4.2 DBI_1.1.1 lifecycle_1.0.1 stringr_1.4.0
## [21] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0 evaluate_0.14
## [25] labeling_0.4.2 tzdb_0.1.2 fastmap_1.1.0 fansi_0.5.0
## [29] highr_0.9 Rcpp_1.0.7 scales_1.1.1 farver_2.1.0
## [33] hms_1.1.1 digest_0.6.28 stringi_1.7.4 grid_4.1.2
## [37] tools_4.1.2 tibble_3.1.5 pacman_0.5.1 crayon_1.4.1
## [41] pkgconfig_2.0.3 ellipsis_0.3.2 Matrix_1.3-4 assertthat_0.2.1
## [45] rmarkdown_2.11 R6_2.5.1 nlme_3.1-153 compiler_4.1.2
The dataset consists of a sample of 14 primary school children between 8 and 12 years old. The children were asked to respond on 8 emotions and coping strategies scales for each of 6 situations: fail to fulfill assignments in class, not allowed to play with other children, forbidden to do something by the teacher, victim of bullying, too much school work, forbidden to do something by the mother. Plot the data in some meaningful ways. You may have to manipulate data into a different format first.
Data
Column 1: Unpleasant (Annoy) Column 2: Sad Column 3: Afraid Column 4: Angry Column 5: Approach coping Column 6: Avoidant coping Column 7: Social support seeking Column 8: Emotional reaction, especially aggression Column 9: Situation ID Column 10: Children ID Source: Roeder, I., Boekaerts, M., & Kroonenberg, P. M. (2002). The stress and coping questionnaire for children (School version and Asthma version): Construction, factor structure, and psychometric properties. Psychological Reports, 91, 29-36.
# Read the coping data (text file with a header row) and preview the first
# rows.
Edat3 <- read.table("coping.txt", header = TRUE)
head(Edat3)
annoy | sad | afraid | angry | approach | avoid | support | agressive | situation | sbj |
---|---|---|---|---|---|---|---|---|---|
4 | 2 | 2 | 2 | 1.00 | 2.00 | 1.00 | 2.50 | Fail | S2 |
4 | 4 | 4 | 2 | 4.00 | 3.00 | 1.25 | 1.50 | NoPart | S2 |
2 | 2 | 2 | 2 | 2.67 | 3.00 | 1.00 | 2.33 | TeacNo | S2 |
4 | 3 | 4 | 4 | 4.00 | 1.50 | 3.25 | 1.00 | Bully | S2 |
4 | 2 | 1 | 1 | 1.00 | 2.75 | 1.25 | 1.50 | Work | S2 |
4 | 3 | 1 | 4 | 2.33 | 2.50 | 1.00 | 3.67 | MomNo | S2 |
# Show the structure of Edat3: dimensions, column types and leading values.
str(Edat3)
## 'data.frame': 84 obs. of 10 variables:
## $ annoy : int 4 4 2 4 4 4 3 3 3 4 ...
## $ sad : int 2 4 2 3 2 3 2 1 1 4 ...
## $ afraid : int 2 4 2 4 1 1 2 1 1 2 ...
## $ angry : int 2 2 2 4 1 4 2 2 2 1 ...
## $ approach : num 1 4 2.67 4 1 2.33 2 1.33 1 1.67 ...
## $ avoid : num 2 3 3 1.5 2.75 2.5 1 4 1 4 ...
## $ support : num 1 1.25 1 3.25 1.25 1 1.5 2.75 1.33 3.5 ...
## $ agressive: num 2.5 1.5 2.33 1 1.5 3.67 1 2 1.67 2.5 ...
## $ situation: chr "Fail" "NoPart" "TeacNo" "Bully" ...
## $ sbj : chr "S2" "S2" "S2" "S2" ...
# Per-column summary statistics of Edat3.
summary(Edat3)
annoy | sad | afraid | angry | approach | avoid | support | agressive | situation | sbj | |
---|---|---|---|---|---|---|---|---|---|---|
Min. :1.000 | Min. :1.00 | Min. :1.000 | Min. :1.000 | Min. :1.000 | Min. :1.000 | Min. :0.000 | Min. :1.000 | Length:84 | Length:84 | |
1st Qu.:2.000 | 1st Qu.:1.00 | 1st Qu.:1.000 | 1st Qu.:1.000 | 1st Qu.:1.670 | 1st Qu.:1.750 | 1st Qu.:1.330 | 1st Qu.:1.000 | Class :character | Class :character | |
Median :3.000 | Median :2.00 | Median :1.000 | Median :2.000 | Median :2.000 | Median :2.500 | Median :2.000 | Median :1.500 | Mode :character | Mode :character | |
Mean :2.762 | Mean :1.81 | Mean :1.405 | Mean :2.131 | Mean :2.236 | Mean :2.398 | Mean :1.959 | Mean :1.542 | NA | NA | |
3rd Qu.:4.000 | 3rd Qu.:2.00 | 3rd Qu.:2.000 | 3rd Qu.:2.250 | 3rd Qu.:3.000 | 3rd Qu.:3.000 | 3rd Qu.:2.373 | 3rd Qu.:1.670 | NA | NA | |
Max. :4.000 | Max. :4.00 | Max. :4.000 | Max. :9.000 | Max. :4.000 | Max. :4.000 | Max. :4.000 | Max. :4.000 | NA | NA |
## Use reshape() to convert the data from wide to long: the eight scale
## columns are stacked into `score`, and `Emotion` records which scale each
## value came from.
Edat3_long <- reshape(Edat3,
  direction = "long",
  varying = c("annoy", "sad", "afraid", "angry", "approach", "avoid", "support", "agressive"),
  v.names = "score",
  timevar = "Emotion",
  times = c("annoy", "sad", "afraid", "angry", "approach", "avoid", "support", "agressive"),
  new.row.names = 1:1000
)
# Preview the first 11 rows.
Edat3_long |>
  as.data.frame() |>
  head(11)
situation | sbj | Emotion | score | id |
---|---|---|---|---|
Fail | S2 | annoy | 4 | 1 |
NoPart | S2 | annoy | 4 | 2 |
TeacNo | S2 | annoy | 2 | 3 |
Bully | S2 | annoy | 4 | 4 |
Work | S2 | annoy | 4 | 5 |
MomNo | S2 | annoy | 4 | 6 |
Fail | S17 | annoy | 3 | 7 |
NoPart | S17 | annoy | 3 | 8 |
TeacNo | S17 | annoy | 3 | 9 |
Bully | S17 | annoy | 4 | 10 |
Work | S17 | annoy | 4 | 11 |
# Score per scale, one panel per situation, panels arranged in three rows.
ggplot(Edat3_long, aes(Emotion, score, group = Emotion)) +
  geom_point(aes(color = Emotion)) +
  geom_boxplot() +
  facet_wrap(~situation, nrow = 3) + # as used earlier: facet by one variable
  labs(x = "Emotion", y = "Score") +
  theme(axis.text.x = element_text(angle = 30, size = 8, hjust = 1.5))
# Same plot with all panels in a single row to compare the distributions.
ggplot(Edat3_long, aes(Emotion, score, group = Emotion)) +
  geom_point(aes(color = Emotion)) +
  geom_boxplot() +
  facet_wrap(~situation, nrow = 1) +
  labs(x = "Emotion", y = "Score") +
  theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1.5))
# Work in progress (original note: "to be tried again"): mean "annoy" score
# per situation with +/- 1 standard-error bars.
# NOTE(review): the original draft was collapsed onto one line, referred to an
# `annoy` column that no longer exists after the wide-to-long reshape, and gave
# aes() no y variable. The x variable, the grouping and the x-axis label below
# are a best-guess reconstruction -- confirm against the intended figure.
library(dplyr)

pd <- position_dodge(.3)
p <- na.omit(Edat3_long) %>%
  # The long table stores every emotion in `score`; keep the "annoy" rows only.
  filter(Emotion == "annoy") %>%
  group_by(situation) %>%
  summarize(m_p = mean(score),
            se_p = sd(score) / sqrt(n()),
            .groups = "drop") %>%
  ggplot() +
  # group = 1 lets the dotted line connect the situation means.
  aes(situation, m_p, group = 1, shape = situation) +
  geom_errorbar(aes(ymin = m_p - se_p, ymax = m_p + se_p),
                width = .2, size = .5, position = pd) +
  geom_line(position = pd, linetype = "dotted") +
  # a larger size value makes the points more prominent
  geom_point(position = pd, size = rel(3)) +
  scale_shape(guide = guide_legend(title = "Group")) +
  labs(x = "Situation", y = "Unpleasant (Annoy)") +
  theme(legend.position = c(.1, .5))
p
Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the following categories: food and tobacco, household operation, medical and health, personal care, and private education, for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot in the “Time Use” case study on the course web page. You might want to transform the dollar amounts to log base 10 units first.
# Load the US personal expenditure data (a 5 x 5 numeric matrix: spending
# categories in rows, years in columns), keep a working copy, and preview it.
data(USPersonalExpenditure, package = "datasets")
Edat4 <- USPersonalExpenditure
head(Edat4) |> knitr::kable()
1940 | 1945 | 1950 | 1955 | 1960 | |
---|---|---|---|---|---|
Food and Tobacco | 22.200 | 44.500 | 59.60 | 73.2 | 86.80 |
Household Operation | 10.500 | 15.500 | 29.00 | 36.5 | 46.20 |
Medical and Health | 3.530 | 5.760 | 9.71 | 14.0 | 21.10 |
Personal Care | 1.040 | 1.980 | 2.45 | 3.4 | 5.40 |
Private Education | 0.341 | 0.974 | 1.80 | 2.6 | 3.64 |
# Inspect the structure of the expenditure matrix.
str(Edat4)
## num [1:5, 1:5] 22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:5] "Food and Tobacco" "Household Operation" "Medical and Health" "Personal Care" ...
## ..$ : chr [1:5] "1940" "1945" "1950" "1955" ...
## Convert the amounts to log base 10 units, rounded to four decimal places.
log10_Edat4 <- round(log10(Edat4), digits = 4)
log10_Edat4
1940 | 1945 | 1950 | 1955 | 1960 | |
---|---|---|---|---|---|
Food and Tobacco | 1.3464 | 1.6484 | 1.7752 | 1.8645 | 1.9385 |
Household Operation | 1.0212 | 1.1903 | 1.4624 | 1.5623 | 1.6646 |
Medical and Health | 0.5478 | 0.7604 | 0.9872 | 1.1461 | 1.3243 |
Personal Care | 0.0170 | 0.2967 | 0.3892 | 0.5315 | 0.7324 |
Private Education | -0.4672 | -0.0114 | 0.2553 | 0.4150 | 0.5611 |
library(reshape2)
## Melt the category-by-year matrix into long format: one row per
## (category, year) pair. reshape2::melt() is used so the call matches the
## package loaded above (the original called the separate `reshape` package).
log_Edat4_long <- reshape2::melt(log10_Edat4)
names(log_Edat4_long) <- c("Items", "Year", "US_Dollar")
# Preview the first 10 rows of the long table.
log_Edat4_long |> as.data.frame() |> head(10)
Items | Year | US_Dollar |
---|---|---|
Food and Tobacco | 1940 | 1.3464 |
Household Operation | 1940 | 1.0212 |
Medical and Health | 1940 | 0.5478 |
Personal Care | 1940 | 0.0170 |
Private Education | 1940 | -0.4672 |
Food and Tobacco | 1945 | 1.6484 |
Household Operation | 1945 | 1.1903 |
Medical and Health | 1945 | 0.7604 |
Personal Care | 1945 | 0.2967 |
Private Education | 1945 | -0.0114 |
## Plot: log10 expenditure over the years, one panel per spending category.
## (US_Dollar holds the log10-transformed amounts computed earlier.)
ggplot(data = log_Edat4_long,
       mapping = aes(x = Year, y = US_Dollar, group = Items, color = Items)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ Items, nrow = 1) +
  labs(x = "Year", y = "US Dollar") +
  theme(axis.text.x = element_text(angle = 45, size = 10, hjust = 1))
## All spending categories overlaid in a single panel for direct comparison.
ggplot(data = log_Edat4_long,
       mapping = aes(x = Year, y = US_Dollar, group = Items, color = Items)) +
  geom_line() +
  geom_point() +
  labs(x = "Year", y = "US Dollar") +
  theme(axis.text.x = element_text(size = 12, hjust = 1.5))
## Lollipop plot: for each year (one panel per year), a segment runs from the
## zero reference line to each category's value and ends in a coloured point.
ggplot(data = log_Edat4_long, mapping = aes(x = US_Dollar, y = Items)) +
  geom_vline(xintercept = 0) +
  geom_segment(mapping = aes(xend = 0, yend = Items)) +
  geom_point(mapping = aes(color = Items)) +
  facet_wrap(~ Year, nrow = 1) +
  labs(x = "US_Dollar", y = "Items") +
  theme(axis.text.x = element_text(angle = 30, size = 8, hjust = 1))
Use the Cushings{MASS} data set to generate a plot
##ggrepel 解決文字標示重疊問題的套件
library(ggrepel)
# Load the Cushings data from MASS, keep a working copy, and preview it.
data(Cushings, package = "MASS")
Edat5 <- Cushings
head(Edat5)
Tetrahydrocortisone | Pregnanetriol | Type | |
---|---|---|---|
a1 | 3.1 | 11.70 | a |
a2 | 3.0 | 1.30 | a |
a3 | 1.9 | 0.10 | a |
a4 | 3.8 | 0.04 | a |
a5 | 4.1 | 1.10 | a |
a6 | 1.9 | 0.40 | a |
# Inspect the structure of the Cushings data.
str(Edat5)
## 'data.frame': 27 obs. of 3 variables:
## $ Tetrahydrocortisone: num 3.1 3 1.9 3.8 4.1 1.9 8.3 3.8 3.9 7.8 ...
## $ Pregnanetriol : num 11.7 1.3 0.1 0.04 1.1 0.4 1 0.2 0.6 1.2 ...
## $ Type : Factor w/ 4 levels "a","b","c","u": 1 1 1 1 1 1 2 2 2 2 ...
# Add Typef: the one-letter Type codes recoded to descriptive labels, with the
# levels reordered to u, b, c, a.
Edat5$Typef <- factor(Edat5$Type,
                      levels = c("u", "b", "c", "a"),
                      labels = c("Unknown",
                                 "Bilateral Hyperplasia",
                                 "Carcinoma",
                                 "Adenoma"))
head(Edat5)
Tetrahydrocortisone | Pregnanetriol | Type | Typef | |
---|---|---|---|---|
a1 | 3.1 | 11.70 | a | Adenoma |
a2 | 3.0 | 1.30 | a | Adenoma |
a3 | 1.9 | 0.10 | a | Adenoma |
a4 | 3.8 | 0.04 | a | Adenoma |
a5 | 4.1 | 1.10 | a | Adenoma |
a6 | 1.9 | 0.40 | a | Adenoma |
# Prepare the text labels first: exactly one row per type carries a label
# (rows 1, 13, 21 and 27); every other row stays NA so it gets no label.
Edat5$text <- NA_character_
Edat5$text[c(1, 13, 21, 27)] <- c("Adenoma", "Bilateral Hyperplasia",
                                  "Carcinoma", "Unknown")
## Scatter plot using the labels prepared in Edat5$text (one label per type).
## Point fill and label colour are both mapped to Typef so that each label has
## the same colour as the points of its type. (Mapping fill to Type and colour
## to Typef, whose levels are in a different order, gave mismatched colours.)
ggplot(Edat5, aes(Tetrahydrocortisone, Pregnanetriol, fill = Typef)) +
  # point shape 21 is a circle that can be filled with a colour
  geom_point(pch = 21, size = rel(2)) +
  # rows whose label is NA are dropped; na.rm = TRUE silences the warning
  geom_text_repel(aes(label = text, color = Typef), na.rm = TRUE) +
  # set the axis breaks (and the x-axis limits)
  scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 10, 12)) +
  scale_x_continuous(limits = c(0, 60),
                     breaks = c(0, 10, 20, 30, 40, 50, 60)) +
  # plot and axis titles
  labs(x = "Tetrahydrocortisone (mg/24 hours)",
       y = "Pregnanetriol (mg/24 hours)",
       title = "Cushings's syndrome") +
  # right-align the title and hide the legends ("none" is the documented value)
  theme(plot.title = element_text(hjust = 1),
        legend.position = "none")
## Label every observation with its row name; ggrepel keeps the labels from
## overlapping. Points are coloured by the descriptive type factor.
ggplot(data = Edat5,
       mapping = aes(x = Tetrahydrocortisone, y = Pregnanetriol)) +
  geom_point(aes(color = Typef)) +
  geom_text_repel(aes(label = rownames(Edat5))) +
  labs(title = "Cushings's syndrome",
       x = "Tetrahydrocortisone (mg/24 hours)",
       y = "Pregnanetriol (mg/24 hours)") +
  # right-align the plot title
  theme(plot.title = element_text(hjust = 1))