Task: Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.
::loadfonts(device = "win")
extrafont## basic graphics
## plot without any points
plot(women, type='n')
## only plot first row data (58, 115)
points(women[1,])
## lattice graphics
## xyplot(y ~ x), only plot first row data
::xyplot(weight ~ height,
latticedata = women,
subset = row.names(women)==1, type = 'p')
## ggplot graphics
library(ggplot2)
ggplot(data=women[1,], aes(height, weight)) +
geom_point()
## basic plot cannot save as an object
<- plot(women, type='n') w1
## the xyplot can save as a trellis object
<- lattice::xyplot(weight ~ height, data=women) w2
## the class of w1 is null due to basic plot cannot save as an object
class(w1)
## [1] "NULL"
## the class of w2 is trellis due to lattice plot save as an trellis object
class(w2)
## [1] "trellis"
## List Methods for trellis objectice
methods(class="trellis")
## [1] [ dim dimnames dimnames<- plot print summary
## [8] t update
## see '?methods' for accessing help and source code
## e.g.:
summary(w2)
##
## Call:
## xyplot(weight ~ height, data = women)
##
## Number of observations:
## [1] 15
The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research interest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.
The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category.
Task: Reproduce the plot below.
Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.
Column 1: School ID
Column 2: Pupil ID
Column 3: Verbal IQ score
Column 4: The number of pupils in a class
Column 5: Language test score
Column 6: Arithmetic test score
::p_load(ggplot2, hrbrthemes, dplyr)
pacman
# input data
<- read.table("langMathDutch.txt", header = T)
dat # rename column
names(dat) <- c("School", "Pupil", "VIQ", "Size", "Language", "Math")#
# class size category
|>
dat ::summarise(Size = quantile(Size, c(0.33, 0.66, 0.99)),
dplyrcategory = c("Small", "Medium", "Large"))
## Size category
## 1 20 Small
## 2 27 Medium
## 3 37 Large
# verbal IQ category
|>
dat ::summarise(VIQ = quantile(VIQ, c(0.33, 0.66, 0.99)),
dplyrcategory = c("Low", "Middle", "High"))
## VIQ category
## 1 11.0 Low
## 2 12.5 Middle
## 3 17.0 High
<- dat |>
dat mutate(Sizec = cut(Size, 3,
labels=c("Small", "Medium", "Large"),
right = TRUE,
ordered_result = TRUE),
VIQc = cut(VIQ, 3,
labels=c("Low", "Middle", "High"),
right = TRUE,
ordered_result = TRUE))
ggplot(data= dat, aes(Language, Math))+
geom_point(shape= 18, size= 1.5)+
stat_smooth(formula= y ~ x, method= "lm", se= T, size= 0.8)+
labs(x= "Language score", y= "Arithmetic score")+
facet_wrap(. ~ Sizec + VIQc, labeller= labeller(.multi_line= F))+
theme_minimal()
A sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.
Data: autism{WWGbook}
Column 1: Age (in years)
Column 2: Vineland Socialization Age Equivalent score
Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High)
Column 4: Child ID
Task: Replicate the two plots above using ggplot2.
# input autism data from {WWGbook}
<- na.omit(WWGbook::autism)
dat
# rename column
names(dat) <- c("Age", "Socialization", "Communication", "Child")
# set level labels
$Communication <- factor(dat$Communication,
datlevels=c(1, 2, 3),
labels=c("L", "M", "H"))
ggplot(dat, aes(x = scale(Age, center = TRUE, scale = F), # Centered Age
y = Socialization)) +
geom_line(aes(group = Child)) +
geom_point(alpha = .5) +
stat_smooth(formula = y ~ poly(x, 2), method = "lm", se = T) + # set poly curve
facet_grid(. ~ Communication) +
labs(x = "Age (in years, centered)", y = "VSAE score") +
theme_minimal()
# Warning: Removed 1 row(s) containing missing values (geom_path).
# Warning: Removed 2 rows containing missing values (geom_point).
<- position_dodge(.3)
pd
%>%
dat group_by(Age, Communication) %>%
summarize(m_p = mean(Socialization),
se_p = sd(Socialization)/sqrt(n()), .groups = "drop") %>%
ggplot(aes(x = Age-2, m_p,
group = Communication,
shape = Communication,
linetype = Communication)) +
geom_errorbar(aes(ymin = m_p - se_p,
ymax = m_p + se_p),
width = .2, size= .3,
position = pd,
linetype = 1) +
geom_line(position = pd) +
geom_point(position = pd, size = rel(3)) +
scale_shape_manual(values=c(1, 2, 16))+
labs(x = "Age (in years, -2)", y = "VSAE score")+
theme_bw()+
guides(fill=guide_legend(title="Group"))+
theme(legend.position = c(.1, .8),
legend.background = element_rect(fill="white",
size=0.5, linetype="solid",
colour ="black"))
## na.omit 後,Low組別的線才可以畫出來
Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.
Use the diabetes dataset to generate a plot similar to the one below and interpret the plot.
::p_load(ggalluvial)
pacman
<- read.csv("diabetes_mell.csv", header = T) dat
<- data.frame(with(dat[, c("BMI", "race", "gender", "diabetes")],
dta_v3 xtabs(~ BMI + race + gender + diabetes)))
# relevel
# 原本使用levels: levels(dta_v3$race) <- c("Hispanic", "White", "Black")會錯置
levels(dta_v3$race) <- list(Hispanic = "Hispanic",
White = "White",
Black = "Black")
$gender<-relevel(dta_v3$gender, "Males")
dta_v3$diabetes<-relevel(dta_v3$diabetes, "Yes")
dta_v3
ggplot(dta_v3,
aes(axis1=race,
axis2=gender,
axis3=diabetes,
y=Freq)) +
scale_x_discrete(limits=c("race",
"gender",
"diabetes"),
expand=c(.1, .05)) +
labs(y = "No. individuals") +
geom_alluvium(aes(fill=BMI)) +
geom_stratum() +
geom_text(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_fill_manual(values=c("#A6AEB1","#EFCAAA"))+
theme_minimal() +
ggtitle("Diabetes in overall population in US 2009-2010",
subtitle = "stratified by race, gender and diabetes mellitus")+
theme(legend.position='bottom')
Fifty male and fifty female students fill out the same questionnaire in weekly intervals starting five weeks before an important examination to measure state anxiety.
The research interests are:
1. whether there are gender difference in state anxiety
2. individual differences in state anxiety
Task: Explore the answers to both questions with plots involving confidence intervals or error bars for the means.
Column 1: Anxiety score 5 weeks before exam for female
Column 2: Anxiety score 4 weeks before exam for female
Column 3: Anxiety score 3 weeks before exam for female
Column 4: Anxiety score 2 weeks before exam for female
Column 5: Anxiety score 1 weeks before exam for female
Column 6: Anxiety score 5 weeks before exam for male
Column 7: Anxiety score 4 weeks before exam for male
Column 8: Anxiety score 3 weeks before exam for male
Column 9: Anxiety score 2 weeks before exam for male
Column 10: Anxiety score 1 weeks before exam for male
::p_load(readr, tidyr)
pacman
<- read.table("stateAnxiety.txt", header = T)
dat
<- dat |>
dat_l pivot_longer(cols = starts_with(c("f","m")), # wide to long
names_to = "Week",
values_to = "Score") |>
mutate(Gender = gsub("[0-9]","",Week),
Week = parse_number(Week),
ID = rep(1:100, each=5))
ggplot(dat_l, aes(x=Week, y=Score, group=ID))+
geom_line()+
geom_point()+
facet_wrap(~ Gender)+
labs(x="weeks before an important examination", y="Anxiety score")+
theme_minimal()+
theme(legend.position="")
<- position_dodge(0.8)
pd <- position_jitter(width = 0.2, height = 0)
jitter
ggplot(dat_l, aes(x=Week, y=Score, color=Gender))+
geom_boxplot(aes(group=Week), width = 0.1, color="gray80")+
geom_boxplot(aes(group=interaction(Week, Gender)), width = 0.5, position = pd)+
#geom_boxplot(data=subset(dat_l, Gender=="m"), aes(group=Week), width = 0.1, position = pd)+
#geom_boxplot(data=subset(dat_l, Gender=="f"), aes(group=Week), width = 0.1, position = pd)+
geom_point(size = 0.8, position = jitter)+
stat_smooth(formula= y~x, method="lm", se=T, size=0.8)+
labs(x="weeks before an important examination", y="Anxiety score")+
theme_minimal()+
theme(legend.position="bottom")
Source: Von Eye, A., & Schuster C. (1998). Regression Analysis for Social Sciences. San Diego: Academic Press.
Use the markdown file to replicate the contents of Weissgerber, T.L., Milic, N.M., Winham, S.J., Garovic, V.D. (2015). Beyond Bar and Line Graphs: Time for a New Data Presentation Paradigm. PLOS Biology , 13.
The two data sets are here provided: journal.pbio.1002128.s002.XLS and journal.pbio.1002128.s003.XLS.
You can also download everything in a zip file from this location.
## Registered S3 method overwritten by 'printr':
## method from
## knit_print.data.frame rmarkdown
library(readxl)
# read in data from PLOS Biology article supplementary materials
<- read_excel("journal.pbio.1002128.s002.XLSX", sheet = 1)
independent_data # subset just groups 1-5 from the 'No overlapping points' sheet
<- independent_data[15:30,2:6]
independent_data # assign column names
names(independent_data) <- independent_data[1, ]
# remove first row with column names
<- independent_data[-1, ]
independent_data kable(independent_data)
Group 1 | Group 2 | Group 3 | Group 4 | Group 5 |
---|---|---|---|---|
5 | 7 | 9 | 42 | 2 |
3 | 3 | 7 | 2 | 0 |
6 | 9 | 10 | 5 | 3 |
8 | 10 | 12 | 55 | 5 |
10 | 33 | 14 | 9 | 7 |
13 | 15 | 17 | 12 | 10 |
1 | 18 | 20 | 15 | 13 |
4 | 6 | 40 | 3 | 1 |
18 | 20 | 22 | NA | 15 |
4 | 30 | 35 | NA | 1 |
7 | NA | 42 | NA | 4 |
9 | NA | 13 | NA | 6 |
14 | NA | NA | NA | 11 |
15 | NA | NA | NA | 12 |
17 | NA | NA | NA | 14 |
An important step is reshaping the data from their current wide format to a more tidy long format. Long formats are most useful for plotting and statistical analysis in R. Here’s what the data look like in the long format:
# reshape for plotting
library(tidyr)
<- gather(independent_data, group, value, `Group 1`:`Group 5`, convert = TRUE)
independent_data_long ::datatable(independent_data_long, rownames=FALSE, options=list(pageLength=15)) DT
Now we are ready to plot, starting with subsetting just groups 1 and and 2 from the long data frame. Open circles show measurements for each participant or observation.
# plot
library(ggplot2)
library(dplyr)
# subset groups 1 & 2
<- independent_data_long %>%
independent_data_long_groups_1_and_2 filter(group %in% c("Group 1", "Group 2"))
# plot
ggplot(independent_data_long_groups_1_and_2, aes(group, as.numeric(value))) +
geom_point(shape = 1, size = 4) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
Plotting groups 1, 2, and 3, the only thing that changes is the subsetting method:
# subset groups 1, 2 & 3
<- independent_data_long %>%
independent_data_long_groups_1_2_3 filter(group %in% c("Group 1", "Group 2", "Group 3"))
# plot
ggplot(independent_data_long_groups_1_2_3, aes(group, as.numeric(value))) +
geom_point(shape = 1, size = 4) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
Plotting groups 1 to 4:
# groups 1, 2, 3, & 4
<- independent_data_long %>%
independent_data_long_groups_1_2_3_4 filter(group %in% c("Group 1", "Group 2", "Group 3", "Group 4"))
# plot
ggplot(independent_data_long_groups_1_2_3_4, aes(group, as.numeric(value))) +
geom_point(shape = 1, size = 4) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
And finally plotting all five groups, no subsetting required:
# all five groups
ggplot(independent_data_long, aes(group, as.numeric(value))) +
geom_point(shape = 1, size = 4) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
library(readxl)
# read in data from PLOS Biology article supplementary materials
<- read_excel("journal.pbio.1002128.s002.XLSX", sheet = 2)
independent_data_j # subset data from the 'points jittered' sheet
<- independent_data_j[16:115,2:3]
independent_data_j # group numbers are not given in the spreadsheet, so we'll add them
$Groups <- c(rep(1, 20), rep(2, 20), rep(3, 20), rep(4, 20), rep(5, 20))
independent_data_j# assign column names
names(independent_data_j) <- c("Subject ID", "Measurement", "Group")
# plot
library(ggplot2)
library(dplyr)
# groups 1 & 2
<- independent_data_j %>%
independent_data_j_groups_1_and_2 filter(Group %in% 1:2)
# plot
ggplot(independent_data_j_groups_1_and_2, aes(as.factor(Group), as.numeric(Measurement))) +
geom_jitter(shape = 1, size = 4, position=position_jitter(width = 0.2, height = 0.2)) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
# groups 1, 2 & 3
<- independent_data_j %>%
independent_data_j_groups_1_2_3 filter(Group %in% 1:3)
# plot
ggplot(independent_data_j_groups_1_2_3, aes(as.factor(Group), as.numeric(Measurement))) +
geom_jitter(shape = 1, size = 4, position=position_jitter(width = 0.2, height = 0.2)) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
# groups 1, 2, 3, & 4
<- independent_data_j %>%
independent_data_j_groups_1_2_3_4 filter(Group %in% 1:4)
# plot
ggplot(independent_data_j_groups_1_2_3_4, aes(as.factor(Group), as.numeric(Measurement))) +
geom_jitter(shape = 1, size = 4, position=position_jitter(width = 0.2, height = 0.2)) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
# all five groups
ggplot(independent_data_j, aes(as.factor(Group), as.numeric(Measurement))) +
geom_jitter(shape = 1, size = 4, position=position_jitter(width = 0.2, height = 0.2)) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.2, size = 1) +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16)
library(readxl)
# read in data from PLOS Biology article supplementary materials
<- read_excel("journal.pbio.1002128.s003.XLS", sheet = 1)
One_group_two_conditions # subset data from the 'points jittered' sheet
<- One_group_two_conditions[12:23,1:3]
One_group_two_conditions # assign column names
names(One_group_two_conditions) <- c("Subject ID", "Condition 1 Name", "Condition 2 Name")
$difference <- as.numeric(One_group_two_conditions$`Condition 2 Name`) - as.numeric(One_group_two_conditions$`Condition 1 Name`) One_group_two_conditions
The data in the Excel sheet are in an untidy wide format, so let’s convert them to a tidy long format:
# reshape for plotting
library(tidyr)
<- gather(One_group_two_conditions, group, value, `Condition 1 Name`:`Condition 2 Name`, -`Subject ID`, -difference, convert = TRUE) One_group_two_conditions_long
Now we can plot:
# plot
library(ggplot2)
library(gridExtra)
<- ggplot(One_group_two_conditions_long, aes(group, as.numeric(value), group = `Subject ID`)) +
g1 geom_point(shape = 1, size = 4) +
geom_line() +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# differences
<- ggplot(One_group_two_conditions_long, aes(x = 1, y = difference)) +
g2 geom_point(shape = 1, size = 4) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.001, size = 1) +
xlab("") +
ylab("Difference in Measurement (units)") +
theme_minimal(base_size = 16) +
scale_x_continuous(breaks = NULL) +
coord_fixed(ratio = 0.0005)
# combine the two plots
grid.arrange(g1, g2, ncol = 2)
First we read in the data from the spreadsheet:
library(readxl)
# read in data from PLOS Biology article supplementary materials
<- read_excel("journal.pbio.1002128.s003.XLS", sheet = 2)
Two_groups_two_conditions # subset data from the 'points jittered' sheet
<- Two_groups_two_conditions[12:41,2:5]
Two_groups_two_conditions # assign group names
$group <- c(rep("Group 1 Name", 15), rep("Group 2 Name", 15))
Two_groups_two_conditionsnames(Two_groups_two_conditions) <- c("Condition 1 Name A", "Condition 2 Name A", "Condition 1 Name B", "Condition 2 Name B")
The data in the Excel sheet are in an unusual structure, so a few steps for reshaping into a tidy form are needed. Here’s how we can tidy them and how they look after being tidied:
# convert to simple long form
1] <- unlist(c(Two_groups_two_conditions[1:15,1], Two_groups_two_conditions[16:30,3]))
Two_groups_two_conditions[,2] <- unlist(c(Two_groups_two_conditions[1:15,2], Two_groups_two_conditions[16:30,4]))
Two_groups_two_conditions[,# drop unneeded columns
<- Two_groups_two_conditions[,c(1:2, 5)]
Two_groups_two_conditions # assign column names
names(Two_groups_two_conditions) <- c("Condition 1", "Condition 2", "Group")
$`Subject ID` <- 1:30
Two_groups_two_conditions# compute differences
$difference <- as.numeric(Two_groups_two_conditions$`Condition 2`) - as.numeric(Two_groups_two_conditions$`Condition 1`)
Two_groups_two_conditions
# convert to long again
library(tidyr)
<- gather(Two_groups_two_conditions, condition, value, c(`Condition 1`, `Condition 2`), convert = TRUE)
Two_groups_two_conditions_long
::datatable(Two_groups_two_conditions_long, rownames=FALSE, options=list(pageLength=15)) DT
Now we can plot:
# plot
library(ggplot2)
<- ggplot(Two_groups_two_conditions_long, aes(condition, as.numeric(value), group = `Subject ID`)) +
g1 geom_point(size = 4, shape = 1) +
geom_line() +
xlab("") +
ylab("Measurement (units)") +
theme_minimal(base_size = 16) +
facet_grid(~Group) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# difference
<- ggplot(Two_groups_two_conditions_long, aes(Group, difference)) +
g2 geom_point(size = 4, shape = 1) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", width = 0.3) +
xlab("") +
ylab("Difference in Measurement (units)") +
theme_minimal(base_size = 16) +
coord_fixed(ratio = 0.15)
# combine the two plots
grid.arrange(g1, g2, ncol = 2)
The dataset consists of a sample of 14 primary school children between 8 and 12 years old. The children were asked to respond on 8 emotions and coping strategies scales for each of 6 situations: 1. fail to fulfill assignments in class, 2. not allowed to play with other children, 3. forbidden to do something by the teacher, 4. victim of bullying, 5. too much school work, 6. forbidden to do something by the mother.
Task: Plot the data in some meaningful ways. You may have to manipulate data into a different format first.
Column 1: Unpleasant (Annoy)
Column 2: Sad
Column 3: Afraid
Column 4: Angry
Column 5: Approach coping
Column 6: Avoidant coping
Column 7: Social support seeking
Column 8: Emotional reaction, especially aggression
Column 9: Situation ID
Column 10: Children ID
<- read.table("coping.txt", header=TRUE)
dat
names(dat)<-c("Annoy", "Sad", "Afraid", "Angry",
"Approach", "Avoidance", "Socialsupport", "Aggression",
"Situation", "ChildrenID")
# wide to long
<- dat|>
dat_l ::melt(id = c("ChildrenID", "Situation"),
reshapevariable_name = "Emotion")
ggplot(dat_l, aes(Emotion, value)) +
stat_summary(fun.data = mean_se,
position = position_dodge(0.3))+
facet_grid(.~Situation)+
labs(x="Situation",y="Score")+
coord_flip()+
theme_minimal()
library(ggridges)
#bw <- unlist(lapply(split(dat_l, dat_l$Situation), function(x) bw.SJ(x$value)))
ggplot(dat_l, aes(x=value)) +
geom_density() +
geom_density_ridges(aes(y=Situation), alpha=.5) +
scale_y_discrete(labels=c("Fail" = "Fail to fulfill assignments in class",
"NoPart" = "Not allowed to play with other children",
"TeacNo" = "Forbidden to do something by the teacher",
"Bully" = "Victim of bullying",
"Work" = "Too much school work",
"MomNo" = "Forbidden to do something by the mother"))+
facet_wrap(.~Emotion, ncol = 8) +
theme_ridges()+
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 10),
strip.text = element_text(size = 8))
qplot(value, data=dat_l, geom="density", facets=Situation ~ Emotion)
Source: Roeder, I., Boekaerts, M., & Kroonenberg, P. M. (2002). The stress and coping questionnaire for children (School version and Asthma version): Construction, factor structure, and psychometric properties. Psychological Reports, 91, 29-36.
Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories; food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960.
Task: Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page. You might want to transform the dollar amounts to log base 10 unit first.
<- datasets::USPersonalExpenditure
dat #"matrix" "array"
<-reshape::melt(dat)
dat_time
names(dat_time)<-c("Category", "Year", "Expenditure")
<- dat_time |>
dat_time mutate(logExp = log10(Expenditure),
Group = ifelse(log10(Expenditure)>0, "Up", "Down"))
ggplot(dat_time, aes(x=Year, y=logExp)) +
# aes(x=factor(Category), y=logExp, group=1)
## if x category is discrete variable to plot line, need to use factor(x),
## group=1: every point to connect with single line group
geom_point()+
geom_line()+
geom_hline(yintercept = 0, colour = "grey50") +
facet_wrap(.~Category, ncol=5)+
labs(x="Year",
y="log Expenditure (original in billion)")+
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplot(dat_time, aes(y=Category, x=logExp)) +
geom_point()+
geom_vline(xintercept=0)+
facet_wrap(.~Year, nrow=1)+
geom_segment(aes(xend=0, yend=Category))+
labs(y="Category", x="log Expenditure (original in billion)")+
theme(axis.text.x=element_text(angle=45, hjust=1))
Tetrahydrocortisone: urinary excretion rate (mg/24hr) of Tetrahydrocortisone.
Pregnanetriol: urinary excretion rate (mg/24hr) of Pregnanetriol.
Type: underlying type of syndrome, coded a (adenoma) , b (bilateral hyperplasia), c (carcinoma) or u for unknown.
Task: Use the Cushings{MASS} data set to generate a plot similar to the following one:
::p_load(ggrepel, ggthemes)
pacman
<- MASS::Cushings
dat
<- dat %>%
dat mutate(Type = factor(Type,
levels = c("u", "b", "c", "a"),
labels = c("Unknown",
"Bilateral Hyperplasia",
"Carcinoma",
"Adenoma")))
# for text label preparation
$Label[c(1, 13, 21, 27)] <- c("Adenoma", "Bilateral Hyperplasia", "Carcinoma", "Unknown") dat
ggplot(dat, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type))+
geom_point(pch = 21, size = rel(2))+
# add point label (annotation is an alternative)
geom_text_repel(aes(label = Label, color = Type))+
# setting the breaks in x and y axis
scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 10, 12))+
scale_x_continuous(limits = c(0, 60), breaks = c(0, 10, 20, 30, 40, 50, 60))+
# theme
theme_hc()+
# plot and axis title
labs(x = "Tetrahydrocortisone (mg/24 hours)",
y = "Pregnanetriol (mg/24 hours)",
title = "Cushings's syndrome")+
# title 靠右邊
theme(plot.title = element_text(hjust = 1),
legend.position="")