## Goal: draw only the first observation of `women` (weight against height),
## once with each of three graphics systems.

## --- base graphics ---
## Set up the axes from the whole data set without drawing anything
## (type = "n"), then add the single point from row 1.
plot(women, type = "n")
points(women[1, ])

## --- lattice ---
## `subset` keeps only the row whose row name is "1".
lattice::xyplot(
  weight ~ height,
  data = women,
  subset = row.names(women) == "1",
  type = "p"
)

## --- ggplot2 ---
## Hand ggplot() a one-row data frame and draw it as a point.
library(ggplot2)
ggplot(women[1, ], aes(x = height, y = weight)) +
  geom_point()
# Read the school data from a whitespace-delimited text file in the working
# directory; the first line of the file holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
dta_school <- read.table("langMathDutch.txt", header = TRUE)
# examine the structure of the data
str(dta_school)
## 'data.frame': 2287 obs. of 6 variables:
## $ school: int 1 1 1 1 1 1 1 1 1 1 ...
## $ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ IQV : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ size : int 29 29 29 29 29 29 29 29 29 29 ...
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ arith : int 24 19 24 26 9 13 13 30 23 22 ...
# look at the first rows
head(dta_school)
## school pupil IQV size lang arith
## 1 1 17001 15.0 29 46 24
## 2 1 17002 14.5 29 45 19
## 3 1 17003 9.5 29 33 24
## 4 1 17004 11.0 29 46 26
## 5 1 17005 8.0 29 20 9
## 6 1 17006 9.5 29 30 13
# Categorize class size into three groups.
# Break points are the minimum, the 1/3 and 2/3 quantiles, and the maximum.
# They are stored under a descriptive name so that no data object called
# `range` masks base::range().
size_breaks <- quantile(dta_school$size, probs = c(0, 1 / 3, 2 / 3, 1))
size_breaks
## 0% 33.33333% 66.66667% 100%
## 5 20 27 37
# Label the class-size groups. include.lowest = TRUE keeps the minimum value
# inside the first interval instead of turning it into NA.
dta_school$size_group <- cut(dta_school$size,
                             breaks = size_breaks,
                             labels = c("Small", "Medium", "Large"),
                             include.lowest = TRUE)

# Categorize IQ into three groups, using the same tertile approach.
iq_breaks <- quantile(dta_school$IQV, probs = c(0, 1 / 3, 2 / 3, 1))
iq_breaks
## 0% 33.33333% 66.66667% 100%
## 4.0 11.0 12.5 18.0
# Label the IQ groups.
dta_school$iq_group <- cut(dta_school$IQV,
                           breaks = iq_breaks,
                           labels = c("Low", "Middle", "High"),
                           include.lowest = TRUE)

# Combine the two groupings into one factor.
# The levels are listed explicitly so that panels are ordered by class size
# first and IQ second, rather than alphabetically.
dta_school$type <- factor(
  paste(dta_school$size_group, dta_school$iq_group, sep = ", "),
  levels = c("Small, Low", "Small, Middle", "Small, High",
             "Medium, Low", "Medium, Middle", "Medium, High",
             "Large, Low", "Large, Middle", "Large, High")
)
# Arithmetic score against language score, one panel per
# class-size / IQ combination.
library(ggplot2)
ggplot(data = dta_school, aes(x = lang, y = arith)) +
  # filled diamonds (shape 23) with a black interior
  geom_point(shape = 23, fill = "black") +
  # straight-line fit of y on x within each panel
  stat_smooth(method = "lm", formula = y ~ x) +
  # one panel per level of `type`
  facet_wrap(. ~ type) +
  # axis titles
  xlab("Language score") +
  ylab("Arithmetic score")
# Load the built-in US personal expenditure matrix
# (rows: expenditure category, columns: year).
dta <- datasets::USPersonalExpenditure
# examine data
head(dta)
## 1940 1945 1950 1955 1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
# transform to long form: one row per (category, year) cell of the matrix
library(reshape2)
dta <- melt(dta)
colnames(dta) <- c("category", "year", "expenditure")
# Put expenditure on the log10 scale, then compute the excess as the
# deviation from the overall mean of the log10 values.
library(dplyr)
dta <- dta %>%
  mutate(expenditure = log10(expenditure),
         excess = expenditure - mean(expenditure))
# Plot: one point per category with a segment back to zero, one panel per
# year. ggplot() + geom_point() replaces the deprecated qplot() shortcut and
# draws the same points with the same default axis titles.
ggplot(dta, aes(x = excess, y = category)) +
  geom_point() +
  # connect each point to the zero line
  geom_segment(aes(xend = 0, yend = category)) +
  # reference line at zero excess
  geom_vline(xintercept = 0, colour = "grey50") +
  # all years side by side in a single row
  facet_wrap(~ year, nrow = 1) +
  scale_x_continuous(limits = c(-1.5, 1.1),
                     breaks = seq(-1.5, 1.1, 0.5)) +
  labs(x = "excess (log10(billion))")
# Load the autism data set from the WWGbook package.
dta <- WWGbook::autism
# Peek at the first rows.
head(dta)
## age vsae sicdegp childid
## 1 2 6 3 1
## 2 3 7 3 1
## 3 5 18 3 1
## 4 9 25 3 1
## 5 13 27 3 1
## 6 2 17 3 3
# Is there any missing value anywhere in the data?
any(is.na(dta))
## [1] TRUE
# Keep only the rows without missing values.
dta <- na.omit(dta)
# Turn the sicdegp codes 1 / 2 / 3 into a factor labelled L / M / H
# (intervals (0,1], (1,2], (2,3]).
dta$group <- cut(
  dta$sicdegp,
  breaks = c(0, 1, 2, 3),
  labels = c("L", "M", "H")
)
# Center age on its overall mean.
dta <- mutate(dta, age_centered = age - mean(age))
# VSAE score against centered age, one panel per sicdegp group.
ggplot(data = dta, mapping = aes(x = age_centered, y = vsae)) +
  # plot different groups in different panels
  facet_grid(. ~ group) +
  # specify x-axis limits and break points
  scale_x_continuous(limits = c(-4, 7.5),
                     breaks = c(-2.5, 0.0, 2.5, 5.0)) +
  # make points transparent with `alpha`
  geom_point(alpha = 0.45) +
  # fit a regression line per panel; the formula is given as a formula
  # object (not a character string), matching the other smoothers in
  # this script
  geom_smooth(method = "lm",
              formula = y ~ x) +
  # connect the observations of each child
  geom_line(aes(group = childid), alpha = 0.3) +
  # specify theme
  theme_bw() +
  # specify axis labels
  labs(x = "Age (in years, centered)",
       y = "VSAE score")
# Horizontal dodge shared by the error bars, lines and points so that the
# groups do not sit on top of each other at the same age.
pd <- position_dodge(.3)
dta %>%
  # create the age - 2 column
  mutate(age_2 = age - 2) %>%
  # one cell per group-by-age combination
  group_by(group, age_2) %>%
  # mean and standard error of vsae within each cell
  summarize(vsae_mean = mean(vsae),
            vsae_se = sd(vsae) / sqrt(n())) %>%
  # drop the grouping left over from summarize(); the plot does not use it
  ungroup() %>%
  # plot
  ggplot() +
  # vsae_mean ~ age_2, with lines and point shapes split by group
  aes(age_2, vsae_mean,
      group = group,
      shape = group) +
  # error bars: mean +/- one standard error
  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
  # line geoms; `size` is kept for compatibility with older versions.
  geom_errorbar(aes(ymin = vsae_mean - vsae_se,
                    ymax = vsae_mean + vsae_se),
                width = .2, size = .3,
                position = pd) +
  # lines, with line type by group
  geom_line(aes(linetype = group),
            position = pd,
            show.legend = TRUE) +
  # points; a plain number is used for the size because rel() is a helper
  # for theme elements, not for geom sizes
  geom_point(position = pd,
             size = 3,
             show.legend = TRUE) +
  # point shapes for the three groups
  scale_shape_manual(values = c(1, 2, 16)) +
  # axis labels
  labs(x = "Age (in year -2)", y = "VSAE score") +
  # theme
  theme_bw() +
  # place the legend inside the panel and draw a box around it
  theme(legend.position = c(.08, .8),
        legend.box.background = element_rect(colour = "black"),
        legend.key = element_rect(color = "black"),
        legend.key.size = unit(1, "cm"))
# Read the comma-separated diabetes data; the first line holds the column
# names. `TRUE` is spelled out because `T` can be reassigned.
dta <- read.table("diabetes_mell.csv", sep = ",", header = TRUE)
# examine data
head(dta)
## SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
## 1 51624 1 3 2 32.22 Males White No Overweight
## 2 51626 1 4 2 22.00 Males Black No Normal weight
## 3 51627 1 4 2 18.22 Males Black No Normal weight
## 4 51628 2 4 1 42.39 Females Black Yes Overweight
## 5 51629 1 1 2 32.61 Males Hispanic No Overweight
## 6 51630 2 3 2 30.57 Females White No Overweight
# select variables for visualization
dta <- dta %>% select(race, gender, diabetes, BMI)
# count individuals for every race x gender x diabetes x BMI combination;
# the counts end up in the `Freq` column
dta <- data.frame(xtabs(data = dta, ~ race + gender + diabetes + BMI))
# relevel factors in a particular order (this controls the stacking order
# of the strata on each axis)
dta$race <- factor(dta$race, levels = c("Hispanic", "White", "Black"))
dta$gender <- factor(dta$gender, levels = c("Males", "Females"))
dta$diabetes <- factor(dta$diabetes, levels = c("Yes", "No"))
library(ggalluvial)
# alluvial plot: three axes (race, gender, diabetes), flows coloured by BMI
ggplot(dta,
       aes(axis1 = race,
           axis2 = gender,
           axis3 = diabetes,
           y = Freq)) +
  # name the three axes on the x scale
  scale_x_discrete(limits = c("race",
                              "gender",
                              "diabetes"),
                   expand = c(.1, .05)) +
  labs(x = '',
       y = 'No. individuals') +
  # flows between the axes, filled by BMI category
  geom_alluvium(aes(fill = BMI)) +
  # boxes for the categories on each axis
  geom_stratum() +
  # print the category name inside each box
  geom_text(stat = "stratum",
            infer.label = TRUE) +
  scale_fill_manual(values = c('gray40', 'tan1')) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus")
## load ggplot2 package
library(ggplot2)
## open ggplot2 documentation
?ggplot2
## install package "gapminder" only when it is not available yet, so that
## re-running the script does not download and reinstall it every time
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
## Installing package into '/home/zhe/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## load package
library(gapminder)
## load data object into global environment
data(gapminder)
## examine data structure of gapminder
str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
## store gapminder in a new data object named gap
gap <- gapminder
## Plot template: data and the x aesthetic are set, but no geometric object
## is added yet.
ggplot(gap, aes(x = lifeExp))
## Default histogram of the variable lifeExp.
ggplot(gap, aes(x = lifeExp)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Customised histogram of the variable lifeExp.
ggplot(gap, aes(x = lifeExp)) +
  # 10 bins, blue bars with a black border
  geom_histogram(bins = 10, fill = "blue", color = "black") +
  # plot title and axis labels
  labs(title = "Life expectancy for the gap dataset",
       x = "Life expectancy (years)",
       y = "Frequency") +
  # classic theme
  theme_classic()
## boxplot of lifeExp by continent
# use a different fill color for each continent
ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot() +
  # title
  ggtitle("Boxplots for lifeExp by continent") +
  # label of x-axis
  xlab("Continent") +
  # label of y-axis
  ylab("Life expectancy (years)") +
  # minimal theme
  theme_minimal() +
  # hide the fill legend (it only repeats the x-axis labels);
  # "none" is used because passing FALSE to guides() is deprecated
  guides(fill = "none")
# Question: what happens if you un-comment the `guides(fill = ...)` call and
# the plus sign at the end of the line before it (originally referred to as
# lines 68 and 69) in the boxplot code above?
# Answer: the legend will disappear.
## Scatter plot with lifeExp on the x-axis and gdpPercap on the y-axis.
# Continent is encoded twice: by point colour and by point shape.
ggplot(
  gap,
  aes(x = lifeExp, y = gdpPercap, colour = continent, shape = continent)
) +
  # large, half-transparent points
  geom_point(size = 5, alpha = 0.5) +
  # classic theme as the base; the tweaks in theme() below are applied on top
  theme_classic() +
  # plot title and axis labels
  labs(title = "Scatterplot of life expectancy by gdpPercap",
       x = "Life expectancy (years)",
       y = "gdpPercap (USD)") +
  theme(
    # legend above the plot
    legend.position = "top",
    # centered title in font size 20
    plot.title = element_text(hjust = 0.5, size = 20),
    # legend title in font size 10
    legend.title = element_text(size = 10),
    # legend entries in font size 5
    legend.text = element_text(size = 5),
    # x-axis tick labels rotated 45 degrees counter-clockwise and
    # right-aligned (hjust = 1)
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
## The End