In-Class Exercise 1: Multiple Plot Methods
Plot a scatter plot of the women data using lattice.
Plot a scatter plot of the women data using ggplot.
In-Class Exercise 2: Grade 8 Pupils in Elementary Schools in the Netherlands.
Load data file
## school pupil IQV size lang arith
## 1 1 17001 15.0 29 46 24
## 2 1 17002 14.5 29 45 19
## 3 1 17003 9.5 29 33 24
## 4 1 17004 11.0 29 46 26
## 5 1 17005 8.0 29 20 9
## 6 1 17006 9.5 29 30 13
Categorize the class size and verbal IQ
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Bin class size and verbal IQ at their 1/3 and 2/3 quantiles, then cross the
# two bins into one nine-level factor (size first, then IQ).
dta_n <- dta %>%
  mutate(
    Group_size = cut(
      size,
      breaks = quantile(size, c(0, 1/3, 2/3, 1)),
      labels = c("Small", "Medium", "Large"),
      include.lowest = TRUE
    ),
    Group_IQV = cut(
      IQV,
      breaks = quantile(IQV, c(0, 1/3, 2/3, 1)),
      labels = c("Low", "Middle", "High"),
      include.lowest = TRUE
    ),
    # Later arguments of mutate() can see the columns created just above.
    group = factor(
      paste(as.character(Group_size), as.character(Group_IQV), sep = ", "),
      levels = c(
        "Small, Low", "Small, Middle", "Small, High",
        "Medium, Low", "Medium, Middle", "Medium, High",
        "Large, Low", "Large, Middle", "Large, High"
      )
    )
  )
Plot the scatter plot between Language and Arithmetic scores
# Arithmetic score against language score, one panel per `group` level,
# each panel with its own linear-model fit.
ggplot(dta_n, mapping = aes(x = lang, y = arith)) +
  geom_point(shape = 23, fill = "black") +
  geom_smooth(method = "lm", formula = y ~ x, lwd = .5) +
  facet_wrap(. ~ group) +
  labs(x = "Language score", y = "Arithmetic score")
In-Class Exercise 3: Grade 8 Pupils in Elementary Schools in the Netherlands.
Load data file
## 1940 1945 1950 1955 1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
Reshape the data from wide to long format
Compute the log10 Expenditure and its excess over the mean log10 Expenditure
## ─ Attaching packages ────────────────────────── tidyverse 1.3.0 ─
## ✓ tibble 2.1.3 ✓ purrr 0.3.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ─ Conflicts ─────────────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Reshape the table to long form (columns 1:5 become Year / Expenditure),
# put Expenditure on the log10 scale, and store its difference from the mean
# of all log10 values as Excess.
dta_n <- as.data.frame(dta) %>%
  mutate(Categories = row.names(dta)) %>%
  gather(key = Year, value = Expenditure, 1:5) %>%
  mutate(
    Expenditure = log10(Expenditure),
    # Uses the log10 values assigned on the line above.
    Excess = Expenditure - mean(Expenditure)
  )
head(dta_n)
## Categories Year Expenditure Excess
## 1 Food and Tobacco 1940 1.34635297 0.4279902
## 2 Household Operation 1940 1.02118930 0.1028266
## 3 Medical and Health 1940 0.54777471 -0.3705880
## 4 Personal Care 1940 0.01703334 -0.9013294
## 5 Private Education 1940 -0.46724562 -1.3856084
## 6 Food and Tobacco 1945 1.64836001 0.7299973
Plot the data
# Dot plot of Excess by category, one panel per year in a single row.
# Built with ggplot() + geom_point() rather than qplot(): qplot() is
# deprecated in current ggplot2, and its `facets` argument was redundant
# because facet_wrap() below sets the faceting anyway.
ggplot(dta_n, aes(x = Excess, y = Categories)) +
  geom_point() +
  # Horizontal stem from each point back to zero.
  geom_segment(aes(xend = 0, yend = Categories)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ Year, nrow = 1) +
  scale_x_continuous(limits = c(-1.5, 1.1),
                     breaks = seq(-1.5, 1.1, 0.5)) +
  labs(x = "excess (log10(billion))")
Relabel the Group by sicdegp
Compute age difference
## age vsae sicdegp childid Group Age_d
## 1 2 6 3 1 H -3.7704918
## 2 3 7 3 1 H -2.7704918
## 3 5 18 3 1 H -0.7704918
## 4 9 25 3 1 H 3.2295082
## 5 13 27 3 1 H 7.2295082
## 6 2 17 3 3 H -3.7704918
Plot the scatter plot between Age and VSAE score
# VSAE score against Age_d, one panel per Group: raw points, a per-panel
# linear fit, and a faint line joining the observations of each childid.
ggplot(dta, aes(x = Age_d, y = vsae)) +
  facet_grid(. ~ Group) +
  scale_x_continuous(limits = c(-4, 7.5),
                     breaks = c(-2.5, 0.0, 2.5, 5.0)) +
  geom_point(alpha = 0.45) +
  # Pass a formula object rather than the string "y ~ x": this is the
  # documented argument type and matches the earlier geom_smooth() call.
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_line(aes(group = childid), alpha = 0.3) +
  labs(x = "Age (in years, centered)", y = "VSAE score") +
  theme_bw()
Create age-2 column
Plot the scatter plot between Age and VSAE score
# Mean VSAE with +/- one standard error at each (age - 2) value, drawn
# separately for every Group with dodged points, lines and error bars.
dta %>%
  mutate(Age_2 = age - 2) %>%
  group_by(Group, Age_2) %>%
  summarize(
    vsae_mean = mean(vsae),
    vsae_se = sd(vsae) / sqrt(n())
  ) %>%
  ggplot(aes(x = Age_2, y = vsae_mean, group = Group, shape = Group)) +
  geom_point(
    size = rel(2),
    position = position_dodge(width = .3),
    show.legend = TRUE
  ) +
  scale_shape_manual(values = c(1, 2, 16)) +
  geom_line(
    aes(linetype = Group),
    position = position_dodge(width = .3),
    show.legend = TRUE
  ) +
  geom_errorbar(
    aes(ymin = vsae_mean - vsae_se, ymax = vsae_mean + vsae_se),
    position = position_dodge(width = .3),
    width = .2,
    size = .3
  ) +
  labs(x = "Age (in year - 2)", y = "VSAE score") +
  theme_bw() +
  # Legend styling adapted from Jay Liao.
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(size = 0.75),
    axis.text = element_text(size = 12),
    legend.position = c(.1, .85),
    legend.key = element_rect(color = "black"),
    legend.key.size = unit(.69, "cm"),
    legend.title = element_text(size = 14),
    legend.box.background = element_rect(color = "black")
  )
In-Class Exercise 5: Diabetes in overall population in US 2009-2010
Load data file
## SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
## 1 51624 1 3 2 32.22 Males White No Overweight
## 2 51626 1 4 2 22.00 Males Black No Normal weight
## 3 51627 1 4 2 18.22 Males Black No Normal weight
## 4 51628 2 4 1 42.39 Females Black Yes Overweight
## 5 51629 1 1 2 32.61 Males Hispanic No Overweight
## 6 51630 2 3 2 30.57 Females White No Overweight
Relevel the variables
## Warning: package 'ggalluvial' was built under R version 3.6.2
# Cross-tabulate race, gender, diabetes and BMI into a data frame of counts
# (one row per combination, count in Freq), then set the level order of the
# three factors used as plot axes.
dta_n <- data.frame(xtabs(~ race + gender + diabetes + BMI, data = dta))
dta_n <- transform(
  dta_n,
  race = factor(race, levels = c("Hispanic", "White", "Black")),
  gender = factor(gender, levels = c("Males", "Females")),
  diabetes = factor(diabetes, levels = c("Yes", "No"))
)
head(dta_n)
## race gender diabetes BMI Freq
## 1 Black Females No Normal weight 347
## 2 Hispanic Females No Normal weight 712
## 3 White Females No Normal weight 998
## 4 Black Males No Normal weight 429
## 5 Hispanic Males No Normal weight 706
## 6 White Males No Normal weight 873
Plot the data
# Alluvial diagram of the Freq counts across three axes (race, gender,
# diabetes), with the flows filled by BMI class.
ggplot(dta_n,
       aes(axis1 = race, axis2 = gender, axis3 = diabetes, y = Freq)) +
  scale_x_discrete(limits = c("race", "gender", "diabetes"),
                   expand = c(.1, .05)) +
  labs(y = "No. individuals") +
  # Subtitle spelling corrected ("stratified").
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus") +
  geom_alluvium(aes(fill = BMI)) +
  geom_stratum() +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_fill_manual(values = c("gray40", "tan1")) +
  theme_minimal() +
  theme(legend.position = "bottom")
In-Class Exercise 6: gg_gapminder
Load ggplot2 package and use help function to see more details
Install and load gapminder package
Load gapminder data file and Show the Structure of the data
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
Assign the gapminder data to an object named gap
Add a histogram of lifeExp
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Change the histogram color and add a title and axis labels
# Ten-bin histogram of lifeExp: blue bars with black outlines.
ggplot(gap, aes(x = lifeExp)) +
  geom_histogram(bins = 10, color = "black", fill = "blue") +
  labs(
    title = "Life expectancy for the gap dataset",
    x = "Life expectancy (years)",
    y = "Frequency"
  ) +
  theme_classic()
Plot a boxplot between continent and lifeExp
# Boxplots of lifeExp for each continent, with boxes filled by continent.
ggplot(gap, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot() +
  labs(
    title = "Boxplots for lifeExp by continent",
    x = "Continent",
    y = "Life expectancy (years)"
  ) +
  theme_minimal() # +
Q: What happens if you un-hashtage guides(fill = FALSE) and the plus sign in lines 68 and 69 above? A: Legend will not display while exceuting guides(fill = FALSE)
Plot the scatter plot of lifeExp by GDPPerCap
# Scatter plot of gdpPercap against lifeExp; continent sets both the color
# and the shape of the points. The theme() call is added after
# theme_classic() so that its settings are not reset by the complete theme.
ggplot(
  gap,
  aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)
) +
  geom_point(size = 5, alpha = 0.5) +
  theme_classic() +
  labs(
    title = "Scatterplot of life expectancy by gdpPercap",
    x = "Life expectancy (years)",
    y = "gdpPercap (USD)"
  ) +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = 0.5, size = 20),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Q: In lines the ggplot code above, what are the arguments inside of our second “theme” argument doing?
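A: They move the legend to the top, center and enlarge the plot title, set the legend title and legend text sizes, and rotate the x-axis tick labels by 45 degrees.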