Grammar of graphics: in_class exercises 2-5
in_class exercise2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Bin `size` and `IQV` into three ordered groups each, using the 0.33 and 0.67
# sample quantiles as cut points. include.lowest = TRUE keeps the minimum
# value inside the first bin.
dta1 <- dta %>%
  mutate(
    class_size = cut(size,
                     breaks = quantile(size, probs = c(0, .33, .67, 1)),
                     labels = c("Small", "Medium", "Large"),
                     include.lowest = TRUE,
                     ordered_result = TRUE),
    IQV_f = cut(IQV,
                breaks = quantile(IQV, probs = c(0, .33, .67, 1)),
                labels = c("Low", "Middle", "High"),
                include.lowest = TRUE,
                ordered_result = TRUE)
  )
# Cross-tabulate the two new factors. The earlier call, table(dta1$groups),
# printed "< table of extent 0 >" because dta1 has no `groups` column.
table(dta1$class_size, dta1$IQV_f)
library(ggplot2)
# Scatter plot of arith against lang with a least-squares line (no confidence
# band), drawn in one panel per class_size x IQV_f combination.
p0 <- ggplot(data = dta1, mapping = aes(x = lang, y = arith)) +
  geom_point(shape = 23, fill = "black") +
  stat_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  labs(x = "Language score", y = "Arithmetic score") +
  # .multi_line = FALSE places both facet labels on a single strip line.
  facet_wrap(. ~ class_size + IQV_f,
             labeller = labeller(.multi_line = FALSE))
p0
in_class exercise3
## 1940 1945 1950 1955 1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
library(reshape2)
# Reshape to long format and name the three resulting columns.
# NOTE(review): assumes `dta` is the category-by-year table printed above.
dta1 <- melt(dta)
names(dta1) <- c("category", "year", "expenditure")
# Work with expenditure on the log10 scale.
dta1$expenditure <- log10(dta1$expenditure)
library(dplyr)
# excess = deviation of each log10 expenditure from the overall mean.
dta1 <- dta1 %>%
  mutate(excess = expenditure - mean(expenditure))
library(ggplot2)
# qplot() is deprecated in current ggplot2; ggplot() + geom_point() draws the
# same point layer. Segments connect each point to the zero reference line,
# with one panel per year laid out in a single row.
ggplot(dta1, aes(x = excess, y = category)) +
  geom_point() +
  geom_segment(aes(xend = 0, yend = category)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ year, nrow = 1)
in_class exercise4
library(dplyr)
# Drop rows with missing values, then derive a three-level group from sicdegp
# and a mean-centred age.
dta <- na.omit(dta)
dta <- dta %>%
  mutate(
    # Intervals (0,1], (1,2], (2,3] are labelled "L", "M", "H".
    group = cut(sicdegp, breaks = c(0, 1, 2, 3), labels = c("L", "M", "H")),
    age_centered = age - mean(age)
  )
library(ggplot2)
# p1: vsae against centred age, one panel per group, showing the raw points,
# a path per childid, and a linear fit for each panel.
p1 <- ggplot(dta, aes(age_centered, vsae)) +
  facet_wrap(. ~ group) +
  geom_point(alpha = 0.40) +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_path(aes(group = childid), alpha = 0.4) +
  theme_bw() +
  scale_x_continuous(limits = c(-4, 7.5),
                     breaks = c(-2.5, 0.0, 2.5, 5.0)) +
  labs(x = "Age (in years, centered)",
       y = "VSAE score")
# Small horizontal offset so the three groups do not overplot each other.
pd <- position_dodge(.3)
# p2: mean vsae with +/- one standard error bars for each group at each
# value of age - 2.
p2 <- dta %>%
  mutate(age_2 = age - 2) %>%
  group_by(group, age_2) %>%
  summarize(vsae_mean = mean(vsae),
            vsae_se = sd(vsae) / sqrt(n())) %>%
  # summarize() leaves the result grouped by `group`; drop that grouping.
  ungroup() %>%
  ggplot() +
  aes(age_2, vsae_mean,
      group = group,
      shape = group) +
  geom_errorbar(aes(ymin = vsae_mean - vsae_se,
                    ymax = vsae_mean + vsae_se),
                width = .2, size = .3,
                position = pd) +
  geom_line(position = pd,
            show.legend = TRUE,
            aes(linetype = group)) +
  geom_point(position = pd,
             size = rel(3),
             show.legend = TRUE) +
  scale_shape_manual(values = c(1, 2, 16)) +
  labs(x = "Age (in year -2)", y = "VSAE score") +
  theme_bw() +
  theme(legend.position = c(.08, .8),
        legend.box.background = element_rect(colour = "black"),
        legend.key = element_rect(color = "black"),
        legend.key.size = unit(1, "cm"))
p1
# p2 was built but never displayed; print it as well.
p2
in_class exercise5
library(ggalluvial)
# Count individuals in every race x gender x diabetes x BMI combination; the
# resulting data frame has one factor column per variable plus `Freq`.
dta_v3 <- as.data.frame(xtabs(~ race + gender + diabetes + BMI, data = dta))
# Reorder the factor levels by position in the current level order.
# NOTE(review): presumably this sets the stacking order of the strata on each
# axis - confirm against the rendered plot.
dta_v3$race <- factor(dta_v3$race,
                      levels(dta_v3$race)[c(2, 3, 1)])
dta_v3$gender <- factor(dta_v3$gender,
                        levels(dta_v3$gender)[c(2, 1)])
dta_v3$diabetes <- factor(dta_v3$diabetes,
                          levels(dta_v3$diabetes)[c(2, 1)])
# Alluvial plot with race, gender and diabetes as axes, flow width given by
# Freq and flow colour by BMI.
ggplot(dta_v3,
       aes(axis1 = race,
           axis2 = gender,
           axis3 = diabetes,
           y = Freq)) +
  scale_x_discrete(limits = c("race", "gender", "diabetes"),
                   expand = c(.1, .05)) +
  labs(x = "",
       y = "No. individuals") +
  geom_alluvium(aes(fill = BMI)) +
  geom_stratum() +
  # `infer.label = TRUE` is deprecated in ggalluvial; label each stratum from
  # the computed `stratum` variable instead.
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_fill_manual(values = c("gray40", "tan1")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus")
We can observe the frequency distributions from the plot. The size of the bar represents the numbers of the population. The gray and brown flows represent a group of observations that match the value for each variable indicated by the flow.
We can see that most of the diabetic population is overweight. The number of people with diabetes is similar across races and genders.