Grammar of graphics: exercises 1, 3, 5
exercise 1
Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.
base graphics
# Open an empty plotting region for the women data set: type "n" sets up the
# axes and labels but draws no points.
plot(women, type = "n")
# Add only the first row of women to the empty region as a single point.
points(head(women, n = 1L))
# trellis graphics
# Trellis (lattice) version of the same figure: plot weight (y-axis) against
# height (x-axis) as points, keeping only the first row of women via `subset`.
# Row names are character strings, so compare against "1" explicitly instead
# of relying on implicit coercion of the number 1.
lattice::xyplot(
  weight ~ height,
  data = women,
  subset = row.names(women) == "1",
  type = "p"
)
# grammar of graphics: ggplot
# ggplot() is called below, so ggplot2 must be attached at this point; the
# recorded warning that follows is the output of loading the package.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
# Grammar-of-graphics version of the same figure: use only the first row of
# the women data set, map height to the x-axis and weight to the y-axis, and
# draw the observation as a point.
ggplot(data = women[1, ], aes(x = height, y = weight)) +
  geom_point()
# exercise 3
Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories: food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page. You might want to transform the dollar amounts to log base 10 units first.
…
…
loading data and check data structure
## num [1:5, 1:5] 22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:5] "Food and Tobacco" "Household Operation" "Medical and Health" "Personal Care" ...
## ..$ : chr [1:5] "1940" "1945" "1950" "1955" ...
## 1940 1945 1950 1955 1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
data manipulation
library(dplyr)
# Reshape the category-by-year matrix `dta` into long format, one row per
# (category, year) cell.
# NOTE(review): `dta` is not assigned in the visible code before this point;
# presumably it holds USPersonalExpenditure, as the printed structure above
# suggests - confirm.
dta.melt <- reshape::melt(dta)
colnames(dta.melt) <- c("Category", "Year", "Expenditure")
# Add the log10-transformed expenditure and a flag telling whether that log
# value is above zero ("up") or not ("down").
df <- dta.melt %>%
  as.data.frame() %>%
  mutate(
    logexpend = log10(Expenditure),
    group = ifelse(logexpend > 0, "up", "down")
  )
# ggplot plot 1
library(ggplot2)
# Plot 1: log10 expenditure per category, one panel per year stacked in a
# single column.
# Category is discrete, so it is wrapped in factor() for the x-axis, and
# group = 1 puts all points of a panel into one group so that geom_line()
# connects them with a single line.
p <- ggplot(df, aes(x = factor(Category), y = logexpend, group = 1)) +
  geom_point() +
  geom_line() +
  facet_wrap(. ~ Year, ncol = 1) +
  labs(x = "Category", y = "log Expenditure (original in billion)")
p
# plot 2
# Plot 2: categories on the y-axis and log10 expenditure on the x-axis, one
# panel per year laid out in a single row.
# Each point is joined to the vertical reference line at x = 0 by a
# horizontal segment (xend = 0, yend = Category).
p1 <- ggplot(df, aes(y = Category, x = logexpend)) +
  geom_point() +
  geom_vline(xintercept = 0) +
  facet_wrap(. ~ Year, nrow = 1) +
  geom_segment(aes(xend = 0, yend = Category)) +
  labs(y = "Category", x = "log Expenditure (original in billion)") +
  # Tilt the x-axis tick labels so they do not overlap in the narrow panels.
  theme(axis.text.x = element_text(angle = 50, hjust = 1))
p1
# exercise 5
…
loading data and check data set
# Read the diabetes data set from a comma-separated file with a header row.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# before running elsewhere.
# stringsAsFactors = TRUE is set explicitly because the recorded structure of
# this data set shows gender, race, diabetes and BMI as factors, which
# read.csv() no longer produces by default from R 4.0.0 onwards.
dta <- read.csv(
  "C:/Users/USER/Desktop/R_data management/0420/diabetes_mell.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Show the first rows; the recorded console output follows.
head(dta)
## SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
## 1 51624 1 3 2 32.22 Males White No Overweight
## 2 51626 1 4 2 22.00 Males Black No Normal weight
## 3 51627 1 4 2 18.22 Males Black No Normal weight
## 4 51628 2 4 1 42.39 Females Black Yes Overweight
## 5 51629 1 1 2 32.61 Males Hispanic No Overweight
## 6 51630 2 3 2 30.57 Females White No Overweight
## 'data.frame': 8706 obs. of 9 variables:
## $ SEQN : int 51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
## $ RIAGENDR: int 1 1 1 2 1 2 1 1 1 1 ...
## $ RIDRETH1: int 3 4 4 4 1 3 2 3 1 3 ...
## $ DIQ010 : int 2 2 2 1 2 2 2 2 2 1 ...
## $ BMXBMI : num 32.2 22 18.2 42.4 32.6 ...
## $ gender : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
## $ race : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
## $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
## $ BMI : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...
data manipulation
# Cross-tabulate race, gender, diabetes status and BMI class, then flatten the
# contingency table into a data frame with one row per combination and the
# cell count in the Freq column.
dta.v1 <- as.data.frame(
  xtabs(~ race + gender + diabetes + BMI, data = dta)
)
# Inspect the structure of the result; the recorded console output follows.
str(dta.v1)
## 'data.frame': 24 obs. of 5 variables:
## $ race : Factor w/ 3 levels "Black","Hispanic",..: 1 2 3 1 2 3 1 2 3 1 ...
## $ gender : Factor w/ 2 levels "Females","Males": 1 1 1 2 2 2 1 1 1 2 ...
## $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 2 ...
## $ BMI : Factor w/ 2 levels "Normal weight",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : int 347 712 998 429 706 873 6 11 12 15 ...
plot
library(ggalluvial)
# Alluvial diagram of the counts in dta.v1: three axes (race, gender,
# diabetes), flow height given by Freq, flows coloured by BMI class.
#
# reverse = FALSE is passed to every ggalluvial layer so that the flows, the
# strata boxes and the strata labels all use the same, non-reversed ordering.
#
# Only one x scale is kept. The original added scale_x_discrete() and then
# scale_x_continuous(); ggplot2 keeps only the last scale added for an
# aesthetic, so the discrete scale never took effect.
#
# The fill legend is switched off with guides(fill = "none"), the
# non-deprecated spelling of guides(fill = FALSE). Since no legend is drawn,
# the former theme(legend.position = "bottom") had no effect and is dropped.
p <- ggplot(
  dta.v1,
  aes(axis1 = race, axis2 = gender, axis3 = diabetes, y = Freq)
) +
  geom_alluvium(aes(fill = BMI), reverse = FALSE) +
  geom_stratum(reverse = FALSE) +
  # NOTE(review): newer ggalluvial releases deprecate infer.label in favour of
  # aes(label = after_stat(stratum)); kept here to match the installed
  # version - confirm before upgrading.
  geom_text(stat = "stratum", infer.label = TRUE, reverse = FALSE) +
  scale_x_continuous(breaks = 1:3, labels = c("race", "gender", "diabetes")) +
  scale_fill_manual(values = c("gray48", "tan1")) +
  guides(fill = "none") +
  labs(x = " ", y = "No.individuals") +
  theme_minimal() +
  # title and subtitle
  ggtitle(
    "Diabetes in overall population in US 2009-2010",
    subtitle = "stratified by race, gender and diabetes mellitus"
  )
p