Grammar of graphics: exercises 1, 3, 5

exercise 1

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

Base graphics

# Set up axes scaled to the whole women dataset; type = "n" draws no points
plot(women, type = "n")
# Add only the first observation (row 1 of women) as a single point
points(women[1, ])

trellis graphics

# Trellis (lattice) version: scatter plot of weight (y-axis) against
# height (x-axis), restricted via `subset` to the row whose name is 1
lattice::xyplot(
  weight ~ height,
  data = women,
  subset = row.names(women) == 1,
  type = "p"
)

grammar of graphics: ggplot

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.6.3

# ggplot2 version: use only the first row of the women dataset as data,
# map height to the x-axis and weight to the y-axis,
# and draw that single observation as a point
ggplot(data = women[1, ], mapping = aes(x = height, y = weight)) +
  geom_point()

exercise 3

Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories: food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page. You might want to transform the dollar amounts to log base 10 unit first.

…

loading data and check data structure

library(datasets)
# Copy the built-in expenditure matrix into dta and inspect its structure
dta <- datasets::USPersonalExpenditure
str(dta)

##  num [1:5, 1:5] 22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "Food and Tobacco" "Household Operation" "Medical and Health" "Personal Care" ...
##   ..$ : chr [1:5] "1940" "1945" "1950" "1955" ...

# Print the leading rows of the matrix (rows = categories, columns = years)
head(dta)

##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

data manipulation

library(dplyr)
# Reshape the category-by-year matrix to long format: one row per
# category/year combination with its expenditure value
dta.melt <- reshape::melt(dta)
names(dta.melt) <- c("Category", "Year", "Expenditure")

# Add the log10-transformed expenditure and a flag telling whether that
# log value is positive ("up") or not ("down")
df <- as.data.frame(dta.melt) %>%
  mutate(
    logexpend = log10(Expenditure),
    group = ifelse(logexpend > 0, "up", "down")
  )

ggplot plot 1

library(ggplot2)
# Plot 1: one panel per year, stacked vertically (ncol = 1); within each
# panel the log10 expenditures of the categories are joined by a line.
# Category is a discrete variable, so it is wrapped in factor() and
# group = 1 puts all points of a panel into a single group that
# geom_line() can connect.
p <- ggplot(df, aes(x = factor(Category), y = logexpend, group = 1)) +
  geom_point() +
  geom_line() +
  facet_wrap(. ~ Year, ncol = 1) +
  labs(x = "Category", y = "log Expenditure (original in billion)")
p

plot 2

# Plot 2: horizontal dot plot with one panel per year laid out in a single
# row (nrow = 1). Each category's point is tied to the reference line at
# log10 expenditure = 0 by a horizontal segment.
p1 <- ggplot(df, aes(x = logexpend, y = Category)) +
  geom_point() +
  # reference line at log10 expenditure = 0
  geom_vline(xintercept = 0) +
  facet_wrap(. ~ Year, nrow = 1) +
  # segment from each point back to the reference line
  geom_segment(aes(xend = 0, yend = Category)) +
  labs(x = "log Expenditure (original in billion)", y = "Category") +
  # tilt the x-axis tick labels
  theme(axis.text.x = element_text(angle = 50, hjust = 1))
p1

exercise 5

…

loading data and check data set

# Read the diabetes data set and show its first rows.
# NOTE(review): the absolute Windows path below only resolves on the machine
# it was written on; adjust it (or switch to a project-relative path) before
# running elsewhere.
# stringsAsFactors = TRUE keeps gender/race/diabetes/BMI as factors, as in
# the str() output recorded for this data, independent of the default of
# the R version in use (the default changed to FALSE in R 4.0.0).
dta <- read.csv(
  "C:/Users/USER/Desktop/R_data management/0420/diabetes_mell.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
head(dta)

##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight

# Inspect the number of observations and the type of each column
str(dta)

## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...

data manipulation

# Cross-tabulate race x gender x diabetes x BMI and flatten the resulting
# table into a data frame: one row per combination, with its count in Freq
dta.v1 <- as.data.frame(
  xtabs(~ race + gender + diabetes + BMI, data = dta)
)
str(dta.v1)

## 'data.frame':    24 obs. of  5 variables:
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 1 2 3 1 2 3 1 2 3 1 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 1 1 1 2 2 2 1 1 1 2 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : int  347 712 998 429 706 873 6 11 12 15 ...

plot

library(ggalluvial)
# Alluvial plot: individuals flow across three axes (race, gender,
# diabetes); the height of each band is the count in Freq and the bands
# are coloured by BMI category.
p <- ggplot(
  dta.v1,
  aes(axis1 = race, axis2 = gender, axis3 = diabetes, y = Freq)
) +
  # reverse = FALSE is passed to all three layers so that flows, stratum
  # boxes and stratum labels share the same (non-reversed) ordering
  geom_alluvium(aes(fill = BMI), reverse = FALSE) +
  geom_stratum(reverse = FALSE) +
  # NOTE(review): newer ggalluvial releases deprecate infer.label in favour
  # of aes(label = after_stat(stratum)); kept as-is so the call still works
  # with the version this script was written against -- confirm and migrate
  geom_text(stat = "stratum", infer.label = TRUE, reverse = FALSE) +
  # A single x scale labelling the three axes. The original also added
  # scale_x_discrete(), which this scale silently replaced, so only the
  # effective one is kept.
  scale_x_continuous(breaks = 1:3, labels = c("race", "gender", "diabetes")) +
  scale_fill_manual(values = c("gray48", "tan1")) +
  # hide the fill legend ("none" replaces the deprecated fill = FALSE);
  # with no legend shown, a legend.position setting would have no effect
  guides(fill = "none") +
  labs(x = " ", y = "No.individuals") +
  theme_minimal() +
  # title and subtitle
  ggtitle(
    "Diabetes in overall population in US 2009-2010",
    subtitle = "stratified by race, gender and diabetes mellitus"
  )
p