grammer of graphics exercise 1-6

exercise 1

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments. base graphics

plot(women, type='n') #scatter plots of women dataset, "n": no plotting  
points(women[1,]) # plot the first row of women as a point

trellis graphics

# using xyplot to plot a point of first row data of women dataset
# y-axis: weignt; x-axis: height
lattice::xyplot(weight ~ height, data=women, subset=row.names(women)==1, type='p')

grammer of graphics: ggplot

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.6.3

# using ggplot to depict the first row of women datast
# setting x-axis:hieght; y-axis:weight
# plot point
ggplot(data=women[1,], aes(height, weight))+ geom_point()

exercise 2

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class. The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below. Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

Column 1: School ID
Column 2: Pupil ID
Column 3: Verbal IQ score
Column 4: The number of pupils in a class
Column 5: Language test score
Column 6: Arithmetic test score

…

loading data and check data structure

dta<-read.table("C:/Users/USER/Desktop/R_data management/0420/langMathDutch.txt", header=T)
str(dta)

## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

head(dta)

##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13

label variable by quantile

dta$level<-with(dta, factor(findInterval(size, c(-Inf,
                                                    quantile(size, probs=c(1/3, 2/3)), Inf)), 
                            labels=c("Small","Medium","Large")))
dta$IQ<-with(dta, factor(findInterval(IQV, c(-Inf,
                                                 quantile(IQV, probs=c(1/3, 2/3)), Inf)), 
                            labels=c("Low","Meddle","High")))

ggplot

library(ggplot2)
p0<-ggplot(data=dta, aes(lang, arith))+
  labs(x="Language score", y="Arithmetic score")+
  geom_point(shape=18, size=2)+ # change shape
  # add regression line +SE, adjust size and color 
  stat_smooth(data=dta, formula=(y~x), method="lm", se=T, fill="#4D4D4D", colour="darkblue",   size=0.5, alpha = 0.2)+ 
  facet_wrap(.~level+IQ, labeller=labeller(.multi_line=F))+ # make two label at the same line
  scale_x_continuous(breaks=seq(10, 50, 10))+ # adjust the tick label
  scale_y_continuous(breaks=seq(5, 30, 5))
p0

exercise 3

Use the USPersonalExpenditure{datasets} for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories; food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot on the “Time Use” case study in the course web page. You might want to transform the dollar amounts to log base 10 unit first.

…

loading data and check data structure

library(datasets)
dta<-USPersonalExpenditure
str(dta)

##  num [1:5, 1:5] 22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "Food and Tobacco" "Household Operation" "Medical and Health" "Personal Care" ...
##   ..$ : chr [1:5] "1940" "1945" "1950" "1955" ...

head(dta)

##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

data manipulation

library(dplyr)
dta.melt<-reshape::melt(dta)
colnames(dta.melt)<-c("Category", "Year", "Expenditure")
df<-dta.melt%>%as.data.frame()%>%
  mutate(logexpend=log10(Expenditure), group=ifelse(log10(Expenditure)>0, "up", "down"))

ggplot plot 1

library(ggplot2)
p<-ggplot(df, aes(x=factor(Category), y=logexpend, group=1)) + # categroy is discret variable to plot line, need to use factor(x), group=1: every point to connect with singlge line group
  geom_point()+
  geom_line()+
  facet_wrap(.~Year, ncol=1)+
  
  labs(x="Category", y="log Expenditure (original in billion)")
p

plot 2

p1<-ggplot(df, aes(y=Category, x=logexpend)) +
  geom_point()+
  geom_vline(xintercept=0)+
  facet_wrap(.~Year, nrow=1)+
  geom_segment(aes(xend=0, yend=Category))+
  labs(y="Category", x="log Expenditure (original in billion)")+
  theme(axis.text.x=element_text(angle=50, hjust=1))
p1

exercise 4

A sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.

Data: autism{WWGbook}

Column 1: Age (in years)
Column 2: Vineland Socialization Age Equivalent score
Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High)
Column 4: Child ID

…

loading data and check data set

library(WWGbook)
dta<-autism
head(dta)

##   age vsae sicdegp childid
## 1   2    6       3       1
## 2   3    7       3       1
## 3   5   18       3       1
## 4   9   25       3       1
## 5  13   27       3       1
## 6   2   17       3       3

str(dta)

## 'data.frame':    612 obs. of  4 variables:
##  $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
##  $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
##  $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
##  $ childid: int  1 1 1 1 1 3 3 3 3 3 ...

data manipulation

library(dplyr)
library(plyr)
dta$sic<-factor(dta$sicdegp, levels = c(1,2,3), labels = c("L", "M", "H")) # label variable
m<-dta%>%summarise(m=mean(age), std=sd(age), me=median(age)) # find mean of age
dta$center<-(dta$age-m$m) # center age

plot 1.

library(ggplot2)
p0<-ggplot(data=dta, aes(x=center, y=vsae))+labs(x="Age (in years, centered)", y="VASE score")
p0+geom_line(aes(group = childid))+facet_grid(. ~sic)+geom_smooth(formula=(y~x), method="lm")+
  scale_x_continuous(limits=c(-4, 7.3), breaks=seq(-2.5, 5.0, 2.5))+
  geom_point(size=rel(2), alpha=.5)+ # make the points transparent
  theme_bw()

plot 2

dta1<-na.omit(dta)

p<-dta1 %>% mutate(age2=age-2)%>%group_by(sic, age2)%>%
  dplyr::summarise(n=n(), m=mean(vsae),se=sd(vsae)/sqrt(n))%>%
  
  ggplot()+
  aes(age2, m, group=sic, shape=sic)+
  geom_errorbar(aes(ymin=m-se, ymax=m+se), width=.2, size=.3)+
  geom_line(aes(linetype=sic), show.legend=T)+
  geom_point(size=rel(2), show.legend=T)+
  scale_shape_manual(values=c(1, 2, 16), name="Group")+ # change legend title, there are line and shape in the legend have to assign the name both in shape manual and the linetype_discrete 
  scale_linetype_discrete(name="Group")+ # change legend title 
  labs(x="Age (in year-2)", y="Vsae score")+
  theme_bw()+
  # legend frame
  theme(legend.background = element_rect(fill="white", size=0.5, linetype="solid", 
                                         colour ="black"), legend.position=c(0.08, 0.85), 
        legend.key = element_rect(fill = "white", colour = "darkgray"))+  # little grey box in the legned
  theme(axis.text.y = element_text(colour="Black"), axis.text.x=element_text(colour="Black"))

p

exercise 5

…

loading data and check data set

dta<-read.csv("C:/Users/USER/Desktop/R_data management/0420/diabetes_mell.csv", sep=",", header=T)
head(dta)

##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight

str(dta)

## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...

data manipulation

dta.v1<-data.frame(with(dta[ , c("race", "gender", "diabetes", "BMI")], xtabs(~race+gender+diabetes+BMI)))
str(dta.v1)

## 'data.frame':    24 obs. of  5 variables:
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 1 2 3 1 2 3 1 2 3 1 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 1 1 1 2 2 2 1 1 1 2 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : int  347 712 998 429 706 873 6 11 12 15 ...

plot

library(ggalluvial)
p<-ggplot(dta.v1, 
       aes(axis1=race,
                   axis2=gender,
                   axis3=diabetes,
                   y=Freq))+
  scale_x_discrete(limits=c("race", "gender", "diabetes"),
                   expand=c(.2, .05))+
  labs(x=" ", y="No.individuals")+
  geom_alluvium(aes(fill=BMI), reverse = FALSE)+ # reverse=FALSE    reoder 
  geom_stratum(reverse = FALSE)+ # reverse=FALSE    reoder
  guides(fill = FALSE)+
  geom_text(stat="stratum", infer.label= TRUE, reverse = FALSE)+ 
  # reverse=FALSE    reoder
  scale_fill_manual(values=c("gray48", "tan1"))+
  scale_x_continuous(breaks=1:3, labels=c("race", "gender", "diabetes"))+
  theme_minimal()+
  theme(legend.position="bottom")+
  # title and sustitle
  ggtitle("Diabetes in overall population in US 2009-2010", 
          subtitle="stratified by race, gender and diabetes mellitus")
p

exercise 6

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

# call ggplot2 library and search for ggplot2-package documentation
library(ggplot2) 
?ggplot2 
# install gapminder package
 
library(gapminder)

# check data structure
data(gapminder) 
str(gapminder)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

# rename dataset as gap 
gap <- gapminder 

#plot the gap dataset, x-axis: lifeExp, create layers-canvas 
ggplot(data = gap, aes(x = lifeExp))

# histrogram plot, using gap dataset, x-axis: lifeExp
ggplot(data = gap, aes(x = lifeExp)) + geom_histogram()

# histrogram plot, using gap dataset, x-axis: lifeExp, depitct bar with blue color and black border
# bins=10 show 10 bins
#ggtitle: giving plot title and rename x-axis and y-axis label
# using classic theme: A classic-looking theme, with x and y axis lines and no gridlines
ggplot(data = gap, aes(x = lifeExp)) + geom_histogram(fill = "blue", color = "black", bins = 10) + ggtitle("Life expectancy for the gap dataset") + xlab("Life expectancy (years)") + ylab("Frequency") + theme_classic()

# boxplot using gap dataset, x-axis: continent, y-axis: lifeExp, fill different color depends on the cotinent
# giving title, labels for x-axis and y-axis
# using minimal theme: A minimalistic theme with no background annotations

ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + geom_boxplot() + ggtitle("Boxplots for lifeExp by continent") + xlab("Continent") + ylab("Life expectancy (years)") + theme_minimal()

What happens if you un-hashtage guides(fill = FALSE) and the plus sign in lines 68 and 69 above?

ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + geom_boxplot() + ggtitle("Boxplots for lifeExp by continent") + xlab("Continent") + ylab("Life expectancy (years)") + theme_minimal()+ guides(fill = FALSE)

added guides(fill=FALSE) will not display the legend

# scatter plot
# using gap dataset,  x-axis: lifeExp, y-axis: gdpPercap, shape of points depends on the cotinent
# Aesthetics: size, alpha=transparency 
# classic theme, with title and rename x-axis and y-axis label
# put legend at top of plot, and adjust title size and horizontal justification to centered
# adjust legend title to size=10, text in legned to size=5.
# adjust text in x-axis, rotating text 45 degree, align text to the right 
ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + geom_point(size = 5, alpha = 0.5) + theme_classic() + ggtitle("Scatterplot of life expectancy by gdpPercap") + xlab("Life expectancy (years)") + ylab("gdpPercap (USD)") + 
theme(legend.position = "top", plot.title = element_text(hjust = 0.5, size = 20), legend.title = element_text(size = 10), legend.text = element_text(size = 5), axis.text.x = element_text(angle = 45, hjust = 1))

In lines the ggplot code above, what are the arguments inside of our second “theme” argument doing? Answer: adjust aesthetic stuffs: 1)adjust legend position, 2)centered title text and adjust the word size, 3)adjust word size in text of legend, 4)adjust text in the x-axis, rotating 45 degree, aligned text to the right