Exercise-1

## plot the first row in women
## weight against heights
## base graphic
plot(women, type='n')
points(women[1, ])

## plot the first row in women
## weight against heights
## lattice graphic
lattice::xyplot(weight ~ height, 
  data=women,
  subset=row.names(women)==1, type='p')

## plot the first row in women
## weight against heights
## ggplot2
library(ggplot2)
ggplot(data=women[1,], aes(height, weight))+
  geom_point()

Exercise-2

# read data
dta_school <- read.table("langMathDutch.txt", header = T)

# examine dta
str(dta_school)

## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

head(dta_school)

##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13

# categorize class size
# compute range
range <- quantile(dta_school$size, probs = c(0, 1/3, 2/3, 1))  
range

##        0% 33.33333% 66.66667%      100% 
##         5        20        27        37

# lable different class size group by range
dta_school$size_group <- with(dta_school, 
                              cut(size, breaks = range, 
                                  labels = c("Small", "Medium", "Large"),
                                  include.lowest = TRUE))

# categorize IQ
# compute range
range <- quantile(dta_school$IQV, probs = c(0, 1/3, 2/3, 1))  
range

##        0% 33.33333% 66.66667%      100% 
##       4.0      11.0      12.5      18.0

# label diffrent iq group by range
dta_school$iq_group <- with(dta_school,
                            cut(IQV, breaks = range,
                            labels = c("Low", "Middle", "High"),
                            include.lowest = TRUE))

# combine two columns
# specify factor levels in paticular order
dta_school$type <- factor(paste(dta_school$size_group, dta_school$iq_group, sep = ", "),
                          levels = c("Small, Low", "Small, Middle", "Small, High",
                                     "Medium, Low", "Medium, Middle", "Medium, High",
                                     "Large, Low", "Large, Middle", "Large, High"))

# plot
library(ggplot2)
ggplot(dta_school, mapping = aes(x = lang, y = arith)) +
  # specify the shape of points and color
         geom_point(shape = 23, fill = "black") +
  # specify method and formula to fit the regression line
        stat_smooth(method = "lm",
                    formula = y ~ x) +
  # display plot in panels
         facet_wrap(. ~ type) +
  # specify labels for x-axis and y-axis
  labs(x = "Language score",
       y = "Arithmetic score")

Exercise-3

# read data
dta <- datasets::USPersonalExpenditure
# examine data
head(dta)

##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

# transform to long form
library(reshape2)
dta <- melt(dta)
colnames(dta) <- c("category", "year", "expenditure")

# compute excess money
library(dplyr)
dta$expenditure <- log10(dta$expenditure)
dta <- dta %>% mutate(excess = expenditure - mean(expenditure))

# plot
qplot(excess, category, data = dta) +
  geom_segment(aes(xend = 0, yend = category)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ year, nrow = 1) +
  scale_x_continuous(limits = c(-1.5, 1.1),
                     breaks = seq(-1.5, 1.1, 0.5)) +
  labs(x = "excess (log10(billion))")

Exercise-4

# read data
dta <- WWGbook::autism
# examine data
head(dta)

##   age vsae sicdegp childid
## 1   2    6       3       1
## 2   3    7       3       1
## 3   5   18       3       1
## 4   9   25       3       1
## 5  13   27       3       1
## 6   2   17       3       3

# check NA values
any(is.na(dta))

## [1] TRUE

# remove these rows
dta <- na.omit(dta)

# label sicdegp
dta$group <- with(dta, 
                  cut(sicdegp, breaks = c(0, 1, 2, 3), 
                      labels = c("L", "M", "H")))
# compute centered age
dta <- dta %>% mutate(age_centered = age - mean(age))

# ggplot vsae ~ age_centered
ggplot(data = dta,  mapping = aes(x = age_centered, 
                                  y = vsae)) +
  # plot diffrent groups in diffrent panels
  facet_grid(.~group) +
  # specify x axis limits and break points
  scale_x_continuous(limits = c(-4, 7.5),
                     breaks = c(-2.5, 0.0, 2.5, 5.0)) +
  # make points transparent by "alpha"
  geom_point(alpha = 0.45)  +
  # fit regression line
  geom_smooth(method = "lm",
              formula = "y ~ x") +
  # draw lines by each subject
  geom_line(aes(group = childid), alpha = 0.3) +
  # specify theme
  theme_bw() +
  # specify axes labels
  labs(x = "Age (in years, centered)",
       y = "VSAE score")

pd <- position_dodge(.3)
# create age-2 column
dta %>% mutate(age_2 = age - 2) %>% 
  # group data
  group_by(group, age_2) %>%
  # compute mean and standard error for vase
  summarize(vsae_mean = mean(vsae), 
            vsae_se = sd(vsae) / sqrt(n())) %>%
  # plot
  ggplot() + 
  # vsae ~ age_2
  aes(age_2, vsae_mean, 
      # grouping variable
      group=group,
      # shapes by groups
      shape=group) +
  # errorbar
  geom_errorbar(aes(ymin=vsae_mean - vsae_se,
                    ymax=vsae_mean + vsae_se),
                width=.2, size=.3, 
                position=pd) +
  # line
  # linetype by groups
  geom_line(position=pd, 
            show.legend = T,
            aes(linetype=group)) +
  # points
  geom_point(position=pd, 
             size=rel(3),
             show.legend = T) +
  # points shape
  scale_shape_manual(values = c(1, 2, 16)) +
  # axes labels
  labs(x="Age (in year -2)", y="VSAE score") +
  # theme
  theme_bw() + 
  # draw a box holding legend
  theme(legend.position=c(.08, .8),
        legend.box.background = element_rect(colour = "black"),
        legend.key = element_rect(color = "black"),
        legend.key.size = unit(1, "cm"))

Exercise-5

# read data
dta <- read.table("diabetes_mell.csv", sep = ",", header = T)
# examine data
head(dta)

##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight

# select variables for visualization
dta <- dta %>% select(race, gender, diabetes, BMI)
dta <- data.frame(xtabs(data = dta, ~ race + gender + diabetes + BMI))
# relevel factors in paticular order
dta$race <- factor(dta$race, levels = c("Hispanic", "White", "Black"))
dta$gender <- factor(dta$gender, levels = c("Males", "Females"))
dta$diabetes <- factor(dta$diabetes, levels = c("Yes", "No"))

library(ggalluvial)

ggplot(dta, 
       aes(axis1=race,
           axis2=gender, 
           axis3=diabetes, 
           y=Freq)) +
  scale_x_discrete(limits=c("race", 
                            "gender", 
                            "diabetes"), 
                   expand=c(.1, .05)) +
  labs(x='', 
       y='No. individuals') +
  geom_alluvium(aes(fill=BMI)) +
  geom_stratum() +
  geom_text(stat="stratum", 
            infer.label=TRUE) +
  scale_fill_manual(values=c('gray40','tan1'))+
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010", subtitle = "straitified by race, gender and diabetes mellitus")

Exercise-6

## load ggplot2 package
library(ggplot2)
## open ggplot2 documentation
?ggplot2

## install package "gapminder"
install.packages("gapminder")

## Installing package into '/home/zhe/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)

## load package
library(gapminder)

## load data object into global environment
data(gapminder)
## examine data structure of gapminder
str(gapminder)

## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

## store gapminder in a new data object named gap
gap <- gapminder

## create a plot template without geometric object
ggplot(data = gap, aes(x = lifeExp))

## plot a histogram of variabel lifeExp
ggplot(data = gap, aes(x = lifeExp)) + 
    geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## plot a histogram of variabel lifeExp
ggplot(data = gap, aes(x = lifeExp)) + 
  # blue bar
  # black border
  # use only 10 bars to display the histogram
  geom_histogram(fill = "blue", color = "black", bins = 10) + 
  # title of the plot
  ggtitle("Life expectancy for the gap dataset") + 
  # label of x-axis
  xlab("Life expectancy (years)") + 
  # label of y-axis
  ylab("Frequency") + 
  # classic theme
  theme_classic()

## boxplot of lifeExp by different continent
# use diffrent color to display different continets
ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + 
  geom_boxplot() + 
  # title
  ggtitle("Boxplots for lifeExp by continent") + 
  # label of x-axis
  xlab("Continent") + 
  # label of y-axis
  ylab("Life expectancy (years)") +
  # minial theme
  theme_minimal() + 
   guides(fill = FALSE)

# What happens if you un-hashtage `guides(fill = FALSE)` and 
# the plus sign in lines 68 and 69 above?

What happens if you un-hashtage guides(fill = FALSE) and the plus sign in lines 68 and 69 above?

The legend will disappear.

## scatter plot of gdpPercap against lifeExp
# different colors to display continent 
# different shapes of points to display continent
ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + # specify point size and opacity
    geom_point(size = 5, alpha = 0.5) + 
  # classic theme
    theme_classic() +
  # title of the plot
    ggtitle("Scatterplot of life expectancy by gdpPercap") +
  # label of x-axis
    xlab("Life expectancy (years)") + 
  # label of y-axis
    ylab("gdpPercap (USD)") + 
  # legend position on the top of the plot
    theme(legend.position = "top",
          # title font size 20
          # title horizontal position 0.5
          plot.title = element_text(hjust = 0.5, size = 20),
          # legend title font size 10
          legend.title = element_text(size = 10),
          # legend font size 5
          legend.text = element_text(size = 5),
          # text on the x-axix counter clockwise 45 degree
          # config the horizontal postition of text
          axis.text.x = element_text(angle = 45, hjust = 1))

## The End

inclass-420, grammar of graphics

Zhe Sun

4/21/2020

Exercise-1

Exercise-2

Exercise-3

Exercise-4

Exercise-5

Exercise-6