In-Class Exercise 1: Multiple Plot Methods

Plot the first observation of the women data using base R graphics (the axes are scaled to the full data set).

# Draw empty axes scaled to the whole women data set (type "n" draws no
# points), then add only the first observation.
plot(women, type = "n")
points(head(women, n = 1L))

Plot the first observation of the women data using lattice.

# Lattice version: `subset` keeps only the row named "1", so a single point
# is drawn.
lattice::xyplot(
  weight ~ height,
  data = women,
  subset = row.names(women) == "1",
  type = "p"
)

Plot the first observation of the women data using ggplot.

library(ggplot2)
# ggplot2 version: only the first row is handed to ggplot, so one point is
# drawn.
ggplot(data = head(women, n = 1L), mapping = aes(x = height, y = weight)) +
  geom_point()

##

In-Class Exercise 2: Grade 8 Pupils in Elementary Schools in the Netherlands.

Load data file

# Read the Dutch language/arithmetic file; its first line holds the column
# names. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
dta <- read.table(
  "/Users/haolunfu/Documents/資料管理/week7/langMathDutch.txt",
  header = TRUE
)
# Show the first rows.
head(dta)

##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13

Categorize class size and verbal IQ into three groups each

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Split class size and verbal IQ at their tertiles, then combine the two
# groupings into one 9-level factor ordered by size first and IQ second.
dta_n <- mutate(
  dta,
  Group_size = cut(size,
                   breaks = quantile(size, probs = c(0, 1/3, 2/3, 1)),
                   labels = c("Small", "Medium", "Large"),
                   include.lowest = TRUE),
  Group_IQV = cut(IQV,
                  breaks = quantile(IQV, probs = c(0, 1/3, 2/3, 1)),
                  labels = c("Low", "Middle", "High"),
                  include.lowest = TRUE),
  # Levels run "Small, Low", "Small, Middle", ..., "Large, High".
  group = factor(
    paste(as.character(Group_size), as.character(Group_IQV), sep = ", "),
    levels = paste(rep(c("Small", "Medium", "Large"), each = 3),
                   c("Low", "Middle", "High"),
                   sep = ", ")
  )
)

Plot the scatter plot between Language and Arithmetic scores

# Arithmetic vs. language score, one panel per size/IQ group, with a linear
# fit drawn over the points in each panel.
ggplot(dta_n, mapping = aes(x = lang, y = arith)) +
  geom_point(shape = 23, fill = "black") +
  geom_smooth(method = "lm", formula = y ~ x, lwd = .5) +
  facet_wrap(~ group) +
  labs(x = "Language score", y = "Arithmetic score")

In-Class Exercise 3: US Personal Expenditure, 1940-1960.

Load data file

# Bind the USPersonalExpenditure data from the datasets package to dta.
dta <- datasets::USPersonalExpenditure
# Show the first rows.
head(dta)

##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

Translate the data format from wide to long

Compute the log10 expenditure and its deviation (excess) from the overall mean log10 expenditure

library(tidyverse)

## ─ Attaching packages ────────────────────────── tidyverse 1.3.0 ─

## ✓ tibble  2.1.3     ✓ purrr   0.3.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## ─ Conflicts ─────────────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
# Reshape the expenditure matrix to long format (one row per category and
# year), log10-transform the values, and centre them on the overall mean.
dta_n <- dta %>%
  as.data.frame() %>%
  mutate(Categories = row.names(dta)) %>%
  # pivot_longer() replaces the superseded gather(); selecting every column
  # except Categories avoids hard-coding the number of year columns.
  pivot_longer(cols = -Categories,
               names_to = "Year", values_to = "Expenditure") %>%
  # Order by year, with categories in their original order within each year,
  # which is the row order gather() produced.
  arrange(Year) %>%
  # Keep a plain data.frame so head() prints in the same format as before.
  as.data.frame() %>%
  mutate(Expenditure = log10(Expenditure),
         Excess = Expenditure - mean(Expenditure))
head(dta_n)

##            Categories Year Expenditure     Excess
## 1    Food and Tobacco 1940  1.34635297  0.4279902
## 2 Household Operation 1940  1.02118930  0.1028266
## 3  Medical and Health 1940  0.54777471 -0.3705880
## 4       Personal Care 1940  0.01703334 -0.9013294
## 5   Private Education 1940 -0.46724562 -1.3856084
## 6    Food and Tobacco 1945  1.64836001  0.7299973

Plot the data

# Dot plot of excess log10 expenditure per category, one panel per year.
# Each point is joined to the zero line by a horizontal segment.
# ggplot() + geom_point() replaces the deprecated qplot(); the former
# `facets` argument is dropped because facet_wrap() below sets the layout.
ggplot(dta_n, aes(x = Excess, y = Categories)) +
  geom_point() +
  geom_segment(aes(xend = 0, yend = Categories)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ Year, nrow = 1) +
  scale_x_continuous(limits = c(-1.5, 1.1),
                     breaks = seq(-1.5, 1.1, 0.5)) +
  labs(x = "excess (log10(billion))")

In-Class Exercise 4: ASD Children and Social Development

Load data file

## 4
# Bind the autism data from the WWGbook package to dta.
dta <- WWGbook::autism
# Show the first rows.
head(dta)

##   age vsae sicdegp childid
## 1   2    6       3       1
## 2   3    7       3       1
## 3   5   18       3       1
## 4   9   25       3       1
## 5  13   27       3       1
## 6   2   17       3       3

Remove NA

# Drop every row that contains a missing value in any column.
dta <- na.omit(dta)

Relabel the Group by sicdegp

# Recode sicdegp into a labelled factor: (0, 1] -> "L", (1, 2] -> "M",
# (2, 3] -> "H".
dta$Group <- cut(dta$sicdegp,
                 breaks = 0:3,
                 labels = c("L", "M", "H"))

Compute age difference

# Centre age on the mean age across all rows.
dta <- mutate(dta, Age_d = age - mean(age))
head(dta)

##   age vsae sicdegp childid Group      Age_d
## 1   2    6       3       1     H -3.7704918
## 2   3    7       3       1     H -2.7704918
## 3   5   18       3       1     H -0.7704918
## 4   9   25       3       1     H  3.2295082
## 5  13   27       3       1     H  7.2295082
## 6   2   17       3       3     H -3.7704918

Plot the scatter plot between Age and VSAE score

# VSAE score against centred age, one panel per group. Each panel shows the
# observations, a linear fit, and a line per child.
ggplot(dta, aes(x = Age_d, y = vsae)) +
  facet_grid(. ~ Group) +
  scale_x_continuous(limits = c(-4, 7.5),
                     breaks = c(-2.5, 0.0, 2.5, 5.0)) +
  geom_point(alpha = 0.45) +
  # Pass a formula object rather than a string, matching the other
  # geom_smooth() call in this report.
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_line(aes(group = childid), alpha = 0.3) +
  labs(x = "Age (in years, centered)", y = "VSAE score") +
  theme_bw()

Create age-2 column

Plot the mean VSAE score (with standard-error bars) against age - 2 for each group

# Mean VSAE score (with +/- 1 standard error bars) at each age - 2, drawn as
# one dodged line per group.
dta %>%
  mutate(Age_2 = age - 2) %>%
  group_by(Group, Age_2) %>%
  summarize(vsae_mean = mean(vsae),
            vsae_se = sd(vsae) / sqrt(n())) %>%
  ggplot(aes(x = Age_2, y = vsae_mean, group = Group, shape = Group)) +
  geom_point(position = position_dodge(width = .3),
             size = rel(2),
             show.legend = TRUE) +
  scale_shape_manual(values = c(1, 2, 16)) +
  geom_line(aes(linetype = Group),
            position = position_dodge(width = .3),
            show.legend = TRUE) +
  geom_errorbar(aes(ymin = vsae_mean - vsae_se,
                    ymax = vsae_mean + vsae_se),
                position = position_dodge(width = .3),
                size = .3, width = .2) +
  labs(x = "Age (in year - 2)", y = "VSAE score") +
  theme_bw() + # the legend settings below were adapted from Jay Liao
  theme(panel.grid.major = element_line(size = 0.75),
        panel.grid.minor = element_blank(),
        axis.text = element_text(size = 12),
        legend.position = c(.1, .85),
        legend.title = element_text(size = 14),
        legend.key = element_rect(color = "black"),
        legend.key.size = unit(.69, "cm"),
        legend.box.background = element_rect(color = "black"))

In-Class Exercise 5: Diabetes in overall population in US 2009-2010

Load data file

# Read the diabetes CSV file; its first line holds the column names. TRUE is
# spelled out because T is an ordinary variable that can be reassigned.
dta <- read.csv(
  "/Users/haolunfu/Documents/資料管理/week7/diabetes_mell.csv",
  header = TRUE
)
# Show the first rows.
head(dta)

##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight

Relevel the variables

library(ggalluvial)

## Warning: package 'ggalluvial' was built under R version 3.6.2

# Cross-tabulate race, gender, diabetes and BMI, then flatten the table into
# one row per combination with its count in Freq.
dta_n <- data.frame(xtabs(~ race + gender + diabetes + BMI, data = dta))

# Set the level order of the three factors used as plot axes.
dta_n <- transform(
  dta_n,
  race = factor(race, levels = c("Hispanic", "White", "Black")),
  gender = factor(gender, levels = c("Males", "Females")),
  diabetes = factor(diabetes, levels = c("Yes", "No"))
)
head(dta_n)

##       race  gender diabetes           BMI Freq
## 1    Black Females       No Normal weight  347
## 2 Hispanic Females       No Normal weight  712
## 3    White Females       No Normal weight  998
## 4    Black   Males       No Normal weight  429
## 5 Hispanic   Males       No Normal weight  706
## 6    White   Males       No Normal weight  873

Plot the data

# Alluvial diagram of the counts: the three axes are race, gender and
# diabetes status, and the flows are coloured by BMI category.
ggplot(dta_n,
       aes(axis1 = race,
           axis2 = gender,
           axis3 = diabetes,
           y = Freq)) +
  scale_x_discrete(limits = c("race",
                              "gender",
                              "diabetes"),
                   expand = c(.1, .05)) +
  labs(y = "No. individuals") +
  # Subtitle spelling corrected ("straitified" -> "stratified").
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus") +
  geom_alluvium(aes(fill = BMI)) +
  geom_stratum() +
  # Label each stratum box.
  geom_text(stat = "stratum",
            infer.label = TRUE) +
  scale_fill_manual(values = c("gray40", "tan1")) +
  theme_minimal() +
  theme(legend.position = "bottom")

In-Class Exercise 6: gg_gapminder

Load ggplot2 package and use help function to see more details

# Attach ggplot2, then open the help page for the topic "ggplot2".
library(ggplot2)
?ggplot2

Install and load gapminder package

#install.packages("gapminder")
library(gapminder)

Load gapminder data file and Show the Structure of the data

# Load the gapminder data set into the workspace and print its structure.
data(gapminder)
str(gapminder)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

Assign the gapminder data to the shorter name gap

# Bind the gapminder data to the shorter name gap used in the plots below.
gap <- gapminder

Create a plot environment with lifeExp on the x-axis

# Empty plot: only the data and the x aesthetic are set, no layer is added.
ggplot(gap, mapping = aes(x = lifeExp))

Add a histogram of lifeExp

# Histogram of life expectancy with the default binning.
ggplot(gap, mapping = aes(x = lifeExp)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Change the histogram color, add on title and label names

# Histogram with 10 bins, blue bars outlined in black, and a title and axis
# labels.
ggplot(gap, mapping = aes(x = lifeExp)) +
  geom_histogram(bins = 10, fill = "blue", color = "black") +
  labs(title = "Life expectancy for the gap dataset",
       x = "Life expectancy (years)",
       y = "Frequency") +
  theme_classic()

Plot a boxplot between continent and lifeExp

# Boxplots of life expectancy per continent, filled by continent.
ggplot(gap, mapping = aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot() +
  labs(title = "Boxplots for lifeExp by continent",
       x = "Continent",
       y = "Life expectancy (years)") +
  theme_minimal() # +

  # guides(fill = FALSE)

Q: What happens if you un-comment guides(fill = FALSE) and the plus sign (lines 68 and 69 of the original script, shown in the boxplot code above)? A: The fill legend is no longer displayed once guides(fill = FALSE) is executed.

Plot the scatter plot of lifeExp by GDPPerCap

# GDP per capita against life expectancy; colour and point shape both encode
# the continent.
ggplot(gap,
       mapping = aes(x = lifeExp, y = gdpPercap,
                     color = continent, shape = continent)) +
  geom_point(size = 5, alpha = 0.5) +
  theme_classic() +
  labs(title = "Scatterplot of life expectancy by gdpPercap",
       x = "Life expectancy (years)",
       y = "gdpPercap (USD)") +
  # Legend on top, centred 20-pt title, small legend text, and x-axis tick
  # labels rotated by 45 degrees.
  theme(plot.title = element_text(hjust = 0.5, size = 20),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 5))

Q: In the ggplot code above, what are the arguments inside of our second “theme” call doing?

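A: They move the legend to the top, center and enlarge the plot title, set the size of the legend title and legend text, and rotate the x-axis tick labels by 45 degrees.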

Week 7 In-class exercise (Grammar)

Hao-Lun Fu

2020-04-27

In-Class Exercise 1: Multiple Plot Methods

Plot a scatter plot of the women data using R base.

Plot a scatter plot of the women data using lattice.

Plot a scatter plot of the women data using ggplot.

In-Class Exercise 2: Grade 8 Pupils in Elementary Schools in the Netherlands.

Load data file

Categorize class size and verbal IQ into three groups each

Plot the scatter plot between Language and Arithmetic scores

In-Class Exercise 3: US Personal Expenditure, 1940-1960.

Load data file

Translate the data format from wide to long

Compute the log10 expenditure and its deviation (excess) from the overall mean log10 expenditure

Plot the data

Relabel the Group by sicdegp

Compute age difference

Plot the scatter plot between Age and VSAE score

Create age-2 column

Plot the scatter plot between Age and VSAE score

In-Class Exercise 5: Diabetes in overall population in US 2009-2010

Load data file

Relevel the variables

Plot the data

In-Class Exercise 6: gg_gapminder

Load ggplot2 package and use help function to see more details

Install and load gapminder package

Load gapminder data file and Show the Structure of the data

Assign the gapminder data to the shorter name gap

Create a plot environment with lifeExp on the x-axis

Add a histogram of lifeExp

Change the histogram color, add on title and label names

Plot a boxplot between continent and lifeExp

Plot the scatter plot of lifeExp by GDPPerCap