This document contains the solutions to the exercises from the practical session of Day 2 of the Introduction to R workshop. The session focuses on data exploration, both numerical and graphical. The exercises in this document are all based on base R; however, ggplot alternatives are also provided. Note that you can enhance any figure as much as you like. There is no ‘right’ or ‘wrong’ answer, and no single valid solution.

Before you start:
- Set working directory
- Read in iris dataset (‘iris.csv’ file)
- Read in hair and eye color dataset (‘HairEyeColor.csv’)
- Check your datasets

# Set the working directory to the folder that holds the workshop files.
# Note that the path to your folder will be different
setwd("~/ADS Courses/Intro to R/DAY 2")

# Import datasets HairEyeColor and Iris
# `header = TRUE` tells read.csv() that the first row holds the column names.
# It is spelled out in full because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.

haireye <- read.csv("HairEyeColor.csv", header = TRUE)
str(haireye)
#> 'data.frame':    592 obs. of  2 variables:
#>  $ hair: chr  "Black" "Black" "Black" "Black" ...
#>  $ eye : chr  "Brown" "Brown" "Brown" "Brown" ...
View(haireye)

# Note: assigning to `iris` masks R's built-in `iris` data set for the rest of
# the session; in this version Species is a character column.
iris <- read.csv("iris.csv", header = TRUE)
str(iris)
#> 'data.frame':    150 obs. of  5 variables:
#>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#>  $ Species     : chr  "setosa" "setosa" "setosa" "setosa" ...
View(iris)

# Load the necessary packages
# Note that you have to install them first
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)

1 Univariate

1.1 Categorical and binary variables

Frequency tables
- Count the number of observations per eye-color (frequency table)
- Try to expand this table with percentages and cumulative values

# Base R solution
# Tabulate eye colour once and derive every summary from that single table.
eye_counts <- table(haireye$eye)

number <- eye_counts                          # number of observations
proportion <- prop.table(eye_counts)          # proportion of observations
cumulative_number <- cumsum(eye_counts)       # cumulative number of observations
cumulative_proportion <- cumsum(proportion)   # cumulative proportion

# Stack the four summaries as rows; rbind() labels each row with the name of
# the variable it was given.
table_eye <- rbind(
  number,
  proportion,
  cumulative_number,
  cumulative_proportion
)
table_eye
#>                              Blue       Brown       Green       Hazel
#> number                215.0000000 220.0000000  64.0000000  93.0000000
#> proportion              0.3631757   0.3716216   0.1081081   0.1570946
#> cumulative_number     215.0000000 435.0000000 499.0000000 592.0000000
#> cumulative_proportion   0.3631757   0.7347973   0.8429054   1.0000000

# dplyr solution
# One row per eye colour: the count, its percentage of all observations, and
# the running totals of both.

freqtab <- haireye %>%
  group_by(eye) %>%
  summarise(Frequency = n()) %>%
  mutate(
    percent = 100 * Frequency / sum(Frequency),
    cumfreq = cumsum(Frequency),
    cumper = cumsum(percent)
  )
freqtab
#> # A tibble: 4 × 5
#>   eye   Frequency percent cumfreq cumper
#>   <chr>     <int>   <dbl>   <int>  <dbl>
#> 1 Blue        215    36.3     215   36.3
#> 2 Brown       220    37.2     435   73.5
#> 3 Green        64    10.8     499   84.3
#> 4 Hazel        93    15.7     592  100

Bar plot

Try to replicate the barplot.

# Base R: one bar per hair colour, with bar height taken from the frequency table
barplot(
  table(haireye$hair),
  main = "Barplot for Hair colour",
  xlab = "Hair colour",
  ylab = "Frequency"
)

# ggplot alternative

ggplot(haireye, aes(x = hair)) +
  geom_bar() +
  labs(title = "Barplot of Hair color", x = "hair color", y = "Frequency")

Try to plot the same barplot with fractions instead of counts at the y-axis

# prop.table() rescales the counts so the bars sum to 1; the y-axis therefore
# shows proportions, not frequencies, and is labelled accordingly.
barplot(
  prop.table(table(haireye$hair)),
  main = "Barplot for Hair colour",
  xlab = "Hair colour",
  ylab = "Proportion"
)

Try to change the plot so it looks like the example

# Colours are listed in the order of the table's categories;
# las = 2 draws the axis labels perpendicular to the axes.
barplot(
  table(haireye$hair),
  col = c("black", "yellow", "brown", "red"),
  las = 2,
  main = "Barplot for Hair colour",
  xlab = "hair colour",
  ylab = "Frequency"
)

1.2 Continuous variables

Descriptive summary numbers

Produce the quartiles, min/max, median, mean, standard deviation and variance for the variable Petal.Width in the iris dataset

# Base R solution
# Each statistic is computed on the Petal.Width column; with() evaluates the
# call inside the data frame so the column can be named directly.

with(iris, quantile(Petal.Width, probs = c(0.25, 0.5, 0.75))) # Q1, Median, Q3
#> 25% 50% 75% 
#> 0.3 1.3 1.8
with(iris, min(Petal.Width)) # Minimum
#> [1] 0.1
with(iris, max(Petal.Width)) # Maximum
#> [1] 2.5
with(iris, mean(Petal.Width)) # Mean
#> [1] 1.199333
with(iris, sd(Petal.Width)) # Standard deviation
#> [1] 0.7622377
with(iris, var(Petal.Width)) # Variance
#> [1] 0.5810063

# dplyr solution
# summarise() collapses the data frame to one row holding every requested
# statistic, including the variance that the exercise asks for.

iris %>%
  summarise(
    mean = mean(Petal.Width),
    sd = sd(Petal.Width),
    var = var(Petal.Width),
    min = min(Petal.Width),
    max = max(Petal.Width),
    lowQu = quantile(Petal.Width, 0.25),
    median = quantile(Petal.Width, 0.5),
    upperQu = quantile(Petal.Width, 0.75)
  )
#>       mean        sd       var min max lowQu median upperQu
#> 1 1.199333 0.7622377 0.5810063 0.1 2.5   0.3    1.3     1.8

Box plot
- Try to make a boxplot of Petal Width
- In this boxplot, change the title and label as in the example. Also add a text label marking the median in the plot

boxplot(
  iris$Petal.Width,
  main = "Width of Petal leaves",
  ylab = "Leaf width (cm)",
  col = "white"
)
# A single box is drawn at x = 1, so the label is anchored at (1, median).
# pos = 2 puts the text to the left of that point and offset = 6 pushes it
# six character widths away, clear of the box.
text(
  x = 1,
  y = median(iris$Petal.Width),
  labels = "median",
  pos = 2,
  offset = 6
)

# ggplot alternative
# geom_boxplot() needs an x aesthetic; a constant string gives one box
iris %>%
  ggplot(aes(x = " ", y = Petal.Width)) +
  geom_boxplot()

Histogram
Produce a histogram showing the frequencies of Iris Sepal Length. Change the titles and axis-labels to something appropriate

# Base R: histogram of counts using hist()'s default binning
hist(
  iris$Sepal.Length,
  main = "Length of Iris sepal leaves",
  xlab = "Leaf length (cm)",
  ylab = "Frequency"
)

# ggplot alternative
ggplot(iris, aes(x = Sepal.Length)) +
  geom_histogram(binwidth = 0.5) +
  labs(title = "Histogram of Sepal Length", x = "Sepal length", y = "Frequency")

Now, try to plot the same histogram with density on the y-axis instead of frequencies. Change the number of intervals (bins) on the x-axis until it looks like the example.

# freq = FALSE plots densities instead of counts (written in full because `F`
# can be reassigned); breaks = 20 is a suggestion for the number of bins.
hist(
  iris$Sepal.Length,
  freq = FALSE,
  breaks = 20,
  col = "white",
  main = "Length of Iris sepal leaves",
  xlab = "Leaf length (cm)",
  ylab = "Density"
)

2 Bivariate

2.1 Categorical x categorical variables

Mosaic plot
Create a mosaic plot of both variables (eye color and hair color). Add a title and labels. Also, try to color the tiles according to hair color.

# Rows of the table (eye colour) go on the x-axis and columns (hair colour)
# on the y-axis. The tile colours are given through mosaicplot()'s `color`
# argument, written out in full rather than relying on `col` being partially
# matched to it; they follow the order of the hair-colour categories.
mosaicplot(
  table(haireye$eye, haireye$hair),
  main = "Mosaic plot",
  xlab = "Eye color",
  ylab = "Hair color",
  color = c("black", "yellow", "brown", "red")
)

# ggplot alternative
# Count every eye/hair combination. `.groups = "drop_last"` keeps the result
# grouped by eye (and silences summarise()'s grouping message), so the sums in
# mutate() are taken within each eye colour: cut.count is the number of people
# with that eye colour and prop is the hair-colour share within it.
df <- haireye %>%
  group_by(eye, hair) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(cut.count = sum(count), prop = count / sum(count))

ggplot(df, aes(x = eye, y = prop, width = cut.count, fill = hair)) +
  geom_bar(stat = "identity", position = "fill", colour = "black") +
  geom_text(
    aes(label = scales::percent(prop)),
    position = position_stack(vjust = 0.5)
  ) +
  facet_grid(~eye, scales = "free_x", space = "free_x") +
  scale_fill_brewer(palette = "RdYlGn") +
  # theme_void() is a complete theme and replaces any theme() settings added
  # before it, so it must come first for the zero panel spacing to take effect.
  theme_void() +
  theme(panel.spacing.x = unit(0, "npc"))

Stacked barplot
- Create a stacked barplot of the same cross-table with a title and labels
- Try to add a legend with the eye colors so it looks something like the example.

# Base R: given a two-way table, barplot() stacks the rows (eye colours)
# inside one bar per column (hair colour).
barplot(
  table(haireye$eye, haireye$hair),
  main = "Eye color per hair color category",
  xlab = "Hair color",
  ylab = "Frequency",
  legend.text = c("Blue", "Brown", "Green", "Hazel")
)

# ggplot alternative
haireye %>%
  group_by(eye, hair) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = hair, y = count, fill = eye)) +
  geom_col() +
  labs(x = "hair color", y = "Frequency")
#> `summarise()` has grouped output by 'eye'. You can override using the `.groups`
#> argument.

  • Instead of stacked bars, try to group them next to each other and give some colors to the different eye color categories.
# beside = TRUE draws the eye-colour bars next to each other instead of
# stacking them; the colours are listed in the order of the legend entries.
barplot(
  table(haireye$eye, haireye$hair),
  beside = TRUE,
  col = c("blue", "brown", "darkgreen", "olivedrab"),
  legend.text = c("Blue", "Brown", "Green", "Hazel"),
  main = "Eye color per hair color category",
  xlab = "Hair color",
  ylab = "Frequency"
)

# ggplot alternative
haireye %>%
  ggplot(aes(x = hair, fill = eye)) +
  geom_bar(position = "dodge")

## Categorical x continuous variables

Descriptive summary numbers x categorical levels
Generate descriptive summary statistics (min/max, quartiles, mean and median) for Iris Petal Width per level of Iris Species

# Base R solution
# tapply() splits Petal.Width by Species and applies the given function to
# each group; extra arguments (here `probs`) are passed on to that function.
with(iris, tapply(Petal.Width, Species, mean))
#>     setosa versicolor  virginica 
#>      0.246      1.326      2.026
with(iris, tapply(Petal.Width, Species, min))
#>     setosa versicolor  virginica 
#>        0.1        1.0        1.4
with(iris, tapply(Petal.Width, Species, max))
#>     setosa versicolor  virginica 
#>        0.6        1.8        2.5
with(iris, tapply(Petal.Width, Species, quantile, probs = c(0.25, 0.5, 0.75)))
#> $setosa
#> 25% 50% 75% 
#> 0.2 0.2 0.3 
#> 
#> $versicolor
#> 25% 50% 75% 
#> 1.2 1.3 1.5 
#> 
#> $virginica
#> 25% 50% 75% 
#> 1.8 2.0 2.3
with(iris, tapply(Petal.Width, Species, median))
#>     setosa versicolor  virginica 
#>        0.2        1.3        2.0

# dplyr solution
# group_by() + summarise() gives one row of statistics per species.
iris %>%
  group_by(Species) %>%
  summarise(
    mean = mean(Petal.Width),
    min = min(Petal.Width),
    max = max(Petal.Width),
    lowQu = quantile(Petal.Width, 0.25),
    median = quantile(Petal.Width, 0.5),
    upperQu = quantile(Petal.Width, 0.75)
  )
#> # A tibble: 3 × 7
#>   Species     mean   min   max lowQu median upperQu
#>   <chr>      <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>
#> 1 setosa     0.246   0.1   0.6   0.2    0.2     0.3
#> 2 versicolor 1.33    1     1.8   1.2    1.3     1.5
#> 3 virginica  2.03    1.4   2.5   1.8    2       2.3

Box plots
Produce a series of boxplots for the Petal Width by Iris Species in one figure and add a title and labels.

# The exercise asks for Petal Width per species, so Petal.Width (not
# Sepal.Length) is the response, and the y-axis label says "width" to match
# the title.
boxplot(
  iris$Petal.Width ~ iris$Species,
  main = "Width of Iris Petal leaves",
  ylab = "Leaf width (cm)",
  col = "white",
  xlab = " "
)

# ggplot alternative
iris %>%
  ggplot(aes(x = Species, y = Petal.Width)) +
  geom_boxplot()

2.2 Continuous x continuous variables

Descriptive summary numbers
Produce the summary numbers (min/max, quartiles, mean, median) for Sepal.Length, Sepal.Width, Petal.Length and Petal.Width in one go.

# summary() on a data frame reports the six-number summary of every column;
# the four measurement columns are selected by name.
summary(iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")])
#>   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
#>  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
#>  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
#>  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
#>  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
#>  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
#>  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

Correlation
Produce the correlation matrix of these four variables, and try to round all numbers to two decimals

# Pairwise correlations between the first four (numeric) columns,
# rounded to two decimals for readability.
round(cor(iris[1:4]), digits = 2)
#>              Sepal.Length Sepal.Width Petal.Length Petal.Width
#> Sepal.Length         1.00       -0.12         0.87        0.82
#> Sepal.Width         -0.12        1.00        -0.43       -0.37
#> Petal.Length         0.87       -0.43         1.00        0.96
#> Petal.Width          0.82       -0.37         0.96        1.00

Scatter plot
Make a scatterplot for Sepal.Length versus Petal.Length. Try to color the dots according to species.

# Base R: Species is a character column, so it is converted to a factor to
# give each species its own colour.
with(
  iris,
  plot(
    Petal.Length,
    Sepal.Length,
    col = as.factor(Species),
    xlab = "Petal length (cm)",
    ylab = "Sepal length (cm)"
  )
)

# ggplot alternative
ggplot(iris, aes(x = Petal.Length, y = Sepal.Length, color = Species)) +
  geom_point()

Now, also try to change the plotting symbol according to Species and add a legend.

# Convert Species to a factor once; its integer codes select both the colour
# and the plotting symbol for every point.
species <- as.factor(iris$Species)
species_codes <- seq_along(levels(species))

plot(
  iris$Petal.Length,
  iris$Sepal.Length,
  col = species,
  pch = as.numeric(species),
  xlab = "Petal length (cm)",
  ylab = "Sepal length (cm)"
)
# The exercise also asks for a legend: one entry per species, using the same
# colour and symbol codes as the points above.
legend(
  "topleft",
  legend = levels(species),
  col = species_codes,
  pch = species_codes,
  title = "Species"
)

Scatter plot matrix
Create a scatterplot matrix of the same four variables, and color the dots according to species

# BASE R
# One scatter plot for every pair of measurement columns, coloured by species.
pairs(
  iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  col = as.factor(iris$Species)
)