Practicing using R as a calculator

my_vector <- c(1,3,5,8,10,-5,3)
length(my_vector)
## [1] 7
sum(my_vector)
## [1] 25

Set random seed and generate some random normal draws. It generates the same random vector each time you call the random function

Vectors, matrices, data.frames, …

set.seed(13579)
my_normal_vector <- rnorm(n=length(my_vector))
my_normal_vector
## [1] -1.2347155 -1.2528339 -0.2547780 -1.5266466  1.0971147  2.4887442  0.7794803
#

## Vector multiplication (elementwise)
my_vector * my_normal_vector
## [1]  -1.234715  -3.758502  -1.273890 -12.213173  10.971147 -12.443721   2.338441
## Inner product
## The quantity obtained by multiplying the corresponding
## coordinates of each of two vectors and adding the products.
my_vector %*% my_normal_vector
##           [,1]
## [1,] -17.61441
## Create matrix: 5 x 4
## Normal distribution: mean=10, sd=4
# The r is for “random“, and it is a random variable having the specified 
# distribution. For example, rnorm() function
my_matrix <- matrix(rnorm(n=5*4, mean=10, sd=4),
                    nrow=5, ncol=4)

my_matrix
##           [,1]     [,2]      [,3]      [,4]
## [1,] 10.753500 6.933320 10.987272  3.201161
## [2,]  5.894216 8.250841  6.553208 13.599404
## [3,]  8.973170 8.113796 12.639644 12.884479
## [4,] 12.984201 7.754463  9.852634  9.346003
## [5,] 11.884883 5.139338  6.189902  6.933884
## Create another matrix: 5 x 4
## rexp distribution
# rexp(m, r)—Returns a vector of m random numbers having the exponential 
# distribution
gen_exp <- rexp(n=5*4, rate=1)
gen_exp
##  [1] 0.33735737 2.82539938 1.12580580 0.44286020 0.02611241 1.30599715
##  [7] 0.19481413 0.09892251 2.12785120 1.52468762 0.58872486 0.43111999
## [13] 4.14845884 0.40606674 2.44914039 0.30681808 1.29253900 0.12032613
## [19] 0.82349666 1.08946855
my_matrix_2 <- matrix(gen_exp, ncol=4, nrow=5)
my_matrix_2
##            [,1]       [,2]      [,3]      [,4]
## [1,] 0.33735737 1.30599715 0.5887249 0.3068181
## [2,] 2.82539938 0.19481413 0.4311200 1.2925390
## [3,] 1.12580580 0.09892251 4.1484588 0.1203261
## [4,] 0.44286020 2.12785120 0.4060667 0.8234967
## [5,] 0.02611241 1.52468762 2.4491404 1.0894686
## Double check the product functions (elementwise multiplication)
my_matrix * my_matrix_2
##            [,1]       [,2]      [,3]       [,4]
## [1,]  3.6277725  9.0548966  6.468480  0.9821741
## [2,] 16.6535149  1.6073804  2.825219 17.5777600
## [3,] 10.1020464  0.8026371 52.435041  1.5503396
## [4,]  5.7501857 16.5003440  4.000827  7.6964023
## [5,]  0.3103429  7.8358856 15.159939  7.5542487
## Make use of head and tail commands for sanity checks
my_huge_matrix <- matrix(rgamma(10000,
                                shape=1), 
                         ncol=10)
head(my_huge_matrix)
##           [,1]      [,2]      [,3]       [,4]      [,5]       [,6]       [,7]
## [1,] 0.1585023 0.2594988 0.3045265 0.08611515 1.3593091 0.46796143 0.01747011
## [2,] 2.3844915 2.1434004 0.2076556 0.63440774 0.4867942 2.49800677 1.33030870
## [3,] 2.0498783 0.5644282 0.6914348 0.09216033 0.1677927 0.06666595 1.04110944
## [4,] 0.6544284 0.7128908 2.6595979 0.62706588 0.5343128 0.61398043 1.22492017
## [5,] 0.2356970 0.0428360 0.9375176 1.54803682 2.1684759 0.96108695 2.83205475
## [6,] 1.6757674 0.1273529 0.3218827 0.05611610 0.4491089 1.35358352 0.53563203
##            [,8]       [,9]      [,10]
## [1,] 1.11846385 0.48473271 2.56600850
## [2,] 1.00036235 1.47877854 0.27404199
## [3,] 1.31866862 0.00207013 0.01210107
## [4,] 0.67984458 0.61003525 0.69062251
## [5,] 0.09203886 3.78459495 1.74063865
## [6,] 2.12307883 0.55484225 0.64885783
tail(my_huge_matrix)
##              [,1]      [,2]      [,3]      [,4]      [,5]       [,6]      [,7]
##  [995,] 2.2065900 2.6204056 0.3617654 0.3113698 0.9299397 1.71883278 0.6832110
##  [996,] 0.1513955 0.4863449 0.6484172 1.5576926 1.6252196 0.22696258 0.1349167
##  [997,] 1.5559238 0.1786538 0.4939397 3.0058030 0.5783891 1.21986101 0.4762442
##  [998,] 0.3488299 0.3241757 0.6761012 0.8329858 0.6561114 0.04526154 0.6484562
##  [999,] 0.3982543 1.2403230 0.1033660 2.1795944 0.9295799 0.35622295 0.6209044
## [1000,] 1.5952870 0.2751454 1.0844354 0.1188376 0.2601905 1.78664851 2.0429866
##               [,8]      [,9]      [,10]
##  [995,] 0.06476413 0.3178026 0.43963115
##  [996,] 2.13606096 0.3472772 0.06109482
##  [997,] 1.99384835 0.4734880 1.22129446
##  [998,] 0.46446190 0.4860314 0.22367593
##  [999,] 0.35793411 1.4893320 1.01059960
## [1000,] 1.20263627 0.3589378 0.83118101
## Make sequence of values
1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
0:10
##  [1]  0  1  2  3  4  5  6  7  8  9 10
10:0
##  [1] 10  9  8  7  6  5  4  3  2  1  0
-5:5
##  [1] -5 -4 -3 -2 -1  0  1  2  3  4  5
seq(from=-5, to=50, by=5)
##  [1] -5  0  5 10 15 20 25 30 35 40 45 50
seq(from=-5, to=50, length=50)
##  [1] -5.0000000 -3.8775510 -2.7551020 -1.6326531 -0.5102041  0.6122449
##  [7]  1.7346939  2.8571429  3.9795918  5.1020408  6.2244898  7.3469388
## [13]  8.4693878  9.5918367 10.7142857 11.8367347 12.9591837 14.0816327
## [19] 15.2040816 16.3265306 17.4489796 18.5714286 19.6938776 20.8163265
## [25] 21.9387755 23.0612245 24.1836735 25.3061224 26.4285714 27.5510204
## [31] 28.6734694 29.7959184 30.9183673 32.0408163 33.1632653 34.2857143
## [37] 35.4081633 36.5306122 37.6530612 38.7755102 39.8979592 41.0204082
## [43] 42.1428571 43.2653061 44.3877551 45.5102041 46.6326531 47.7551020
## [49] 48.8775510 50.0000000
## Data frame versus matrix
my_df <- data.frame(my_matrix, 
                    letters=c("A", "b", "c", "d","e"))

## Character vectors
my_name_vector <- c("Anne", "Bob", "Charles")

## Check the class of objects
class(my_name_vector)
## [1] "character"
class(my_normal_vector)
## [1] "numeric"
class(my_huge_matrix)
## [1] "matrix" "array"
class(my_df)
## [1] "data.frame"

Basic plotting

Histogram

#-------------------------------------------------------------
## 

## Histogram: hist
## ?hist
## Histogram of first five columns of my_huge_matrix

hist(my_huge_matrix[,c(1:5)] )

hist(my_huge_matrix[,1] ,w=3)

hist(my_huge_matrix[,2] ,w=3)

hist(my_huge_matrix[,3], main="Better histogram",
     xlab="Value", breaks=50, 
     col="cornflowerblue" ,w=3)

#To put all the 6 plots in a 2 x 3 matrix.
par(mfrow=c(2,3))
hist(my_huge_matrix[,1])
hist(my_huge_matrix[,2])
hist(my_huge_matrix[,3], main="Better histogram",
     xlab="Value", breaks=50, 
     col="cornflowerblue")

## Scatterplots

data(iris)
# ?plot
## Plot petal length vs petal width
col_vector <- rep(NA, nrow(iris))
setosa_index <- which(iris$Species == "setosa")
versicolor_index <- which(iris$Species == "versicolor")
virginica_index <- which(iris$Species == "virginica")
col_vector[setosa_index] <- "blue"
col_vector[versicolor_index] <- "red"
col_vector[virginica_index] <- "black"

##The pch arguement in the plot function can be varied to change the marker. 
plot(x=iris$Petal.Length, 
     y=iris$Petal.Width,
     col=col_vector, pch=19)
legend("topleft", pch=19,
       col=c("blue","red","black"),
       legend=c("Setosa", "Versicolor", "Virginica"))

## Plot Petal length vs width, coloring by sepal length
col_vector <- rep(NA, nrow(iris))
high_index <- which(iris$Sepal.Length > 
                      mean(iris$Sepal.Length))
col_vector[high_index] <- "red"
col_vector[-high_index] <- "blue"
shape_vector <- rep(NA, nrow(iris))
shape_vector[setosa_index] <- "S"
shape_vector[versicolor_index] <- "E"
shape_vector[virginica_index] <- "I"

plot(x=iris$Petal.Length, 
     y=iris$Petal.Width,
     col=col_vector, pch=shape_vector)
legend("topleft", pch=19,
       col=c("blue","red"),
       legend=c("Low sepal length", 
                "High sepal length"))

plot(x=1:nrow(iris), y=sort(iris$Petal.Width),
     type="l")

Boxplots

boxplot(iris$Petal.Length ~ iris$Species,
        col=c("red", "blue", "green"))

## Density plots
## Hint: which function to select species
## lines()
plot(density(iris$Petal.Length[which(iris$Species == 
                                       "setosa")]),
     xlim=c(0,8))
lines(density(iris$Petal.Length[which(iris$Species == 
                                       "virginica")]))
lines(density(iris$Petal.Length[which(iris$Species == 
                                       "versicolor")]))

Help

#R is case sensitive.
#R index starts from 1

####Help
#Help Home Page, Special Character Help, Search Help, Search Function (with partial name)
# 
# help.start()
# 
# help('$')
# 
# ?"$"

# apropos('med')

example(median)
## 
## median> median(1:4)                # = 2.5 [even number]
## [1] 2.5
## 
## median> median(c(1:3, 100, 1000))  # = 3 [odd, robust]
## [1] 3
# help(package = "dplyr")

#Get help on a function of a package
# help("tally", package = "dplyr")

R basics

####Basics

# The arrow operator (<-) sets the left-hand side equal to the right-hand side.
# It computes a1 and assigns it to the variable a2 
#a2<- a1

a1 <- 6 + 7
a2 <- a1
# View(a2) 

rm(a2)

#Comments starts with #


####R Built-in Data Sets
# data()

# Loading datasets
data(mtcars)

# Print the first 6 rows of a dataset
head(mtcars, 6)
# ?mtcars

# Number of rows (observations)
nrow(mtcars)
## [1] 32
# Number of columns (variables)
ncol(mtcars)
## [1] 11
data(iris)


# You could find a large number of datasets at https://archive.ics.uci.edu/ml/datasets.php

Using Packages

You could find a large number of datasets at https://archive.ics.uci.edu/ml/datasets.php

Using Packages

R packages: collections of functions and data sets developed by the community. Examples: “dplyr” or “data.table”

A number of packages are automatically loaded when you start R (e.g., “base”, “utils”, “graphics”, and “stats”)

First you install packages and then you should load them using command “library()”

install & load library()

install packages() install.packages(“tidyr”)

library(tidyr) browseVignettes()

browseVignettes(package=“packagename”)

vignette(package = “ggplot2”)

Unload packages

detach(packageName)

require(ggplot2)

Working Directory

current working directory where inputs are found, and outputs are sent.

getwd()

Change the current working directory.

setwd(C://file/path)

What working directory are we in?

Set working directory to a new place !!! You need to change this! setwd(‘/Users/kravvaz/Desktop/PH718_spring20/Lecture2’)

### Try the following commands
getwd() ## Get working directory
## [1] "C:/Users/anemati/OneDrive - mcw.edu/UWM/Spring/PH 718 R/R code In class"
dir()   ## What else is in that directory?
##  [1] "code-in-class.pdf"                     
##  [2] "code-in-class.Rmd"                     
##  [3] "code-in-class_1.pdf"                   
##  [4] "code-in-class_1.Rmd"                   
##  [5] "code-in-class_files"                   
##  [6] "comment"                               
##  [7] "cowplot.R"                             
##  [8] "data-wrangling-cheatsheet.pdf"         
##  [9] "Done"                                  
## [10] "execute.R"                             
## [11] "first_ggplot.pdf"                      
## [12] "first_ggplot.png"                      
## [13] "Pearson Product-Moment Correlation.pdf"
## [14] "PH718_Lecture5_Scripts_updated.R"      
## [15] "PH718_Lecture6_Scripts.R"              
## [16] "PH718_Lecture7_Scripts.R"              
## [17] "publishers.csv"                        
## [18] "R code for bio.Rproj"                  
## [19] "Spearman_s Rank-Order Correlation.pdf" 
## [20] "superheroes.csv"                       
## [21] "Types of Variable.pdf"
objects()
##  [1] "a1"               "col_vector"       "gen_exp"          "high_index"      
##  [5] "iris"             "mtcars"           "my_df"            "my_huge_matrix"  
##  [9] "my_matrix"        "my_matrix_2"      "my_name_vector"   "my_normal_vector"
## [13] "my_vector"        "setosa_index"     "shape_vector"     "versicolor_index"
## [17] "virginica_index"
ls()
##  [1] "a1"               "col_vector"       "gen_exp"          "high_index"      
##  [5] "iris"             "mtcars"           "my_df"            "my_huge_matrix"  
##  [9] "my_matrix"        "my_matrix_2"      "my_name_vector"   "my_normal_vector"
## [13] "my_vector"        "setosa_index"     "shape_vector"     "versicolor_index"
## [17] "virginica_index"
rm(list=c("x", "y"))
## Warning in rm(list = c("x", "y")): object 'x' not found
## Warning in rm(list = c("x", "y")): object 'y' not found
ls()
##  [1] "a1"               "col_vector"       "gen_exp"          "high_index"      
##  [5] "iris"             "mtcars"           "my_df"            "my_huge_matrix"  
##  [9] "my_matrix"        "my_matrix_2"      "my_name_vector"   "my_normal_vector"
## [13] "my_vector"        "setosa_index"     "shape_vector"     "versicolor_index"
## [17] "virginica_index"

Data Types

Example of data type

a1 <- 6 + 7
a2 <- a1

is.numeric(a2)
## [1] TRUE
nchar(mtcars$hp)
##  [1] 3 3 2 3 3 3 3 2 2 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 2 3 3 3 3 3
#Stores just as a date.
date1 <- as.Date('2020-01-23')

#Internally, Date objects are stored as the number of days since January 1, 1970
#To convert a Date object to its internal form, number of days since 1/1/1970. 
as.numeric(date1)
## [1] 18284
#Stores a date and time. 
date2 <- as.POSIXct("2020-01-23 17:30")
#In numeric form, number of seconds since 1/1/1970.
as.numeric(date2)
## [1] 1579822200
as.numeric(TRUE)
## [1] 1
class(iris$Species)
## [1] "factor"
levels(iris$Species)
## [1] "setosa"     "versicolor" "virginica"
str(iris$Species)
##  Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Data Structures

v1 <- c(1, 2, 3, 4, 5, 6, 7, 8)
length(v1)
## [1] 8
unique(v1)
## [1] 1 2 3 4 5 6 7 8
rev(v1)
## [1] 8 7 6 5 4 3 2 1
v1[order(v1)]
## [1] 1 2 3 4 5 6 7 8
v1[v1 > 3]
## [1] 4 5 6 7 8
v1[1:3]
## [1] 1 2 3
v1[c(1,6)]
## [1] 1 6
v1[-(2:4)]
## [1] 1 5 6 7 8
v1[v1 == 5]
## [1] 5
v1[v1 %in% c(1, 2, 5)]
## [1] 1 2 5
v1 <- c(1, 2, 3, 4, 5, 6, 7, 8)
v1 <- 1:8
v1 <- seq(from=1, to=8, by=1)

#row.names(data.frame)
#nrow()
#length()
#subset(data.frame, [condition])
#rbind()

Vector

# with numerics from 1 up to 10
my_vector <- 1:10
my_vector
##  [1]  1  2  3  4  5  6  7  8  9 10

Matrix

# with numerics from 1 up to 9
my_matrix <- matrix(1:9, ncol = 3)
my_matrix
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9

data frame

#First 10 elements of the built-in data frame mtcars
my_df <- mtcars[1:10,]
my_df

list

#Construct list with these different elements:
my_list <- list(my_vector, my_matrix, my_df)
my_list
## [[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## [[3]]
##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4

Factor

#How to create a factor?
factor1 <- factor(c("single", "married", "married", "single")) 
factor1 
## [1] single  married married single 
## Levels: married single
factor2 <- factor(c("single", "married", "married", "single"), 
                levels = c("single", "married", "divorced"));
factor2
## [1] single  married married single 
## Levels: single married divorced
class(factor2)
## [1] "factor"
levels(factor2)
## [1] "single"   "married"  "divorced"
factor3 <- factor(c("single","married","married","single"))
str(factor3)
##  Factor w/ 2 levels "married","single": 2 1 1 2
#How to access components of a factor?
factor3
## [1] single  married married single 
## Levels: married single
factor3[3]           # access 3rd element
## [1] married
## Levels: married single
factor3[c(2, 4)]     # access 2nd and 4th element
## [1] married single 
## Levels: married single
factor3[-1]          # access all but 1st element
## [1] married married single 
## Levels: married single
factor3[c(TRUE, FALSE, FALSE, TRUE)]  # using logical vector
## [1] single single
## Levels: married single
#How to modify a factor?
factor3
## [1] single  married married single 
## Levels: married single
factor3[2] <- "divorced"    # modify second element;  
## Warning in `[<-.factor`(`*tmp*`, 2, value = "divorced"): invalid factor level,
## NA generated
factor3[3] <- "widowed"    # cannot assign values outside levels
## Warning in `[<-.factor`(`*tmp*`, 3, value = "widowed"): invalid factor level, NA
## generated
factor3
## [1] single <NA>   <NA>   single
## Levels: married single
#Add a value to the level first.
levels(factor3) <- c(levels(factor3), "widowed")    # add new level
factor3[3] <- "widowed"
factor3
## [1] single  <NA>    widowed single 
## Levels: married single widowed
#NA
#Missing data is encoded using value NA.
is.na(factor3) #tests whether factor3 is an NA
## [1] FALSE  TRUE FALSE FALSE

Plot

###
set.seed(2020)
BMI<-rnorm(n=1000, m=26, sd=2)
hist(BMI)

hist(BMI, breaks=15, main="Breaks=15")

hist(BMI, breaks=5, main="Breaks=5")

#breaks() option with a vector gives you more control over exactly the breakpoints between bins by dictating the start and end point of each bin.
hist(BMI, breaks=c(18, 21, 24, 27, 30, 33, 36), main="Breaks is vector of breakpoints")

hist(BMI, breaks=seq(18,36,by=3), main="Breaks is vector of breakpoints")

#Density plot: freq=FALSE option
hist(BMI, freq=FALSE, main="Density plot")

Functions in R

Here is a simple example

#
example.sum <- function(a, b) {
  return(a + b)
}

x <- 1:10
y <- 11:20
example.sum(x, y)
##  [1] 12 14 16 18 20 22 24 26 28 30
### Naming conventions. Why not write example.sum() as:
example.sum.1 <- function(x, y, z) {
  tmp <- x + y
  tmp2 <- tmp + z
  return(tmp2)
}
x <- 1:10
y <- 11:20
z <- 100
example.sum.1(x, y, z)
##  [1] 112 114 116 118 120 122 124 126 128 130
### Try this

example.diff.sum <- function(x, y) {
  newy = example.sum(x, y) - 10
  return(newy)
}
# example.diff.sum(x, y)
example.diff.sum(10, 5)
## [1] 5
example.sum = function(a, b) {
  return(a + b)
}

example.diff.sum = function(c, d) {
  e = example.sum(c, d) - 10
  return(e)
}

x <- 1:10
y <- 11:20
# example.diff.sum(x, y)
example.diff.sum(10, 5)
## [1] 5

Example together: Hardy-Weinberg problem #3.6 from book

HWE <- function(p) {
  stopifnot(is.numeric(p))
  stopifnot(p >= 0)
  stopifnot(p <= 1)
  prob_AA <- p ^ 2
  prob_AB <- 2 * p * (1 - p)
  prob_BB <- (1 - p) ^ 2
  return(c(prob_AA, prob_AB, prob_BB))
}
HWE(0)
## [1] 0 0 1
HWE(0.5)
## [1] 0.25 0.50 0.25
# HWE(-5) # Error
# HWE("kourosh") # Error

Write a function to convert Celsuis to Fahrenheit and vice versa

C_to_F <- function(temp) {
  return((temp * 1.8) + 32)
}

F_to_C <- function(temp) {
  return((temp - 32) / 1.8)
}

par(mfrow = c(1, 2))
plot(1:100, C_to_F(1:100))
plot(-40:212, F_to_C(-40:212))

type_of_conversion can be “F_to_C” or “C_to_F”

temp_conversion <- function(temp, type_of_conversion = "F_to_C") {
  if (!type_of_conversion %in% c("F_to_C", "C_to_F")) {
    stop("STOP!!! I can only convert C to F or F to C.")
  }
  if (type_of_conversion == "F_to_C") {
    new_temp <- F_to_C(temp)
  }
  if (type_of_conversion == "C_to_F") {
    new_temp <- C_to_F(temp)
  }
  return(list(temp = new_temp, conversion = type_of_conversion))
}

temp_conversion(100, type = "C_to_F")
## $temp
## [1] 212
## 
## $conversion
## [1] "C_to_F"
temp_conversion(0, type = "F_to_C")
## $temp
## [1] -17.77778
## 
## $conversion
## [1] "F_to_C"

Indices and the which() function

x <- rnorm(100)
x[1:10]
##  [1]  0.00465041 -1.22874970 -0.14059798 -0.20732697 -0.92153058  0.36047424
##  [7]  1.66660243  1.44804634 -0.03285159 -1.62843554
x[20:30]
##  [1]  0.18713551 -0.57543743 -0.08766336  0.31167076  0.24438772 -0.45803324
##  [7]  1.32593062 -0.57952274 -0.82407695  1.08562056  1.41590793
index <- which(x < 0)

index
##  [1]   2   3   4   5   9  10  11  13  14  15  17  18  19  21  22  25  27  28  33
## [20]  35  38  41  44  45  46  49  52  57  58  60  63  64  65  66  68  72  73  74
## [39]  75  78  80  81  83  84  85  89  90  93  94  96  97 100
x[index]
##  [1] -1.22874970 -0.14059798 -0.20732697 -0.92153058 -0.03285159 -1.62843554
##  [7] -0.56266928 -0.85173525 -0.91512892 -0.44484557 -1.44896422 -0.59389701
## [13] -0.75144575 -0.57543743 -0.08766336 -0.45803324 -0.57952274 -0.82407695
## [19] -2.07172304 -0.49889987 -1.51680685 -0.62976566 -0.36027572 -0.39350268
## [25] -1.04180068 -0.63945719 -0.85649561 -0.38280458 -1.89724701 -0.69186589
## [31] -0.43251556 -0.95196059 -0.21625030 -1.04279172 -0.10986728 -1.84878973
## [37] -0.98415680 -0.21854832 -0.80092215 -0.50991847 -0.09837813 -0.83113845
## [43] -0.19782135 -0.54494659 -0.32692897 -0.25978735 -0.11579965 -0.93762421
## [49] -0.27231887 -0.47014392 -0.06493665 -0.96108831

Loops in R

An introductory example

This “initializes” the object “w” so that R knows what w “stands for.”

w <- NULL

for loop

w <- NULL
for (i in 1:10) {
  w[i] <- i + 10
}
w
##  [1] 11 12 13 14 15 16 17 18 19 20
# same code 
for(i in 1:10){
  w[i] <- i+10
  print(w[i])
}
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20

What do you think w looks like?

#w <- NULL
for (hello in 1:10) {
  w[hello] <- hello
}
w
##  [1]  1  2  3  4  5  6  7  8  9 10

Now what does w look like?

embed loops

w <- NULL
counter <- 1
for (j in 11:20) {
  for (i in 1:10) {
    w[counter] <- i + j
    counter <- counter + 1
  }
}
w
##   [1] 12 13 14 15 16 17 18 19 20 21 13 14 15 16 17 18 19 20 21 22 14 15 16 17 18
##  [26] 19 20 21 22 23 15 16 17 18 19 20 21 22 23 24 16 17 18 19 20 21 22 23 24 25
##  [51] 17 18 19 20 21 22 23 24 25 26 18 19 20 21 22 23 24 25 26 27 19 20 21 22 23
##  [76] 24 25 26 27 28 20 21 22 23 24 25 26 27 28 29 21 22 23 24 25 26 27 28 29 30

Think through these two loops carefully.

What is the difference between this loop and the last loop?

w <- NULL
counter <- 1
for(j in 11:20){
  for(i in 1:10){
    w[counter] <- i + j
  }
  counter <- counter + 1
  cat(counter ,  "  , ")
}
## 2   , 3   , 4   , 5   , 6   , 7   , 8   , 9   , 10   , 11   ,

It is usually a better idea to pre-allocate a variable of the appropriate size

R has to reallocate memory at each iteration

w <- rep(NA, length(1:10))
w
##  [1] NA NA NA NA NA NA NA NA NA NA
for(i in 1:10){
  w[i] <- i+10
}
w
##  [1] 11 12 13 14 15 16 17 18 19 20
system.time({
  w <- c()
  for(i in 1:10^6)
    w[i] <- i+10
  }
)
##    user  system elapsed 
##    0.37    0.03    0.70
system.time({
  w <- rep(NA, 10^6)
  for(i in 1:10^6)
    w[i] <- i+10
  }
)
##    user  system elapsed 
##    0.08    0.00    0.08

using a nested loop

In class example: Construct a 10 x 10 multiplication table

my_mat <- matrix(NA, nrow=10, ncol=10)
for(rows in 1:10) {
  for(cols in 1:10) {
    my_mat[rows, cols] <- rows * cols
  }
}
my_mat
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]    2    4    6    8   10   12   14   16   18    20
##  [3,]    3    6    9   12   15   18   21   24   27    30
##  [4,]    4    8   12   16   20   24   28   32   36    40
##  [5,]    5   10   15   20   25   30   35   40   45    50
##  [6,]    6   12   18   24   30   36   42   48   54    60
##  [7,]    7   14   21   28   35   42   49   56   63    70
##  [8,]    8   16   24   32   40   48   56   64   72    80
##  [9,]    9   18   27   36   45   54   63   72   81    90
## [10,]   10   20   30   40   50   60   70   80   90   100

An example of a “while” loop

  • while loop: Based on onset and verification of a logical
  • condition that is tested at beginning or end of the loop
  • Make sure the termination is explicitly set by testing a condition!
w <- 100
z <- 5

while (w > 20) {
  w.plus.z <- w + z
  w <- w - 1
}
w
## [1] 20
w <- 100
z <- 5
while (w < 20) {
  # if w > 20 => inf loop
  w.plus.z <- w + z
  w <- w + 1
}
w
## [1] 100

Make sure that your loop can be “closed.”

The following loop will run forever. DON’T actually run it!!!!!

w <- 100 z <- 5 while(w > 20){ w.plus.z <- w + z w <- w + 1 }

Next: using break or next

  • https://www.datamentor.io/r-programming/break-next/
  • A next statement is useful when we want to skip the current iteration of a loop without terminating it.
  • On encountering next, the R parser skips further evaluation and starts next iteration of the loop.

break statement

  • A break statement is used inside a loop (repeat, for, while) to stop the iterations and flow the control outside of the loop.
  • In a nested looping situation, where there is a loop inside another loop, this statement exits from the innermost loop that is being evaluated.

Next eample

m=20
for ( k in 1:m){
  if(!k %% 2)
    next
  print(k)
}
## [1] 1
## [1] 3
## [1] 5
## [1] 7
## [1] 9
## [1] 11
## [1] 13
## [1] 15
## [1] 17
## [1] 19

Break Example

set.seed(12345) ## Set seed for random number generator
m <- rnorm(10)
for(i in 1:length(m)) {
  if(m[i] < 0)
    break
  print(m[i])
}
## [1] 0.5855288
## [1] 0.709466

Construct a lower triangular matrix

  • with a 10 x 10 multiplication table using a nested loop.
my_mat <- matrix(NA , nrow =10 , ncol=10)
for(rows in 1:10){
  for(cols in 1:10){
    if(rows < cols) next
    my_mat[rows , cols] <- rows  * cols
  }
}
my_mat
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   NA   NA   NA   NA   NA   NA   NA   NA    NA
##  [2,]    2    4   NA   NA   NA   NA   NA   NA   NA    NA
##  [3,]    3    6    9   NA   NA   NA   NA   NA   NA    NA
##  [4,]    4    8   12   16   NA   NA   NA   NA   NA    NA
##  [5,]    5   10   15   20   25   NA   NA   NA   NA    NA
##  [6,]    6   12   18   24   30   36   NA   NA   NA    NA
##  [7,]    7   14   21   28   35   42   49   NA   NA    NA
##  [8,]    8   16   24   32   40   48   56   64   NA    NA
##  [9,]    9   18   27   36   45   54   63   72   81    NA
## [10,]   10   20   30   40   50   60   70   80   90   100
my_mat[1,3]
## [1] NA

another example replicate function for repeated

### There is also the replicate function for repeated 
### evaluation of the same thing:
m <- 10
n <- 5
mymat <- replicate(m, rnorm(n))

## This is equivalent!
mymat <- matrix(NA, nrow=5, ncol=10)
for(i in 1:ncol(mymat)) {
  mymat[,i] <- rnorm(5)
}
mymat
##            [,1]       [,2]        [,3]       [,4]       [,5]       [,6]
## [1,]  0.1495920 -1.8323773  0.05461558  0.9426009  0.6453831  1.8670992
## [2,] -1.3425315  0.8881394 -0.78464937  0.8262583  1.0431436  0.6720425
## [3,]  0.5533031  1.5934885 -1.04935282 -0.8115405 -0.3043691 -0.3079534
## [4,]  1.5899628  0.5168547  2.33051196  0.4762483  2.4771109  0.5365237
## [5,] -0.5868796 -1.2956717  1.40270538  1.0212584  0.9712207  0.8248701
##            [,7]       [,8]       [,9]      [,10]
## [1,] -0.9639015  0.6873321  0.2239254 -0.5360480
## [2,] -0.8550825 -0.5050435 -1.1562233 -0.3116061
## [3,]  1.8869469  2.1577198  0.4224185  1.5561096
## [4,] -0.3918194 -0.5997976 -1.3247553 -0.4480333
## [5,] -0.9806329 -0.6945467  0.1410843  0.3211235
## This is also equivalent!
mymat <- matrix(NA, nrow=5, ncol=10)
for(cols in 1:ncol(mymat)) {
  for(rows in 1:nrow(mymat)) {
    mymat[rows,cols] <- rnorm(1)
  }
}
mymat
##             [,1]        [,2]       [,3]      [,4]       [,5]       [,6]
## [1,] -1.23017225 -0.50508981  1.5448636 -1.158821  1.6421920 -1.0479137
## [2,] -1.32405869 -0.05215359  1.3214520 -1.845368  0.8836548 -1.0111225
## [3,]  1.26124227  0.62886063  0.3221516  1.157325  0.5248759  0.6689217
## [4,]  1.31923172  2.18000240  1.5309551 -2.123550 -1.1846591  0.1291773
## [5,] -0.08075376 -0.06901731 -0.4212397 -1.196032  2.6557883 -0.4225769
##             [,7]       [,8]       [,9]       [,10]
## [1,] -1.14026414  0.5401696 -1.6193283 -0.25094662
## [2,] -1.29371529 -1.5472920  0.5483979  1.69934667
## [3,] -0.59469877  0.8496529  0.1952822 -0.34429880
## [4,] -1.50081408  0.8960132 -0.8064980  0.06777206
## [5,]  0.01585569  0.1386910 -0.1086242 -0.65056973

Checking on vector interpretation of matrices

my_mat <- matrix(1:20, nrow=4, ncol=5)
for(rows in 1:4) {
  my_mat[rows, ] <- rnorm(5)
}
my_mat
##            [,1]      [,2]        [,3]       [,4]        [,5]
## [1,] -0.4876385 0.3031512 -0.24197402 -0.4817336 -0.99180286
## [2,] -0.2806491 0.6330173 -1.23981834  1.7643141 -0.02367989
## [3,]  0.1999205 1.3471928  0.03607349  0.8245811 -1.70267185
## [4,]  0.4809502 2.4835501  0.40136499  0.2151772 -1.81571235
my_mat <- matrix(1:20, nrow=4, ncol=5)
for(rows in 1:4) {
  my_mat[rows,] <- rnorm(5)
}
my_mat
##             [,1]        [,2]       [,3]       [,4]        [,5]
## [1,] -0.91173942 -0.04904469 -0.4053875  1.1303818  0.81546474
## [2,]  0.07641752  1.45374735  0.3741211 -0.1709041 -0.50221281
## [3,]  0.54352211 -0.50518600  0.7867958  0.3009494  1.31022391
## [4,]  0.79843377  0.85086044 -0.4435680 -0.4467748  0.01330504
for(index in 1:20) {
  my_mat[index] <- my_mat[index] * 10
}
my_mat
##            [,1]       [,2]      [,3]      [,4]       [,5]
## [1,] -9.1173942 -0.4904469 -4.053875 11.303818  8.1546474
## [2,]  0.7641752 14.5374735  3.741211 -1.709041 -5.0221281
## [3,]  5.4352211 -5.0518600  7.867958  3.009494 13.1022391
## [4,]  7.9843377  8.5086044 -4.435680 -4.467748  0.1330504

Loop form

## Loop form
natural_numbers <- 1:50
sum_value <- 0
for(i in 1:length(natural_numbers)) {
  ## Is natural numbers divisible by 3?
  ## Is natural numbers divisible by 5?
  if(!natural_numbers[i] %% 3 | !natural_numbers[i] %% 5) {
    # print(natural_numbers[i])
    sum_value <- sum_value + natural_numbers[i]
  }
}

## Vectorized form
new_vector <- sum(ifelse(!natural_numbers %% 3 | !natural_numbers %% 5,
                     natural_numbers, 0))

new_indices <- ifelse(!natural_numbers %% 3 | !natural_numbers %% 5,
       TRUE, FALSE)
natural_numbers[which(new_indices == TRUE)]
##  [1]  3  5  6  9 10 12 15 18 20 21 24 25 27 30 33 35 36 39 40 42 45 48 50

Loop Example

keep <- c()
counter <- 1
while(counter < 1000) {
  if(counter %% 3 & counter %% 5) {
    counter <- counter + 1
  } else {
    keep <- c(keep, counter)
    counter <- counter + 1
  }
}
sum(keep)
## [1] 233168

Vectorization

Some Example

# ## Consider this:
v1 <- runif(10)
v1
##  [1] 0.07548045 0.47438424 0.26458955 0.23074607 0.59619939 0.15892558
##  [7] 0.85505484 0.23745380 0.79711170 0.07848559
v2 <- runif(10)
v2
##  [1] 0.54189613 0.71694462 0.04077934 0.02560045 0.74061643 0.77014907
##  [7] 0.53867819 0.38813644 0.46941770 0.12248648
v3 <- c()
for (i in 1:10) {
  v3[i] <-v1[i] + v2[i]
}
v3
##  [1] 0.6173766 1.1913289 0.3053689 0.2563465 1.3368158 0.9290746 1.3937330
##  [8] 0.6255902 1.2665294 0.2009721
## versus this:
v3 = v1 + v2
v3
##  [1] 0.6173766 1.1913289 0.3053689 0.2563465 1.3368158 0.9290746 1.3937330
##  [8] 0.6255902 1.2665294 0.2009721
# ## Consider this
A <- matrix(1:20, nrow=5, ncol=4)
B <- matrix(21:40, nrow=5, ncol=4)
matsum <- matrix(0, nrow=5, ncol=4)
for(i in 1:5) {
  for(j in 1:4) {
    matsum[i,j] <- A[i,j] + B[i,j]
  }
}
matsum
##      [,1] [,2] [,3] [,4]
## [1,]   22   32   42   52
## [2,]   24   34   44   54
## [3,]   26   36   46   56
## [4,]   28   38   48   58
## [5,]   30   40   50   60
## versus this:
matsum <- A + B
matsum
##      [,1] [,2] [,3] [,4]
## [1,]   22   32   42   52
## [2,]   24   34   44   54
## [3,]   26   36   46   56
## [4,]   28   38   48   58
## [5,]   30   40   50   60
  • Why is vectorization faster? R is an interpreted language!

Let’s consider an example where we had some artificial “signal” to random noise.

# This is a bad loop with 'growing' data
set.seed(42)
m <- 10
n <- 10
# Create matrix of normal random numbers
mymat <- replicate(m, rnorm(n))
# Transform into data frame
mydframe <- data.frame(mymat)
for (i in 1:m) {
  for (j in 1:n) {
    mydframe[i,j]<-mydframe[i,j] + 10*sin(0.75*pi)
  }
}
mydframe
# 
# ## Here is the vectorized form:
mydframe_vec <- data.frame(mymat)
mydframe_vec <- mydframe_vec + 10*sin(0.75*pi)
mydframe_vec
## What happens when you add a matrix and a vector?
m <- matrix(1:20, nrow=4, ncol=5)
v <- c(1,10,100,1000,10000)
m + v
##      [,1]  [,2]  [,3]  [,4]  [,5]
## [1,]    2 10005  1009   113    27
## [2,]   12     7 10010  1014   118
## [3,]  103    17    12 10015  1019
## [4,] 1004   108    22    17 10020

versions of the above example using system.time.

# This is a bad loop with 'growing' data

time_check <- function(m,n) {
  set.seed(42)
  # Create matrix of normal random numbers
  mymat <- replicate(m, rnorm(n))
  # Transform into data frame
  mydframe <- data.frame(mymat)
  time_1 <- system.time({
    for (i in 1:m) {
      for (j in 1:n) {
        mydframe[i,j]<-mydframe[i,j] + 10*sin(0.75*pi)
      }
    }
  })
  time_2 <- system.time({
    mydframe_vec <- mydframe + 10*sin(0.75*pi)
  })
  return(list(loop = time_1, vectorized=time_2))
}

time_check(10,10)
## $loop
##    user  system elapsed 
##    0.02    0.00    0.01 
## 
## $vectorized
##    user  system elapsed 
##       0       0       0
time_check(100,100)
## $loop
##    user  system elapsed 
##    0.31    0.00    0.33 
## 
## $vectorized
##    user  system elapsed 
##    0.02    0.00    0.02
time_check(1000,1000)
## $loop
##    user  system elapsed 
##   53.79    0.06   55.98 
## 
## $vectorized
##    user  system elapsed 
##    0.07    0.00    0.06

logical operators

Simple logical operators

a = TRUE
b = FALSE

a & b
## [1] FALSE
a | b
## [1] TRUE
!(a & b)
## [1] TRUE
!a | !b
## [1] TRUE

Other operators that evaluate to TRUE or FALSE

x <- 10
x < 20
## [1] TRUE
x <- 1:10
x < 5
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
x == 6
##  [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

if-else conditions

x <- 5
ifelse(x==4, 1, 0)
## [1] 0
x <- 1:10
ifelse(x<=4, 1, 0)
##  [1] 1 1 1 1 0 0 0 0 0 0
y <- NULL
x <- seq(0.1, 1, by=0.1)
for(i in 1:10){
    if(x[i] < 0.5){
        y[i] <- x[i]^2
    }else{
        y[i] <- sqrt(x[i])}
}

y
##  [1] 0.0100000 0.0400000 0.0900000 0.1600000 0.7071068 0.7745967 0.8366600
##  [8] 0.8944272 0.9486833 1.0000000
# What does this evaluate to, and why?
(TRUE + TRUE) * FALSE
## [1] 0
(TRUE + TRUE) * TRUE
## [1] 2

Installing packages in R

Package = fundamental unit of shareable code

Bundles together code, data, documentation, tests, and is easy to share with others

3 main repositories for packages in R:

GitHub: Install the package of interest from GitHub, for example install_github(“Displayr/flipPlots”)

There are > 6000 packages at CRAN alone! And growing rapidly!!

=> Chances are, someone has probably already solved a problem you’re working on

Make sure to give credit to package developers if you use their work! (citations etc)

Let’s practice installing a couple of packages:

Tidyverse and devtools package from CRAN

install.packages(c(“tidyverse”, “devtools”)) ## You only need to do this once per package

library(tidyverse)

library(devtools) ?tidyverse

edgeR RNA-seq expression analysis package from Bioconductor

This is just an example, we won’t actually be using edgeR today

source(“http://www.bioconductor.org/biocLite.R”) ## You only need to do this once per package

biocLite(“edgeR”) ## You only need to do this once per package

library(edgeR)

vignette(“edgeR”)

Install a GitHub package (using the devtools package above)

This is just an example, we won’t actually be using AnomalyDetection today

install_github(“twitter/AnomalyDetection”) ## You only need to do this once per package

library(AnomalyDetection)

Core structures for analysis: data.frames and tibbles First of all: What is the tidyverse? https://www.tidyverse.org/ - A “meta-package” that is an optionated collection of several R packages for data science - Includes ggplot2 (plotting), dplyr (transformating/summarizing dataframe content), tidyr (tidying dataframes), purr, tibble

Pipe operator: f(x) can be rewritten as x %>% f

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(devtools)
## Loading required package: usethis
data(iris)
head(iris)
iris %>% head()
mean.sepal <- mean(iris$Sepal.Length)
mean.sepal <- iris$Sepal.Length %>% mean()


x <- c(0.109, 0.359, 0.63, 0.996, 0.515, 0.142, 0.017, 0.829, 0.907)

round(exp(diff(log(x))), 1)
## [1]  3.3  1.8  1.6  0.5  0.3  0.1 48.8  1.1
x %>% log() %>%
  diff() %>%
  exp() %>%
  round(1)
## [1]  3.3  1.8  1.6  0.5  0.3  0.1 48.8  1.1

dplyr, pipe operator

data(starwars)
#
## Look at data: what do rows represent? how about columns?
starwars
# 
# ## filter(): filter data frame on row
filter(starwars, species=="Droid")
starwars[which(starwars$species=="Droid"),] ## base R equivalent
starwars %>% filter(species=="Droid")     ## equivalent with a pipe
starwars %>% filter(., species=="Droid")     ## equivalent with a pipe
starwars %>% filter(species=="Droid" & eye_color=="red")
new_df <- starwars %>% 
  filter(species=="Droid", eye_color=="red") ## 'and' conditions can be separated with comma
new_df
# ## select(): filter data frame on column
starwars %>% select(mass, homeworld)
starwars %>% select(-hair_color)   ## We can remove rows using a '-'
starwars %>%
  filter(species=="Droid") %>%
  select(name, mass)   ## Filter and select can be combined using a pipe %>%
# ## mutate(): add new column to data frame
# ## Note: we generally avoid overwriting data in a data frame, better to create a new variable
starwars %>%
  mutate(height_in_inches=0.393701 * height) 
 # %>% View()

# ## group_by(): establish a grouping for downstream operations, remove with ungroup()
# ## tally(): count number of observations per grouping
starwars %>%
  group_by(species) %>%
  tally()
# ## summarize(): perform summary statistic on a column. Fun fact: this also exists as summarise()
starwars %>% summarize(mean_mass = mean(mass, na.rm=TRUE))
starwars %>%
  group_by(species) %>%
  summarize(mean_mass = mean(mass, na.rm=TRUE))
# ## arrange(): arrange a column

starwars %>% arrange(desc(mass)) #  %>% View()
starwars %>% arrange(mass)       #  %>% View()
# ## unique, na.omit()
starwars %>% unique()
starwars %>% na.omit()
## count() vs tally()


# In brief, tally() is short-hand for summarise() 

# and

# count() is a short-hand for group_by() + tally()

# Here are some examples:

mtcars %>% tally()
mtcars %>% group_by(cyl) %>% tally()
mtcars %>% count(cyl)

ggplot2

-If there’s time: a quick start on ggplot2

ggplot2 grammar: - data = dataframe being plotted - geometrics = the geometric shape that represents the data (point, boxplot, histogram, violin, bar, etc) - aesthetics = aesthetics of the geometric object (color, size, shape, etc)

library(ggplot2)
ggplot(starwars, aes(x=mass, y=height)) +
  geom_point()
## Warning: Removed 28 rows containing missing values (geom_point).

# ## NOTE: did you see that we are using '+' here for ggplot2 and not the pipe '%>%'?

plot(starwars$mass, starwars$height) ## base R version

## These are all the same

ggplot() +
  geom_point(data=starwars, color="red", 
             aes(x=mass, y=height))
## Warning: Removed 28 rows containing missing values (geom_point).

ggplot() +
  geom_point(data=starwars, color="red",
             aes(x=mass, y=height))
## Warning: Removed 28 rows containing missing values (geom_point).

ggplot(data=starwars) +
  geom_point(color="red", aes(x=mass, y=height))
## Warning: Removed 28 rows containing missing values (geom_point).

ggplot(starwars, aes(x=mass, y=height)) +
  geom_point(color="red")
## Warning: Removed 28 rows containing missing values (geom_point).

If we place color inside of the aesthetics, it maps it to the data

ggplot(starwars, aes(x=mass, y=height, color=gender)) +
  geom_point()
## Warning: Removed 28 rows containing missing values (geom_point).

new_df <- starwars %>% filter(mass < 500)
ggplot(new_df, 
       aes(x=mass, y=height, color=gender)) +
  geom_point()

If we place color inside of the aesthetics, it maps it to the data

ggplot(starwars, aes(x=mass, y=height, color=gender)) +
  geom_point()
## Warning: Removed 28 rows containing missing values (geom_point).

new_df <- starwars %>% filter(mass < 500)
new_df
ggplot(new_df, 
       aes(x=mass, y=height, color=gender)) +
  geom_point()

## We can change shape aesthetics too

ggplot(starwars, aes(x=mass, y=height, color=gender, shape=gender)) +
  geom_point()
## Warning: Removed 29 rows containing missing values (geom_point).

## Aesthetics can be placed inside the relevant geom: the following two are equivalent

library(ggplot2)

ggplot(starwars, aes(x=mass, y=height, color=gender, shape=gender)) +
  geom_point()
## Warning: Removed 29 rows containing missing values (geom_point).

ggplot(starwars) +
  geom_point(aes(x=mass, y=height, color=gender, shape=gender))
## Warning: Removed 29 rows containing missing values (geom_point).

## Aesthetics are only for mapping:

ggplot(starwars) + geom_point(aes(x=mass, y=height, color=“blue”)) ## This doesn’t work

ggplot(starwars) + geom_point(aes(x=mass, y=height), color=“blue”) ## This does work

Can use multiple geoms, combine with dplyr functions

ggplot(filter(starwars, mass < 500), aes(x=mass, y=height)) +
  geom_point()  +
  geom_line()

## Other geoms: histograms, boxplots, density plots, violin plots, barplots

ggplot(starwars) +
  geom_histogram(aes(x=height), fill="orange", color="brown")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

ggplot(starwars) +
  geom_boxplot(aes(x="", y=log(mass)))
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).

ggplot(starwars) +
  geom_boxplot(aes(x=gender, y=log(mass), fill=gender))
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).

ggplot(starwars) +
  geom_violin(aes(x=gender, y=log(mass), fill=gender))
## Warning: Removed 28 rows containing non-finite values (stat_ydensity).
## Warning: Groups with fewer than two data points have been dropped.

ggplot(starwars) +
  geom_bar(aes(x=gender, fill=gender))

# 
ggplot(starwars) +
  geom_density(aes(x=log(mass), fill=gender))
## Warning: Removed 28 rows containing non-finite values (stat_density).

## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

ggplot(starwars) +
  geom_density(aes(x=log(mass), fill=gender), alpha=0.9)  
## Warning: Removed 28 rows containing non-finite values (stat_density).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Can add transparency to object

names(starwars)
##  [1] "name"       "height"     "mass"       "hair_color" "skin_color"
##  [6] "eye_color"  "birth_year" "sex"        "gender"     "homeworld" 
## [11] "species"    "films"      "vehicles"   "starships"
starwars$gender
##  [1] "masculine" "masculine" "masculine" "masculine" "feminine"  "masculine"
##  [7] "feminine"  "masculine" "masculine" "masculine" "masculine" "masculine"
## [13] "masculine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [19] "masculine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [25] "masculine" "masculine" "feminine"  "masculine" "masculine" "masculine"
## [31] "masculine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [37] NA          "masculine" "masculine" NA          "feminine"  "masculine"
## [43] "masculine" "feminine"  "masculine" "masculine" "masculine" "masculine"
## [49] "masculine" "masculine" "masculine" "feminine"  "masculine" "masculine"
## [55] "masculine" "masculine" "masculine" "feminine"  "masculine" "masculine"
## [61] "feminine"  "feminine"  "feminine"  "masculine" "masculine" "masculine"
## [67] "feminine"  "masculine" "masculine" "feminine"  "feminine"  "masculine"
## [73] "feminine"  "masculine" "masculine" "feminine"  "masculine" "masculine"
## [79] "masculine" NA          "masculine" "masculine" "feminine"  "masculine"
## [85] "masculine" NA          "feminine"

Some additional options

ggplot(starwars) +
  geom_histogram(aes(x=height), fill="orange", color="brown") +
  xlab("Height") +
  ylab("Count") +
  ggtitle("Histogram of Star Wars character heights")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

g1 <- ggplot(starwars) +
  geom_histogram(aes(x=height), fill="orange", color="brown") 



ggplot(starwars) +
  geom_histogram(aes(x=height), fill="orange", color="brown") +
  xlab("Height") +
  ylab("Count") +
  ggtitle("Histogram of Star Wars character heights") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

First some review about the tidyverse:

dplyr

=> filter, select, mutate, group_by, tally, summarize, arrange, unique, na.omit

library(dplyr)
data(iris)
# iris %>% select(Sepal.Length)%>% mean()
mean(iris$Sepal.Length)
## [1] 5.843333
iris %>% summarize(mean.SL = mean(Sepal.Length))
## Mean Sepal.Length per species
iris %>% 
  group_by(Species) %>% 
  summarize(mean(Sepal.Length),
            median.SL = median(Sepal.Length))
iris %>% arrange(Sepal.Length, Sepal.Width)
iris %>% arrange(desc(Sepal.Length))
# arrange() orders the rows of a data frame by the values of selected columns.

ggplot2

=> data + geom + aesthetics

geom_point, geom_histogram, geom_line, geom_boxplot, geom_violin

Name conflicts in package functions (e.g. filter and lag in tidyverse)

?filter

dplyr::filter stats::filter my_filter <- dplyr::filter filter <- dplyr::filter

Both packages dplyr and MASS – function select

ggplot(data = ) + ( mapping = aes(), stat = , position = ) + +

ggplot2: Statistical transformations

library(tidyverse)
data(diamonds)

## What is plotted here? Where does count come from?
## Histograms, barplots, frequency polygons, smoothers, boxplots, ...
ggplot(diamonds) +
  geom_bar(aes(x = cut))

ggplot(diamonds) +
  stat_count(aes(x = cut))

With bar graphs, there are two different things that the heights of bars commonly represent:

  1. The count of cases for each group - typically, each x value represents one group. This is done with stat_bin, which calculates the number of cases in each group (if x is discrete, then each x value is a group; if x is continuous, then all the data is automatically in one group, unless you specify grouping with group=xx).

  2. The value of a column in the data set. This is done with stat_identity, which leaves the y values unchanged.

For line graphs, the data points must be grouped so that it knows which points to connect.

If group = 1, then all points would be considered connected.
When more variables are used and multiple lines are drawn, the grouping for lines is usually done by variable.

We can override default stat for a given geom

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1))

What happens if we don’t set group = 1 above? What is wrong with the following two graphs?

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = ..prop..))

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..))

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = ..prop..))

We can even explicitly use stat_## instead of the geom By default, each stat is associated with a geom and vice versa

ggplot(data = diamonds) +
  stat_count(mapping = aes(x = cut))

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.ymin = min,
    fun.ymax = max,
    fun.y = median
  )
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

Stacked versus side-by-side barplots

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = clarity))

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = clarity), position="fill")

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = clarity), position="dodge")

geom_smooth: What variables does stat_smooth compute? What parameters control its behavior?

http://statseducation.com/Introduction-to-R/modules/graphics/smoothing/

It can be hard to view trends with just points alone. Many times we wish to add a smoothing line in order to see what the trends look like.

This can be especially helpful when trying to understand regressions.

http://search.r-project.org/library/ggplot2/html/geom_smooth.html Smoothing method (function) to use, accepts either NULL or a character vector,

e.g. “lm”, “glm”, “gam”, “loess” or a function,

e.g. MASS::rlm or mgcv::gam, stats::lm, or stats::loess. “auto” is also accepted for backwards compatibility. It is equivalent to NULL.

For method = NULL the smoothing method is chosen based on the size of the largest group (across all panels). stats::loess() is used for less than 1,000 observations;

otherwise mgcv::gam() is used with formula = y ~ s(x, bs = “cs”) with method = “REML”.

Somewhat anecdotally, loess gives a better appearance, but is O(N^2) in memory, so does not work for larger datasets.

If you have fewer than 1,000 observations but want to use the same gam() model

that method = NULL would use, then set method = “gam”, formula = y ~ s(x, bs = “cs”).

Residual maximum likelihood (REML)

Local Polynomial Regression Fitting (LOESS)

mgcv::gam(): Generalized additive models with integrated smoothness estimation

MASS::rlm(): Robust Fitting of Linear Models

ggplot(diamonds, aes(x=carat, y=price, color=cut)) +
  geom_point() +
  geom_smooth(aes(fill=cut), alpha=0.5)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(diamonds, aes(x=carat, y=price, color=cut)) +
  geom_point() +
  geom_smooth(aes(fill=cut), alpha=0.5, method="lm")
## `geom_smooth()` using formula 'y ~ x'

geom_smooth

ggplot2: overplotting & Jittering

What to do about overplotting? Jittering?

Jittering is the act of adding random noise to data in order to prevent overplotting in statistical graphs.

Overplotting can occur when a continuous measurement is rounded to some convenient unit. This has the effect of changing a continuous variable into a discrete ordinal variable.

For example, age is measured in years and body weight is measured in pounds or kilograms. If you construct a scatter plot of weight versus age for a sufficiently large sample of people, there might be many people recorded as, say, 29 years and 70 kg, and therefore many markers plotted at the point (29, 70).

To alleviate overplotting, you can add small random noise to the data. The size of the noise is often chosen to be the width of the measurement unit. For example, to the value 70 kg you might add the quantity u, where u is a uniform random variable in the interval [-0.5, 0.5].

You can justify jittering by assuming that the true weight of a 70 kg person is equally likely to be anywhere in
the interval [69.5, 70.5].

https://blogs.sas.com/content/iml/2011/07/05/jittering-to-prevent-overplotting-in-statistical-graphics.html

Look at the parameters to control the amount of jittering When your dataset is large, the dots of your scatterplot will tend to overlap, making the graphic unreadable.

The jitter geom is a convenient shortcut for geom_point(position = “jitter”). It adds a small amount of random variation to the location of each point, and is a useful way of handling overplotting caused by discreteness in smaller datasets.

data(mpg)
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")

ggplot(data = mpg) +
  geom_jitter(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_count(mapping = aes(x = displ, y = hwy))

alpha aesthetic will make the points more transparent

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), alpha=0.25)

Facets: smaller plots that display a subset of the data Useful for exploring conditional relationships and large data

facet_wrap

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(~trans)

## What is the difference between facet_wrap and facet_grid?
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~trans)

## What happens if you facet on a continuous variable?

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~cty)

coordinates in ggplot: can flip axes with coord_flip

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  coord_flip()

Saving figures: ggsave

g <- ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  coord_flip()
ggsave(g, filename="first_ggplot.pdf")
## Saving 7 x 5 in image
ggsave(filename="first_ggplot.png")  ## Save last plot
## Saving 7 x 5 in image

Combining ggplot2 figures

#install.packages("cowplot")  ## Only do this once!
library(cowplot)
plot_grid(g, g, labels=c("A","B"))

ggplot2 + forcats: Factors and group ordering

R uses factors to handle categorical variables, variables that have a fixed and known set of possible values.

Factors are also helpful for reordering character vectors to improve display.

Working with factors and group order in ggplot: forcats

install.packages(“forcats”) ## not explicitly part of the tidyverse

The goal of the forcats package is to provide a suite of tools that

solve common problems with factors, including changing the order of

levels or the values. Some examples include:

  • fct_reorder(): Reordering a factor by another variable.
  • fct_infreq(): Reordering a factor by the frequency of values.
  • fct_relevel(): Changing the order of a factor by hand.
  • fct_lump(): Collapsing the least/most frequent values of a factor into “other”.

https://forcats.tidyverse.org/

library(forcats)

colors <- c("red","blue","green","green","red","blue","red","green","blue")
class(colors)
## [1] "character"
colors <- factor(colors)
print(colors)
## [1] red   blue  green green red   blue  red   green blue 
## Levels: blue green red
class(colors)
## [1] "factor"
# Each color is a level, and underlying each level is an integer associated with that
# level; red = 3, green = 2, blue = 1. The order of the levels is assigned
# alphabetically to the integer: b = 1, g = 2, r = 3.
colors <- factor(colors, levels = c("red","blue", "green"))
print(colors)
## [1] red   blue  green green red   blue  red   green blue 
## Levels: red blue green
data(airquality)
airquality$Month <- factor(airquality$Month)

levels(airquality$Month)
## [1] "5" "6" "7" "8" "9"
ggplot(airquality, aes(Month, Temp)) +
  geom_boxplot(aes(fill = Month)) +
  ggtitle(label = "Daily Temperatures Aggregated by Month")

Renaming factor levels: fct_recode – Manually change levels.

airquality$Month <- fct_recode(airquality$Month,
                               May='5', June='6', July='7', Aug='8', Sept='9')

ggplot(airquality, aes(Month, Temp)) +
  geom_boxplot(aes(fill = Month)) +
  ggtitle(label = "Daily Temperatures Aggregated by Month")

Reverse factor order: fct_reverse

ggplot(airquality, aes(fct_rev(Month), Temp)) +
  geom_boxplot(aes(fill = Month)) +
  labs(x = "Month") +
  ggtitle(label = "Our plot now has the x-axis in reverse order")

Factor relevel – Manually reorder factor levels

airquality$Month <- fct_relevel(airquality$Month, 'Sept', 'July', 'May', 'Aug', 'June')
levels(airquality$Month)
## [1] "Sept" "July" "May"  "Aug"  "June"
ggplot(airquality, aes(Month, Temp)) +
  geom_boxplot(aes(fill = Month)) +
  ggtitle(label = "Notice how the order of the level 'Month' has changed")

## order factors according to another variable

data(mtcars)

mtcars$model <- factor(row.names(mtcars))

class(mtcars$model)
## [1] "factor"
ggplot(mtcars, aes(mpg, model)) +
  geom_point() +
  ggtitle(label = "MPG vs. Car Model")

ggplot(mtcars, aes(mpg, fct_reorder(model, mpg))) +
  geom_point() +
  labs(y = "model") +
  ggtitle(label = "Better comparison by reordering levels based on mpg values!") +
  theme(plot.title = element_text(size = 10, face = 'bold'))

## ggplot2: Color schemes

colors()

http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf

Color choices in R: great palette selections from RColorBrewer

Note: works in ggplot as well as base R

install.packages(“RColorBrewer”) ## Only have to do this once

library(RColorBrewer)
display.brewer.all()

3 classes of colors: sequential, qualitative, diverging

When would each type of color class be appropriate?

Another great color palette: viridis Designed to be perfectly perceptually-uniform, both in color and black-and-white

Also designed to be perceived by people with the most common form of color blindness install.packages(“viridis”) ## Only have to do this once

library(viridis)

vignette(“intro-to-viridis”)

Using color palettes with ggplot:

Think about what you’re trying to color (discrete vs continuous) and why!

Color schemes: scale_color_brewer(), scale_fill_brewer(), scale_color_gradient(), …

library(viridis)
## Loading required package: viridisLite
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color=hwy)) +
  scale_color_viridis()

## Why does this throw an error?

Error: Discrete value supplied to continuous scale

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, color=drv)) + scale_color_viridis()

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color=drv)) +
  scale_color_brewer(palette = "Accent") +
  theme_bw()

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color=drv)) +
  scale_color_viridis(discrete=TRUE) +
  theme_bw()

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color=hwy)) +
  scale_color_gradient() +
  theme_bw()

Choosing colors by hand

Ewww, this plot looks terrible, don’t do red and green together!

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color=hwy)) +
  scale_color_gradient(low="green", high="red") +
  theme_bw()

# 
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color=drv)) +
  scale_color_manual(values=c("darkblue", "seagreen1", "deeppink2")) +
  theme_bw()

dplyr: between() is a replacement for x >= left & x <= right,

x <- rnorm(1e2) x[between(x, -1, 1)]

diamonds %>% filter(between(depth, 60, 61)) %>% (view)
diamonds

Other select verbs: starts_with, ends_with, contains, everything, rename, …

starwars %>% dplyr::select(ends_with("color")) 
starwars %>% dplyr::select(contains("_"))
starwars %>% dplyr::select(everything()) 
starwars %>% dplyr::select(gender, everything()) 
starwars %>% dplyr::rename(Name = name) 
starwars %>% rename(Name = name) 

Multiple groupings & ungroup

grp <- diamonds %>% group_by(cut, color)
summarize(grp, avg_price = mean(price),
          median_carat = median(carat),
          count = n())  
## `summarise()` has grouped output by 'cut'. You can override using the `.groups` argument.
grp
## And we can ungroup
grp %>%
  ungroup() %>%
  summarize(n())
grp 

gapminder data

install.packages(“gapminder”) ## Only have to do this one time

library(gapminder)
my_gap <- data(gapminder)


# Answer the following questions:

# - How many observations are there in the data? How many observations per continent? How

# many observations are there per country within each continent? How many variables?

# What data type is each variable?

# - Calculate the average life expectancy per continent. Now calculate the median life

# expectancy per continent in 1952 and 2007.

# - Make a simple scatterplot of GDP per capita versus life expectancy for 2007, where the

# size of the points reflects the size of the population.

# - Now make the plot as ugly and uninformative as possible -- think bad color choices,

# truncated axes, overlapping points, ...

# - Now let's make it as informative as possible -- think x-axis on log10 scale, informative

# color/shape, etc....

# - Taking your nice looking plot, facet by continent. Using the country_colors object in

# gapminder, color the points by country.

# - Explore some other plots! Don't hesitate to use filter or select to focus on a subset.
#
# - Identify the countries with worst and best life expectancy in Asia for each year. 
# Hint: use min_rank().

# - Identify which country had the sharpest 5-year drop in life expectancy in each continent.

# Hint: use lag() in a mutate() to create a new variable representing the lag.

Reference

  1. http://ggplot2.tidyverse.org/reference/#scales

  2. http://ggplot2.tidyverse.org/reference/ggtheme.html

  3. https://cran.r-project.org/web/packages/ggthemes/vignettes/ggthemes.html

  4. https://archive.ics.uci.edu/ml/datasets.php

  5. https://www.datamentor.io/r-programming/break-next/

  6. https://www.r-project.org/

  7. http://bioconductor.org/

  8. https://neuroconductor.org/

  9. http://www.bioconductor.org/biocLite.R

  10. http://r4ds.had.co.nz/transform.html

  11. http://stat545.com/block015_graph-dos-donts.html

  12. http://www.cookbook-r.com/Graphs/Bar_and_line_graphs_(ggplot2)/

  13. http://statseducation.com/Introduction-to-R/modules/graphics/smoothing/

  14. http://search.r-project.org/library/ggplot2/html/geom_smooth.html

  15. https://blogs.sas.com/content/iml/2011/07/05/jittering-to-prevent-overplotting-in-statistical-graphics.html

  16. https://forcats.tidyverse.org/

  17. http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf