my_vector <- c(1,3,5,8,10,-5,3)
length(my_vector)
## [1] 7
sum(my_vector)
## [1] 25
Set random seed and generate some random normal draws. It generates the same random vector each time you call the random function
set.seed(13579)
my_normal_vector <- rnorm(n=length(my_vector))
my_normal_vector
## [1] -1.2347155 -1.2528339 -0.2547780 -1.5266466 1.0971147 2.4887442 0.7794803
#
## Vector multiplication (elementwise)
my_vector * my_normal_vector
## [1] -1.234715 -3.758502 -1.273890 -12.213173 10.971147 -12.443721 2.338441
## Inner product
## The quantity obtained by multiplying the corresponding
## coordinates of each of two vectors and adding the products.
my_vector %*% my_normal_vector
## [,1]
## [1,] -17.61441
## Create matrix: 5 x 4
## Normal distribution: mean=10, sd=4
# The r is for “random“, and it is a random variable having the specified
# distribution. For example, rnorm() function
my_matrix <- matrix(rnorm(n=5*4, mean=10, sd=4),
nrow=5, ncol=4)
my_matrix
## [,1] [,2] [,3] [,4]
## [1,] 10.753500 6.933320 10.987272 3.201161
## [2,] 5.894216 8.250841 6.553208 13.599404
## [3,] 8.973170 8.113796 12.639644 12.884479
## [4,] 12.984201 7.754463 9.852634 9.346003
## [5,] 11.884883 5.139338 6.189902 6.933884
## Create another matrix: 5 x 4
## rexp distribution
# rexp(m, r)—Returns a vector of m random numbers having the exponential
# distribution
gen_exp <- rexp(n=5*4, rate=1)
gen_exp
## [1] 0.33735737 2.82539938 1.12580580 0.44286020 0.02611241 1.30599715
## [7] 0.19481413 0.09892251 2.12785120 1.52468762 0.58872486 0.43111999
## [13] 4.14845884 0.40606674 2.44914039 0.30681808 1.29253900 0.12032613
## [19] 0.82349666 1.08946855
my_matrix_2 <- matrix(gen_exp, ncol=4, nrow=5)
my_matrix_2
## [,1] [,2] [,3] [,4]
## [1,] 0.33735737 1.30599715 0.5887249 0.3068181
## [2,] 2.82539938 0.19481413 0.4311200 1.2925390
## [3,] 1.12580580 0.09892251 4.1484588 0.1203261
## [4,] 0.44286020 2.12785120 0.4060667 0.8234967
## [5,] 0.02611241 1.52468762 2.4491404 1.0894686
## Double check the product functions (elementwise multiplication)
my_matrix * my_matrix_2
## [,1] [,2] [,3] [,4]
## [1,] 3.6277725 9.0548966 6.468480 0.9821741
## [2,] 16.6535149 1.6073804 2.825219 17.5777600
## [3,] 10.1020464 0.8026371 52.435041 1.5503396
## [4,] 5.7501857 16.5003440 4.000827 7.6964023
## [5,] 0.3103429 7.8358856 15.159939 7.5542487
## Make use of head and tail commands for sanity checks
my_huge_matrix <- matrix(rgamma(10000,
shape=1),
ncol=10)
head(my_huge_matrix)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.1585023 0.2594988 0.3045265 0.08611515 1.3593091 0.46796143 0.01747011
## [2,] 2.3844915 2.1434004 0.2076556 0.63440774 0.4867942 2.49800677 1.33030870
## [3,] 2.0498783 0.5644282 0.6914348 0.09216033 0.1677927 0.06666595 1.04110944
## [4,] 0.6544284 0.7128908 2.6595979 0.62706588 0.5343128 0.61398043 1.22492017
## [5,] 0.2356970 0.0428360 0.9375176 1.54803682 2.1684759 0.96108695 2.83205475
## [6,] 1.6757674 0.1273529 0.3218827 0.05611610 0.4491089 1.35358352 0.53563203
## [,8] [,9] [,10]
## [1,] 1.11846385 0.48473271 2.56600850
## [2,] 1.00036235 1.47877854 0.27404199
## [3,] 1.31866862 0.00207013 0.01210107
## [4,] 0.67984458 0.61003525 0.69062251
## [5,] 0.09203886 3.78459495 1.74063865
## [6,] 2.12307883 0.55484225 0.64885783
tail(my_huge_matrix)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [995,] 2.2065900 2.6204056 0.3617654 0.3113698 0.9299397 1.71883278 0.6832110
## [996,] 0.1513955 0.4863449 0.6484172 1.5576926 1.6252196 0.22696258 0.1349167
## [997,] 1.5559238 0.1786538 0.4939397 3.0058030 0.5783891 1.21986101 0.4762442
## [998,] 0.3488299 0.3241757 0.6761012 0.8329858 0.6561114 0.04526154 0.6484562
## [999,] 0.3982543 1.2403230 0.1033660 2.1795944 0.9295799 0.35622295 0.6209044
## [1000,] 1.5952870 0.2751454 1.0844354 0.1188376 0.2601905 1.78664851 2.0429866
## [,8] [,9] [,10]
## [995,] 0.06476413 0.3178026 0.43963115
## [996,] 2.13606096 0.3472772 0.06109482
## [997,] 1.99384835 0.4734880 1.22129446
## [998,] 0.46446190 0.4860314 0.22367593
## [999,] 0.35793411 1.4893320 1.01059960
## [1000,] 1.20263627 0.3589378 0.83118101
## Make sequence of values
1:10
## [1] 1 2 3 4 5 6 7 8 9 10
0:10
## [1] 0 1 2 3 4 5 6 7 8 9 10
10:0
## [1] 10 9 8 7 6 5 4 3 2 1 0
-5:5
## [1] -5 -4 -3 -2 -1 0 1 2 3 4 5
seq(from=-5, to=50, by=5)
## [1] -5 0 5 10 15 20 25 30 35 40 45 50
seq(from=-5, to=50, length=50)
## [1] -5.0000000 -3.8775510 -2.7551020 -1.6326531 -0.5102041 0.6122449
## [7] 1.7346939 2.8571429 3.9795918 5.1020408 6.2244898 7.3469388
## [13] 8.4693878 9.5918367 10.7142857 11.8367347 12.9591837 14.0816327
## [19] 15.2040816 16.3265306 17.4489796 18.5714286 19.6938776 20.8163265
## [25] 21.9387755 23.0612245 24.1836735 25.3061224 26.4285714 27.5510204
## [31] 28.6734694 29.7959184 30.9183673 32.0408163 33.1632653 34.2857143
## [37] 35.4081633 36.5306122 37.6530612 38.7755102 39.8979592 41.0204082
## [43] 42.1428571 43.2653061 44.3877551 45.5102041 46.6326531 47.7551020
## [49] 48.8775510 50.0000000
## Data frame versus matrix
my_df <- data.frame(my_matrix,
letters=c("A", "b", "c", "d","e"))
## Character vectors
my_name_vector <- c("Anne", "Bob", "Charles")
## Check the class of objects
class(my_name_vector)
## [1] "character"
class(my_normal_vector)
## [1] "numeric"
class(my_huge_matrix)
## [1] "matrix" "array"
class(my_df)
## [1] "data.frame"
#-------------------------------------------------------------
##
## Histogram: hist
## ?hist
## Histogram of first five columns of my_huge_matrix
hist(my_huge_matrix[,c(1:5)] )
hist(my_huge_matrix[,1] ,w=3)
hist(my_huge_matrix[,2] ,w=3)
hist(my_huge_matrix[,3], main="Better histogram",
xlab="Value", breaks=50,
col="cornflowerblue" ,w=3)
#To put all the 6 plots in a 2 x 3 matrix.
par(mfrow=c(2,3))
hist(my_huge_matrix[,1])
hist(my_huge_matrix[,2])
hist(my_huge_matrix[,3], main="Better histogram",
xlab="Value", breaks=50,
col="cornflowerblue")
## Scatterplots
data(iris)
# ?plot
## Plot petal length vs petal width
col_vector <- rep(NA, nrow(iris))
setosa_index <- which(iris$Species == "setosa")
versicolor_index <- which(iris$Species == "versicolor")
virginica_index <- which(iris$Species == "virginica")
col_vector[setosa_index] <- "blue"
col_vector[versicolor_index] <- "red"
col_vector[virginica_index] <- "black"
##The pch arguement in the plot function can be varied to change the marker.
plot(x=iris$Petal.Length,
y=iris$Petal.Width,
col=col_vector, pch=19)
legend("topleft", pch=19,
col=c("blue","red","black"),
legend=c("Setosa", "Versicolor", "Virginica"))
## Plot Petal length vs width, coloring by sepal length
col_vector <- rep(NA, nrow(iris))
high_index <- which(iris$Sepal.Length >
mean(iris$Sepal.Length))
col_vector[high_index] <- "red"
col_vector[-high_index] <- "blue"
shape_vector <- rep(NA, nrow(iris))
shape_vector[setosa_index] <- "S"
shape_vector[versicolor_index] <- "E"
shape_vector[virginica_index] <- "I"
plot(x=iris$Petal.Length,
y=iris$Petal.Width,
col=col_vector, pch=shape_vector)
legend("topleft", pch=19,
col=c("blue","red"),
legend=c("Low sepal length",
"High sepal length"))
plot(x=1:nrow(iris), y=sort(iris$Petal.Width),
type="l")
boxplot(iris$Petal.Length ~ iris$Species,
col=c("red", "blue", "green"))
## Density plots
## Hint: which function to select species
## lines()
plot(density(iris$Petal.Length[which(iris$Species ==
"setosa")]),
xlim=c(0,8))
lines(density(iris$Petal.Length[which(iris$Species ==
"virginica")]))
lines(density(iris$Petal.Length[which(iris$Species ==
"versicolor")]))
#R is case sensitive.
#R index starts from 1
####Help
#Help Home Page, Special Character Help, Search Help, Search Function (with partial name)
#
# help.start()
#
# help('$')
#
# ?"$"
# apropos('med')
example(median)
##
## median> median(1:4) # = 2.5 [even number]
## [1] 2.5
##
## median> median(c(1:3, 100, 1000)) # = 3 [odd, robust]
## [1] 3
# help(package = "dplyr")
#Get help on a function of a package
# help("tally", package = "dplyr")
####Basics
# The arrow operator (<-) sets the left-hand side equal to the right-hand side.
# It computes a1 and assigns it to the variable a2
#a2<- a1
a1 <- 6 + 7
a2 <- a1
# View(a2)
rm(a2)
#Comments starts with #
####R Built-in Data Sets
# data()
# Loading datasets
data(mtcars)
# Print the first 6 rows of a dataset
head(mtcars, 6)
# ?mtcars
# Number of rows (observations)
nrow(mtcars)
## [1] 32
# Number of columns (variables)
ncol(mtcars)
## [1] 11
data(iris)
# You could find a large number of datasets at https://archive.ics.uci.edu/ml/datasets.php
You could find a large number of datasets at https://archive.ics.uci.edu/ml/datasets.php
R packages: collections of functions and data sets developed by the community. Examples: “dplyr” or “data.table”
A number of packages are automatically loaded when you start R (e.g., “base”, “utils”, “graphics”, and “stats”)
First you install packages and then you should load them using command “library()”
install packages() install.packages(“tidyr”)
library(tidyr) browseVignettes()
browseVignettes(package=“packagename”)
vignette(package = “ggplot2”)
detach(packageName)
current working directory where inputs are found, and outputs are sent.
getwd()
setwd(C://file/path)
Set working directory to a new place !!! You need to change this! setwd(‘/Users/kravvaz/Desktop/PH718_spring20/Lecture2’)
### Try the following commands
getwd() ## Get working directory
## [1] "C:/Users/anemati/OneDrive - mcw.edu/UWM/Spring/PH 718 R/R code In class"
dir() ## What else is in that directory?
## [1] "code-in-class.pdf"
## [2] "code-in-class.Rmd"
## [3] "code-in-class_1.pdf"
## [4] "code-in-class_1.Rmd"
## [5] "code-in-class_files"
## [6] "comment"
## [7] "cowplot.R"
## [8] "data-wrangling-cheatsheet.pdf"
## [9] "Done"
## [10] "execute.R"
## [11] "first_ggplot.pdf"
## [12] "first_ggplot.png"
## [13] "Pearson Product-Moment Correlation.pdf"
## [14] "PH718_Lecture5_Scripts_updated.R"
## [15] "PH718_Lecture6_Scripts.R"
## [16] "PH718_Lecture7_Scripts.R"
## [17] "publishers.csv"
## [18] "R code for bio.Rproj"
## [19] "Spearman_s Rank-Order Correlation.pdf"
## [20] "superheroes.csv"
## [21] "Types of Variable.pdf"
objects()
## [1] "a1" "col_vector" "gen_exp" "high_index"
## [5] "iris" "mtcars" "my_df" "my_huge_matrix"
## [9] "my_matrix" "my_matrix_2" "my_name_vector" "my_normal_vector"
## [13] "my_vector" "setosa_index" "shape_vector" "versicolor_index"
## [17] "virginica_index"
ls()
## [1] "a1" "col_vector" "gen_exp" "high_index"
## [5] "iris" "mtcars" "my_df" "my_huge_matrix"
## [9] "my_matrix" "my_matrix_2" "my_name_vector" "my_normal_vector"
## [13] "my_vector" "setosa_index" "shape_vector" "versicolor_index"
## [17] "virginica_index"
rm(list=c("x", "y"))
## Warning in rm(list = c("x", "y")): object 'x' not found
## Warning in rm(list = c("x", "y")): object 'y' not found
ls()
## [1] "a1" "col_vector" "gen_exp" "high_index"
## [5] "iris" "mtcars" "my_df" "my_huge_matrix"
## [9] "my_matrix" "my_matrix_2" "my_name_vector" "my_normal_vector"
## [13] "my_vector" "setosa_index" "shape_vector" "versicolor_index"
## [17] "virginica_index"
a1 <- 6 + 7
a2 <- a1
is.numeric(a2)
## [1] TRUE
nchar(mtcars$hp)
## [1] 3 3 2 3 3 3 3 2 2 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 2 3 3 3 3 3
#Stores just as a date.
date1 <- as.Date('2020-01-23')
#Internally, Date objects are stored as the number of days since January 1, 1970
#To convert a Date object to its internal form, number of days since 1/1/1970.
as.numeric(date1)
## [1] 18284
#Stores a date and time.
date2 <- as.POSIXct("2020-01-23 17:30")
#In numeric form, number of seconds since 1/1/1970.
as.numeric(date2)
## [1] 1579822200
as.numeric(TRUE)
## [1] 1
class(iris$Species)
## [1] "factor"
levels(iris$Species)
## [1] "setosa" "versicolor" "virginica"
str(iris$Species)
## Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
v1 <- c(1, 2, 3, 4, 5, 6, 7, 8)
length(v1)
## [1] 8
unique(v1)
## [1] 1 2 3 4 5 6 7 8
rev(v1)
## [1] 8 7 6 5 4 3 2 1
v1[order(v1)]
## [1] 1 2 3 4 5 6 7 8
v1[v1 > 3]
## [1] 4 5 6 7 8
v1[1:3]
## [1] 1 2 3
v1[c(1,6)]
## [1] 1 6
v1[-(2:4)]
## [1] 1 5 6 7 8
v1[v1 == 5]
## [1] 5
v1[v1 %in% c(1, 2, 5)]
## [1] 1 2 5
v1 <- c(1, 2, 3, 4, 5, 6, 7, 8)
v1 <- 1:8
v1 <- seq(from=1, to=8, by=1)
#row.names(data.frame)
#nrow()
#length()
#subset(data.frame, [condition])
#rbind()
# with numerics from 1 up to 10
my_vector <- 1:10
my_vector
## [1] 1 2 3 4 5 6 7 8 9 10
# with numerics from 1 up to 9
my_matrix <- matrix(1:9, ncol = 3)
my_matrix
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
#First 10 elements of the built-in data frame mtcars
my_df <- mtcars[1:10,]
my_df
#Construct list with these different elements:
my_list <- list(my_vector, my_matrix, my_df)
my_list
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
##
## [[2]]
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
##
## [[3]]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
#How to create a factor?
factor1 <- factor(c("single", "married", "married", "single"))
factor1
## [1] single married married single
## Levels: married single
factor2 <- factor(c("single", "married", "married", "single"),
levels = c("single", "married", "divorced"));
factor2
## [1] single married married single
## Levels: single married divorced
class(factor2)
## [1] "factor"
levels(factor2)
## [1] "single" "married" "divorced"
factor3 <- factor(c("single","married","married","single"))
str(factor3)
## Factor w/ 2 levels "married","single": 2 1 1 2
#How to access components of a factor?
factor3
## [1] single married married single
## Levels: married single
factor3[3] # access 3rd element
## [1] married
## Levels: married single
factor3[c(2, 4)] # access 2nd and 4th element
## [1] married single
## Levels: married single
factor3[-1] # access all but 1st element
## [1] married married single
## Levels: married single
factor3[c(TRUE, FALSE, FALSE, TRUE)] # using logical vector
## [1] single single
## Levels: married single
#How to modify a factor?
factor3
## [1] single married married single
## Levels: married single
factor3[2] <- "divorced" # modify second element;
## Warning in `[<-.factor`(`*tmp*`, 2, value = "divorced"): invalid factor level,
## NA generated
factor3[3] <- "widowed" # cannot assign values outside levels
## Warning in `[<-.factor`(`*tmp*`, 3, value = "widowed"): invalid factor level, NA
## generated
factor3
## [1] single <NA> <NA> single
## Levels: married single
#Add a value to the level first.
levels(factor3) <- c(levels(factor3), "widowed") # add new level
factor3[3] <- "widowed"
factor3
## [1] single <NA> widowed single
## Levels: married single widowed
#NA
#Missing data is encoded using value NA.
is.na(factor3) #tests whether factor3 is an NA
## [1] FALSE TRUE FALSE FALSE
###
set.seed(2020)
BMI<-rnorm(n=1000, m=26, sd=2)
hist(BMI)
hist(BMI, breaks=15, main="Breaks=15")
hist(BMI, breaks=5, main="Breaks=5")
#breaks() option with a vector gives you more control over exactly the breakpoints between bins by dictating the start and end point of each bin.
hist(BMI, breaks=c(18, 21, 24, 27, 30, 33, 36), main="Breaks is vector of breakpoints")
hist(BMI, breaks=seq(18,36,by=3), main="Breaks is vector of breakpoints")
#Density plot: freq=FALSE option
hist(BMI, freq=FALSE, main="Density plot")
#
example.sum <- function(a, b) {
return(a + b)
}
x <- 1:10
y <- 11:20
example.sum(x, y)
## [1] 12 14 16 18 20 22 24 26 28 30
### Naming conventions. Why not write example.sum() as:
example.sum.1 <- function(x, y, z) {
tmp <- x + y
tmp2 <- tmp + z
return(tmp2)
}
x <- 1:10
y <- 11:20
z <- 100
example.sum.1(x, y, z)
## [1] 112 114 116 118 120 122 124 126 128 130
### Try this
example.diff.sum <- function(x, y) {
newy = example.sum(x, y) - 10
return(newy)
}
# example.diff.sum(x, y)
example.diff.sum(10, 5)
## [1] 5
example.sum = function(a, b) {
return(a + b)
}
example.diff.sum = function(c, d) {
e = example.sum(c, d) - 10
return(e)
}
x <- 1:10
y <- 11:20
# example.diff.sum(x, y)
example.diff.sum(10, 5)
## [1] 5
HWE <- function(p) {
stopifnot(is.numeric(p))
stopifnot(p >= 0)
stopifnot(p <= 1)
prob_AA <- p ^ 2
prob_AB <- 2 * p * (1 - p)
prob_BB <- (1 - p) ^ 2
return(c(prob_AA, prob_AB, prob_BB))
}
HWE(0)
## [1] 0 0 1
HWE(0.5)
## [1] 0.25 0.50 0.25
# HWE(-5) # Error
# HWE("kourosh") # Error
C_to_F <- function(temp) {
return((temp * 1.8) + 32)
}
F_to_C <- function(temp) {
return((temp - 32) / 1.8)
}
par(mfrow = c(1, 2))
plot(1:100, C_to_F(1:100))
plot(-40:212, F_to_C(-40:212))
temp_conversion <- function(temp, type_of_conversion = "F_to_C") {
if (!type_of_conversion %in% c("F_to_C", "C_to_F")) {
stop("STOP!!! I can only convert C to F or F to C.")
}
if (type_of_conversion == "F_to_C") {
new_temp <- F_to_C(temp)
}
if (type_of_conversion == "C_to_F") {
new_temp <- C_to_F(temp)
}
return(list(temp = new_temp, conversion = type_of_conversion))
}
temp_conversion(100, type = "C_to_F")
## $temp
## [1] 212
##
## $conversion
## [1] "C_to_F"
temp_conversion(0, type = "F_to_C")
## $temp
## [1] -17.77778
##
## $conversion
## [1] "F_to_C"
x <- rnorm(100)
x[1:10]
## [1] 0.00465041 -1.22874970 -0.14059798 -0.20732697 -0.92153058 0.36047424
## [7] 1.66660243 1.44804634 -0.03285159 -1.62843554
x[20:30]
## [1] 0.18713551 -0.57543743 -0.08766336 0.31167076 0.24438772 -0.45803324
## [7] 1.32593062 -0.57952274 -0.82407695 1.08562056 1.41590793
index <- which(x < 0)
index
## [1] 2 3 4 5 9 10 11 13 14 15 17 18 19 21 22 25 27 28 33
## [20] 35 38 41 44 45 46 49 52 57 58 60 63 64 65 66 68 72 73 74
## [39] 75 78 80 81 83 84 85 89 90 93 94 96 97 100
x[index]
## [1] -1.22874970 -0.14059798 -0.20732697 -0.92153058 -0.03285159 -1.62843554
## [7] -0.56266928 -0.85173525 -0.91512892 -0.44484557 -1.44896422 -0.59389701
## [13] -0.75144575 -0.57543743 -0.08766336 -0.45803324 -0.57952274 -0.82407695
## [19] -2.07172304 -0.49889987 -1.51680685 -0.62976566 -0.36027572 -0.39350268
## [25] -1.04180068 -0.63945719 -0.85649561 -0.38280458 -1.89724701 -0.69186589
## [31] -0.43251556 -0.95196059 -0.21625030 -1.04279172 -0.10986728 -1.84878973
## [37] -0.98415680 -0.21854832 -0.80092215 -0.50991847 -0.09837813 -0.83113845
## [43] -0.19782135 -0.54494659 -0.32692897 -0.25978735 -0.11579965 -0.93762421
## [49] -0.27231887 -0.47014392 -0.06493665 -0.96108831
An introductory example
This “initializes” the object “w” so that R knows what w “stands for.”
w <- NULL
w <- NULL
for (i in 1:10) {
w[i] <- i + 10
}
w
## [1] 11 12 13 14 15 16 17 18 19 20
# same code
for(i in 1:10){
w[i] <- i+10
print(w[i])
}
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
#w <- NULL
for (hello in 1:10) {
w[hello] <- hello
}
w
## [1] 1 2 3 4 5 6 7 8 9 10
Now what does w look like?
w <- NULL
counter <- 1
for (j in 11:20) {
for (i in 1:10) {
w[counter] <- i + j
counter <- counter + 1
}
}
w
## [1] 12 13 14 15 16 17 18 19 20 21 13 14 15 16 17 18 19 20 21 22 14 15 16 17 18
## [26] 19 20 21 22 23 15 16 17 18 19 20 21 22 23 24 16 17 18 19 20 21 22 23 24 25
## [51] 17 18 19 20 21 22 23 24 25 26 18 19 20 21 22 23 24 25 26 27 19 20 21 22 23
## [76] 24 25 26 27 28 20 21 22 23 24 25 26 27 28 29 21 22 23 24 25 26 27 28 29 30
Think through these two loops carefully.
w <- NULL
counter <- 1
for(j in 11:20){
for(i in 1:10){
w[counter] <- i + j
}
counter <- counter + 1
cat(counter , " , ")
}
## 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 ,
It is usually a better idea to pre-allocate a variable of the appropriate size
R has to reallocate memory at each iteration
w <- rep(NA, length(1:10))
w
## [1] NA NA NA NA NA NA NA NA NA NA
for(i in 1:10){
w[i] <- i+10
}
w
## [1] 11 12 13 14 15 16 17 18 19 20
system.time({
w <- c()
for(i in 1:10^6)
w[i] <- i+10
}
)
## user system elapsed
## 0.37 0.03 0.70
system.time({
w <- rep(NA, 10^6)
for(i in 1:10^6)
w[i] <- i+10
}
)
## user system elapsed
## 0.08 0.00 0.08
In class example: Construct a 10 x 10 multiplication table
my_mat <- matrix(NA, nrow=10, ncol=10)
for(rows in 1:10) {
for(cols in 1:10) {
my_mat[rows, cols] <- rows * cols
}
}
my_mat
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 2 4 6 8 10 12 14 16 18 20
## [3,] 3 6 9 12 15 18 21 24 27 30
## [4,] 4 8 12 16 20 24 28 32 36 40
## [5,] 5 10 15 20 25 30 35 40 45 50
## [6,] 6 12 18 24 30 36 42 48 54 60
## [7,] 7 14 21 28 35 42 49 56 63 70
## [8,] 8 16 24 32 40 48 56 64 72 80
## [9,] 9 18 27 36 45 54 63 72 81 90
## [10,] 10 20 30 40 50 60 70 80 90 100
w <- 100
z <- 5
while (w > 20) {
w.plus.z <- w + z
w <- w - 1
}
w
## [1] 20
w <- 100
z <- 5
while (w < 20) {
# if w > 20 => inf loop
w.plus.z <- w + z
w <- w + 1
}
w
## [1] 100
The following loop will run forever. DON’T actually run it!!!!!
w <- 100 z <- 5 while(w > 20){ w.plus.z <- w + z w <- w + 1 }
m=20
for ( k in 1:m){
if(!k %% 2)
next
print(k)
}
## [1] 1
## [1] 3
## [1] 5
## [1] 7
## [1] 9
## [1] 11
## [1] 13
## [1] 15
## [1] 17
## [1] 19
set.seed(12345) ## Set seed for random number generator
m <- rnorm(10)
for(i in 1:length(m)) {
if(m[i] < 0)
break
print(m[i])
}
## [1] 0.5855288
## [1] 0.709466
my_mat <- matrix(NA , nrow =10 , ncol=10)
for(rows in 1:10){
for(cols in 1:10){
if(rows < cols) next
my_mat[rows , cols] <- rows * cols
}
}
my_mat
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 NA NA NA NA NA NA NA NA NA
## [2,] 2 4 NA NA NA NA NA NA NA NA
## [3,] 3 6 9 NA NA NA NA NA NA NA
## [4,] 4 8 12 16 NA NA NA NA NA NA
## [5,] 5 10 15 20 25 NA NA NA NA NA
## [6,] 6 12 18 24 30 36 NA NA NA NA
## [7,] 7 14 21 28 35 42 49 NA NA NA
## [8,] 8 16 24 32 40 48 56 64 NA NA
## [9,] 9 18 27 36 45 54 63 72 81 NA
## [10,] 10 20 30 40 50 60 70 80 90 100
my_mat[1,3]
## [1] NA
### There is also the replicate function for repeated
### evaluation of the same thing:
m <- 10
n <- 5
mymat <- replicate(m, rnorm(n))
## This is equivalent!
mymat <- matrix(NA, nrow=5, ncol=10)
for(i in 1:ncol(mymat)) {
mymat[,i] <- rnorm(5)
}
mymat
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.1495920 -1.8323773 0.05461558 0.9426009 0.6453831 1.8670992
## [2,] -1.3425315 0.8881394 -0.78464937 0.8262583 1.0431436 0.6720425
## [3,] 0.5533031 1.5934885 -1.04935282 -0.8115405 -0.3043691 -0.3079534
## [4,] 1.5899628 0.5168547 2.33051196 0.4762483 2.4771109 0.5365237
## [5,] -0.5868796 -1.2956717 1.40270538 1.0212584 0.9712207 0.8248701
## [,7] [,8] [,9] [,10]
## [1,] -0.9639015 0.6873321 0.2239254 -0.5360480
## [2,] -0.8550825 -0.5050435 -1.1562233 -0.3116061
## [3,] 1.8869469 2.1577198 0.4224185 1.5561096
## [4,] -0.3918194 -0.5997976 -1.3247553 -0.4480333
## [5,] -0.9806329 -0.6945467 0.1410843 0.3211235
## This is also equivalent!
mymat <- matrix(NA, nrow=5, ncol=10)
for(cols in 1:ncol(mymat)) {
for(rows in 1:nrow(mymat)) {
mymat[rows,cols] <- rnorm(1)
}
}
mymat
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.23017225 -0.50508981 1.5448636 -1.158821 1.6421920 -1.0479137
## [2,] -1.32405869 -0.05215359 1.3214520 -1.845368 0.8836548 -1.0111225
## [3,] 1.26124227 0.62886063 0.3221516 1.157325 0.5248759 0.6689217
## [4,] 1.31923172 2.18000240 1.5309551 -2.123550 -1.1846591 0.1291773
## [5,] -0.08075376 -0.06901731 -0.4212397 -1.196032 2.6557883 -0.4225769
## [,7] [,8] [,9] [,10]
## [1,] -1.14026414 0.5401696 -1.6193283 -0.25094662
## [2,] -1.29371529 -1.5472920 0.5483979 1.69934667
## [3,] -0.59469877 0.8496529 0.1952822 -0.34429880
## [4,] -1.50081408 0.8960132 -0.8064980 0.06777206
## [5,] 0.01585569 0.1386910 -0.1086242 -0.65056973
my_mat <- matrix(1:20, nrow=4, ncol=5)
for(rows in 1:4) {
my_mat[rows, ] <- rnorm(5)
}
my_mat
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.4876385 0.3031512 -0.24197402 -0.4817336 -0.99180286
## [2,] -0.2806491 0.6330173 -1.23981834 1.7643141 -0.02367989
## [3,] 0.1999205 1.3471928 0.03607349 0.8245811 -1.70267185
## [4,] 0.4809502 2.4835501 0.40136499 0.2151772 -1.81571235
my_mat <- matrix(1:20, nrow=4, ncol=5)
for(rows in 1:4) {
my_mat[rows,] <- rnorm(5)
}
my_mat
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.91173942 -0.04904469 -0.4053875 1.1303818 0.81546474
## [2,] 0.07641752 1.45374735 0.3741211 -0.1709041 -0.50221281
## [3,] 0.54352211 -0.50518600 0.7867958 0.3009494 1.31022391
## [4,] 0.79843377 0.85086044 -0.4435680 -0.4467748 0.01330504
for(index in 1:20) {
my_mat[index] <- my_mat[index] * 10
}
my_mat
## [,1] [,2] [,3] [,4] [,5]
## [1,] -9.1173942 -0.4904469 -4.053875 11.303818 8.1546474
## [2,] 0.7641752 14.5374735 3.741211 -1.709041 -5.0221281
## [3,] 5.4352211 -5.0518600 7.867958 3.009494 13.1022391
## [4,] 7.9843377 8.5086044 -4.435680 -4.467748 0.1330504
## Loop form
natural_numbers <- 1:50
sum_value <- 0
for(i in 1:length(natural_numbers)) {
## Is natural numbers divisible by 3?
## Is natural numbers divisible by 5?
if(!natural_numbers[i] %% 3 | !natural_numbers[i] %% 5) {
# print(natural_numbers[i])
sum_value <- sum_value + natural_numbers[i]
}
}
## Vectorized form
new_vector <- sum(ifelse(!natural_numbers %% 3 | !natural_numbers %% 5,
natural_numbers, 0))
new_indices <- ifelse(!natural_numbers %% 3 | !natural_numbers %% 5,
TRUE, FALSE)
natural_numbers[which(new_indices == TRUE)]
## [1] 3 5 6 9 10 12 15 18 20 21 24 25 27 30 33 35 36 39 40 42 45 48 50
keep <- c()
counter <- 1
while(counter < 1000) {
if(counter %% 3 & counter %% 5) {
counter <- counter + 1
} else {
keep <- c(keep, counter)
counter <- counter + 1
}
}
sum(keep)
## [1] 233168
# ## Consider this:
v1 <- runif(10)
v1
## [1] 0.07548045 0.47438424 0.26458955 0.23074607 0.59619939 0.15892558
## [7] 0.85505484 0.23745380 0.79711170 0.07848559
v2 <- runif(10)
v2
## [1] 0.54189613 0.71694462 0.04077934 0.02560045 0.74061643 0.77014907
## [7] 0.53867819 0.38813644 0.46941770 0.12248648
v3 <- c()
for (i in 1:10) {
v3[i] <-v1[i] + v2[i]
}
v3
## [1] 0.6173766 1.1913289 0.3053689 0.2563465 1.3368158 0.9290746 1.3937330
## [8] 0.6255902 1.2665294 0.2009721
## versus this:
v3 = v1 + v2
v3
## [1] 0.6173766 1.1913289 0.3053689 0.2563465 1.3368158 0.9290746 1.3937330
## [8] 0.6255902 1.2665294 0.2009721
# ## Consider this
A <- matrix(1:20, nrow=5, ncol=4)
B <- matrix(21:40, nrow=5, ncol=4)
matsum <- matrix(0, nrow=5, ncol=4)
for(i in 1:5) {
for(j in 1:4) {
matsum[i,j] <- A[i,j] + B[i,j]
}
}
matsum
## [,1] [,2] [,3] [,4]
## [1,] 22 32 42 52
## [2,] 24 34 44 54
## [3,] 26 36 46 56
## [4,] 28 38 48 58
## [5,] 30 40 50 60
## versus this:
matsum <- A + B
matsum
## [,1] [,2] [,3] [,4]
## [1,] 22 32 42 52
## [2,] 24 34 44 54
## [3,] 26 36 46 56
## [4,] 28 38 48 58
## [5,] 30 40 50 60
Let’s consider an example where we had some artificial “signal” to random noise.
# This is a bad loop with 'growing' data
set.seed(42)
m <- 10
n <- 10
# Create matrix of normal random numbers
mymat <- replicate(m, rnorm(n))
# Transform into data frame
mydframe <- data.frame(mymat)
for (i in 1:m) {
for (j in 1:n) {
mydframe[i,j]<-mydframe[i,j] + 10*sin(0.75*pi)
}
}
mydframe
#
# ## Here is the vectorized form:
mydframe_vec <- data.frame(mymat)
mydframe_vec <- mydframe_vec + 10*sin(0.75*pi)
mydframe_vec
## What happens when you add a matrix and a vector?
m <- matrix(1:20, nrow=4, ncol=5)
v <- c(1,10,100,1000,10000)
m + v
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2 10005 1009 113 27
## [2,] 12 7 10010 1014 118
## [3,] 103 17 12 10015 1019
## [4,] 1004 108 22 17 10020
# This is a bad loop with 'growing' data
time_check <- function(m,n) {
set.seed(42)
# Create matrix of normal random numbers
mymat <- replicate(m, rnorm(n))
# Transform into data frame
mydframe <- data.frame(mymat)
time_1 <- system.time({
for (i in 1:m) {
for (j in 1:n) {
mydframe[i,j]<-mydframe[i,j] + 10*sin(0.75*pi)
}
}
})
time_2 <- system.time({
mydframe_vec <- mydframe + 10*sin(0.75*pi)
})
return(list(loop = time_1, vectorized=time_2))
}
time_check(10,10)
## $loop
## user system elapsed
## 0.02 0.00 0.01
##
## $vectorized
## user system elapsed
## 0 0 0
time_check(100,100)
## $loop
## user system elapsed
## 0.31 0.00 0.33
##
## $vectorized
## user system elapsed
## 0.02 0.00 0.02
time_check(1000,1000)
## $loop
## user system elapsed
## 53.79 0.06 55.98
##
## $vectorized
## user system elapsed
## 0.07 0.00 0.06
a = TRUE
b = FALSE
a & b
## [1] FALSE
a | b
## [1] TRUE
!(a & b)
## [1] TRUE
!a | !b
## [1] TRUE
x <- 10
x < 20
## [1] TRUE
x <- 1:10
x < 5
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
x == 6
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
x <- 5
ifelse(x==4, 1, 0)
## [1] 0
x <- 1:10
ifelse(x<=4, 1, 0)
## [1] 1 1 1 1 0 0 0 0 0 0
y <- NULL
x <- seq(0.1, 1, by=0.1)
for(i in 1:10){
if(x[i] < 0.5){
y[i] <- x[i]^2
}else{
y[i] <- sqrt(x[i])}
}
y
## [1] 0.0100000 0.0400000 0.0900000 0.1600000 0.7071068 0.7745967 0.8366600
## [8] 0.8944272 0.9486833 1.0000000
# What does this evaluate to, and why?
(TRUE + TRUE) * FALSE
## [1] 0
(TRUE + TRUE) * TRUE
## [1] 2
Package = fundamental unit of shareable code
Bundles together code, data, documentation, tests, and is easy to share with others
3 main repositories for packages in R:
Bioconductor: http://bioconductor.org/ (packages oriented towards analysis of biological data)
GitHub: Install the package of interest from GitHub, for example install_github(“Displayr/flipPlots”)
There are > 6000 packages at CRAN alone! And growing rapidly!!
=> Chances are, someone has probably already solved a problem you’re working on
Make sure to give credit to package developers if you use their work! (citations etc)
Let’s practice installing a couple of packages:
Tidyverse and devtools package from CRAN
install.packages(c(“tidyverse”, “devtools”)) ## You only need to do this once per package
library(tidyverse)
library(devtools) ?tidyverse
edgeR RNA-seq expression analysis package from Bioconductor
This is just an example, we won’t actually be using edgeR today
source(“http://www.bioconductor.org/biocLite.R”) ## You only need to do this once per package
biocLite(“edgeR”) ## You only need to do this once per package
library(edgeR)
vignette(“edgeR”)
Install a GitHub package (using the devtools package above)
This is just an example, we won’t actually be using AnomalyDetection today
install_github(“twitter/AnomalyDetection”) ## You only need to do this once per package
library(AnomalyDetection)
Core structures for analysis: data.frames and tibbles First of all: What is the tidyverse? https://www.tidyverse.org/ - A “meta-package” that is an optionated collection of several R packages for data science - Includes ggplot2 (plotting), dplyr (transformating/summarizing dataframe content), tidyr (tidying dataframes), purr, tibble
Pipe operator: f(x) can be rewritten as x %>% f
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(devtools)
## Loading required package: usethis
data(iris)
head(iris)
iris %>% head()
mean.sepal <- mean(iris$Sepal.Length)
mean.sepal <- iris$Sepal.Length %>% mean()
x <- c(0.109, 0.359, 0.63, 0.996, 0.515, 0.142, 0.017, 0.829, 0.907)
round(exp(diff(log(x))), 1)
## [1] 3.3 1.8 1.6 0.5 0.3 0.1 48.8 1.1
x %>% log() %>%
diff() %>%
exp() %>%
round(1)
## [1] 3.3 1.8 1.6 0.5 0.3 0.1 48.8 1.1
data(starwars)
#
## Look at data: what do rows represent? how about columns?
starwars
#
# ## filter(): filter data frame on row
filter(starwars, species=="Droid")
starwars[which(starwars$species=="Droid"),] ## base R equivalent
starwars %>% filter(species=="Droid") ## equivalent with a pipe
starwars %>% filter(., species=="Droid") ## equivalent with a pipe
starwars %>% filter(species=="Droid" & eye_color=="red")
new_df <- starwars %>%
filter(species=="Droid", eye_color=="red") ## 'and' conditions can be separated with comma
new_df
# ## select(): filter data frame on column
starwars %>% select(mass, homeworld)
starwars %>% select(-hair_color) ## We can remove rows using a '-'
starwars %>%
filter(species=="Droid") %>%
select(name, mass) ## Filter and select can be combined using a pipe %>%
# ## mutate(): add new column to data frame
# ## Note: we generally avoid overwriting data in a data frame, better to create a new variable
starwars %>%
mutate(height_in_inches=0.393701 * height)
# %>% View()
# ## group_by(): establish a grouping for downstream operations, remove with ungroup()
# ## tally(): count number of observations per grouping
starwars %>%
group_by(species) %>%
tally()
# ## summarize(): perform summary statistic on a column. Fun fact: this also exists as summarise()
starwars %>% summarize(mean_mass = mean(mass, na.rm=TRUE))
starwars %>%
group_by(species) %>%
summarize(mean_mass = mean(mass, na.rm=TRUE))
# ## arrange(): arrange a column
starwars %>% arrange(desc(mass)) # %>% View()
starwars %>% arrange(mass) # %>% View()
# ## unique, na.omit()
starwars %>% unique()
starwars %>% na.omit()
## count() vs tally()
# In brief, tally() is short-hand for summarise()
# and
# count() is a short-hand for group_by() + tally()
# Here are some examples:
mtcars %>% tally()
mtcars %>% group_by(cyl) %>% tally()
mtcars %>% count(cyl)
-If there’s time: a quick start on ggplot2
ggplot2 grammar: - data = dataframe being plotted - geometrics = the geometric shape that represents the data (point, boxplot, histogram, violin, bar, etc) - aesthetics = aesthetics of the geometric object (color, size, shape, etc)
library(ggplot2)
ggplot(starwars, aes(x=mass, y=height)) +
geom_point()
## Warning: Removed 28 rows containing missing values (geom_point).
# ## NOTE: did you see that we are using '+' here for ggplot2 and not the pipe '%>%'?
plot(starwars$mass, starwars$height) ## base R version
## These are all the same
ggplot() +
geom_point(data=starwars, color="red",
aes(x=mass, y=height))
## Warning: Removed 28 rows containing missing values (geom_point).
ggplot() +
geom_point(data=starwars, color="red",
aes(x=mass, y=height))
## Warning: Removed 28 rows containing missing values (geom_point).
ggplot(data=starwars) +
geom_point(color="red", aes(x=mass, y=height))
## Warning: Removed 28 rows containing missing values (geom_point).
ggplot(starwars, aes(x=mass, y=height)) +
geom_point(color="red")
## Warning: Removed 28 rows containing missing values (geom_point).
ggplot(starwars, aes(x=mass, y=height, color=gender)) +
geom_point()
## Warning: Removed 28 rows containing missing values (geom_point).
new_df <- starwars %>% filter(mass < 500)
ggplot(new_df,
aes(x=mass, y=height, color=gender)) +
geom_point()
ggplot(starwars, aes(x=mass, y=height, color=gender)) +
geom_point()
## Warning: Removed 28 rows containing missing values (geom_point).
new_df <- starwars %>% filter(mass < 500)
new_df
ggplot(new_df,
aes(x=mass, y=height, color=gender)) +
geom_point()
## We can change shape aesthetics too
ggplot(starwars, aes(x=mass, y=height, color=gender, shape=gender)) +
geom_point()
## Warning: Removed 29 rows containing missing values (geom_point).
## Aesthetics can be placed inside the relevant geom: the following two are equivalent
library(ggplot2)
ggplot(starwars, aes(x=mass, y=height, color=gender, shape=gender)) +
geom_point()
## Warning: Removed 29 rows containing missing values (geom_point).
ggplot(starwars) +
geom_point(aes(x=mass, y=height, color=gender, shape=gender))
## Warning: Removed 29 rows containing missing values (geom_point).
## Aesthetics are only for mapping:
ggplot(starwars) + geom_point(aes(x=mass, y=height, color=“blue”)) ## This doesn’t work
ggplot(starwars) + geom_point(aes(x=mass, y=height), color=“blue”) ## This does work
ggplot(filter(starwars, mass < 500), aes(x=mass, y=height)) +
geom_point() +
geom_line()
## Other geoms: histograms, boxplots, density plots, violin plots, barplots
ggplot(starwars) +
geom_histogram(aes(x=height), fill="orange", color="brown")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
ggplot(starwars) +
geom_boxplot(aes(x="", y=log(mass)))
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
ggplot(starwars) +
geom_boxplot(aes(x=gender, y=log(mass), fill=gender))
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
ggplot(starwars) +
geom_violin(aes(x=gender, y=log(mass), fill=gender))
## Warning: Removed 28 rows containing non-finite values (stat_ydensity).
## Warning: Groups with fewer than two data points have been dropped.
ggplot(starwars) +
geom_bar(aes(x=gender, fill=gender))
#
ggplot(starwars) +
geom_density(aes(x=log(mass), fill=gender))
## Warning: Removed 28 rows containing non-finite values (stat_density).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
ggplot(starwars) +
geom_density(aes(x=log(mass), fill=gender), alpha=0.9)
## Warning: Removed 28 rows containing non-finite values (stat_density).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Can add transparency to object
names(starwars)
## [1] "name" "height" "mass" "hair_color" "skin_color"
## [6] "eye_color" "birth_year" "sex" "gender" "homeworld"
## [11] "species" "films" "vehicles" "starships"
starwars$gender
## [1] "masculine" "masculine" "masculine" "masculine" "feminine" "masculine"
## [7] "feminine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [13] "masculine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [19] "masculine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [25] "masculine" "masculine" "feminine" "masculine" "masculine" "masculine"
## [31] "masculine" "masculine" "masculine" "masculine" "masculine" "masculine"
## [37] NA "masculine" "masculine" NA "feminine" "masculine"
## [43] "masculine" "feminine" "masculine" "masculine" "masculine" "masculine"
## [49] "masculine" "masculine" "masculine" "feminine" "masculine" "masculine"
## [55] "masculine" "masculine" "masculine" "feminine" "masculine" "masculine"
## [61] "feminine" "feminine" "feminine" "masculine" "masculine" "masculine"
## [67] "feminine" "masculine" "masculine" "feminine" "feminine" "masculine"
## [73] "feminine" "masculine" "masculine" "feminine" "masculine" "masculine"
## [79] "masculine" NA "masculine" "masculine" "feminine" "masculine"
## [85] "masculine" NA "feminine"
ggplot(starwars) +
geom_histogram(aes(x=height), fill="orange", color="brown") +
xlab("Height") +
ylab("Count") +
ggtitle("Histogram of Star Wars character heights")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
g1 <- ggplot(starwars) +
geom_histogram(aes(x=height), fill="orange", color="brown")
ggplot(starwars) +
geom_histogram(aes(x=height), fill="orange", color="brown") +
xlab("Height") +
ylab("Count") +
ggtitle("Histogram of Star Wars character heights") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
=> filter, select, mutate, group_by, tally, summarize, arrange, unique, na.omit
library(dplyr)
data(iris)
# iris %>% select(Sepal.Length)%>% mean()
mean(iris$Sepal.Length)
## [1] 5.843333
iris %>% summarize(mean.SL = mean(Sepal.Length))
## Mean Sepal.Length per species
iris %>%
group_by(Species) %>%
summarize(mean(Sepal.Length),
median.SL = median(Sepal.Length))
iris %>% arrange(Sepal.Length, Sepal.Width)
iris %>% arrange(desc(Sepal.Length))
# arrange() orders the rows of a data frame by the values of selected columns.
=> data + geom + aesthetics
geom_point, geom_histogram, geom_line, geom_boxplot, geom_violin
?filter
dplyr::filter stats::filter my_filter <- dplyr::filter filter <- dplyr::filter
Both packages dplyr and MASS – function select
ggplot(data = ) +
library(tidyverse)
data(diamonds)
## What is plotted here? Where does count come from?
## Histograms, barplots, frequency polygons, smoothers, boxplots, ...
ggplot(diamonds) +
geom_bar(aes(x = cut))
ggplot(diamonds) +
stat_count(aes(x = cut))
With bar graphs, there are two different things that the heights of bars commonly represent:
The count of cases for each group - typically, each x value represents one group. This is done with stat_bin, which calculates the number of cases in each group (if x is discrete, then each x value is a group; if x is continuous, then all the data is automatically in one group, unless you specify grouping with group=xx).
The value of a column in the data set. This is done with stat_identity, which leaves the y values unchanged.
For line graphs, the data points must be grouped so that it knows which points to connect.
If group = 1, then all points would be considered connected.
When more variables are used and multiple lines are drawn, the grouping for lines is usually done by variable.
We can override default stat for a given geom
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1))
What happens if we don’t set group = 1 above? What is wrong with the following two graphs?
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, y = ..prop..))
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..))
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, y = ..prop..))
We can even explicitly use stat_## instead of the geom By default, each stat is associated with a geom and vice versa
ggplot(data = diamonds) +
stat_count(mapping = aes(x = cut))
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))
ggplot(data = diamonds) +
stat_summary(
mapping = aes(x = cut, y = depth),
fun.ymin = min,
fun.ymax = max,
fun.y = median
)
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.
Stacked versus side-by-side barplots
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity))
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity), position="fill")
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity), position="dodge")
geom_smooth: What variables does stat_smooth compute? What parameters control its behavior?
http://statseducation.com/Introduction-to-R/modules/graphics/smoothing/
It can be hard to view trends with just points alone. Many times we wish to add a smoothing line in order to see what the trends look like.
This can be especially helpful when trying to understand regressions.
http://search.r-project.org/library/ggplot2/html/geom_smooth.html Smoothing method (function) to use, accepts either NULL or a character vector,
e.g. “lm”, “glm”, “gam”, “loess” or a function,
e.g. MASS::rlm or mgcv::gam, stats::lm, or stats::loess. “auto” is also accepted for backwards compatibility. It is equivalent to NULL.
For method = NULL the smoothing method is chosen based on the size of the largest group (across all panels). stats::loess() is used for less than 1,000 observations;
otherwise mgcv::gam() is used with formula = y ~ s(x, bs = “cs”) with method = “REML”.
Somewhat anecdotally, loess gives a better appearance, but is O(N^2) in memory, so does not work for larger datasets.
If you have fewer than 1,000 observations but want to use the same gam() model
that method = NULL would use, then set method = “gam”, formula = y ~ s(x, bs = “cs”).
Residual maximum likelihood (REML)
Local Polynomial Regression Fitting (LOESS)
mgcv::gam(): Generalized additive models with integrated smoothness estimation
MASS::rlm(): Robust Fitting of Linear Models
ggplot(diamonds, aes(x=carat, y=price, color=cut)) +
geom_point() +
geom_smooth(aes(fill=cut), alpha=0.5)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(diamonds, aes(x=carat, y=price, color=cut)) +
geom_point() +
geom_smooth(aes(fill=cut), alpha=0.5, method="lm")
## `geom_smooth()` using formula 'y ~ x'
geom_smooth
What to do about overplotting? Jittering?
Jittering is the act of adding random noise to data in order to prevent overplotting in statistical graphs.
Overplotting can occur when a continuous measurement is rounded to some convenient unit. This has the effect of changing a continuous variable into a discrete ordinal variable.
For example, age is measured in years and body weight is measured in pounds or kilograms. If you construct a scatter plot of weight versus age for a sufficiently large sample of people, there might be many people recorded as, say, 29 years and 70 kg, and therefore many markers plotted at the point (29, 70).
To alleviate overplotting, you can add small random noise to the data. The size of the noise is often chosen to be the width of the measurement unit. For example, to the value 70 kg you might add the quantity u, where u is a uniform random variable in the interval [-0.5, 0.5].
You can justify jittering by assuming that the true weight of a 70 kg person is equally likely to be anywhere in
the interval [69.5, 70.5].
Look at the parameters to control the amount of jittering When your dataset is large, the dots of your scatterplot will tend to overlap, making the graphic unreadable.
The jitter geom is a convenient shortcut for geom_point(position = “jitter”). It adds a small amount of random variation to the location of each point, and is a useful way of handling overplotting caused by discreteness in smaller datasets.
data(mpg)
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")
ggplot(data = mpg) +
geom_jitter(mapping = aes(x = displ, y = hwy))
ggplot(data = mpg) +
geom_count(mapping = aes(x = displ, y = hwy))
alpha aesthetic will make the points more transparent
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy), alpha=0.25)
Facets: smaller plots that display a subset of the data Useful for exploring conditional relationships and large data
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(~trans)
## What is the difference between facet_wrap and facet_grid?
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~trans)
## What happens if you facet on a continuous variable?
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~cty)
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
coord_flip()
g <- ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
coord_flip()
ggsave(g, filename="first_ggplot.pdf")
## Saving 7 x 5 in image
ggsave(filename="first_ggplot.png") ## Save last plot
## Saving 7 x 5 in image
#install.packages("cowplot") ## Only do this once!
library(cowplot)
plot_grid(g, g, labels=c("A","B"))
R uses factors to handle categorical variables, variables that have a fixed and known set of possible values.
Factors are also helpful for reordering character vectors to improve display.
Working with factors and group order in ggplot: forcats
install.packages(“forcats”) ## not explicitly part of the tidyverse
The goal of the forcats package is to provide a suite of tools that
solve common problems with factors, including changing the order of
levels or the values. Some examples include:
https://forcats.tidyverse.org/
library(forcats)
colors <- c("red","blue","green","green","red","blue","red","green","blue")
class(colors)
## [1] "character"
colors <- factor(colors)
print(colors)
## [1] red blue green green red blue red green blue
## Levels: blue green red
class(colors)
## [1] "factor"
# Each color is a level, and underlying each level is an integer associated with that
# level; red = 3, green = 2, blue = 1. The order of the levels is assigned
# alphabetically to the integer: b = 1, g = 2, r = 3.
colors <- factor(colors, levels = c("red","blue", "green"))
print(colors)
## [1] red blue green green red blue red green blue
## Levels: red blue green
data(airquality)
airquality$Month <- factor(airquality$Month)
levels(airquality$Month)
## [1] "5" "6" "7" "8" "9"
ggplot(airquality, aes(Month, Temp)) +
geom_boxplot(aes(fill = Month)) +
ggtitle(label = "Daily Temperatures Aggregated by Month")
airquality$Month <- fct_recode(airquality$Month,
May='5', June='6', July='7', Aug='8', Sept='9')
ggplot(airquality, aes(Month, Temp)) +
geom_boxplot(aes(fill = Month)) +
ggtitle(label = "Daily Temperatures Aggregated by Month")
ggplot(airquality, aes(fct_rev(Month), Temp)) +
geom_boxplot(aes(fill = Month)) +
labs(x = "Month") +
ggtitle(label = "Our plot now has the x-axis in reverse order")
airquality$Month <- fct_relevel(airquality$Month, 'Sept', 'July', 'May', 'Aug', 'June')
levels(airquality$Month)
## [1] "Sept" "July" "May" "Aug" "June"
ggplot(airquality, aes(Month, Temp)) +
geom_boxplot(aes(fill = Month)) +
ggtitle(label = "Notice how the order of the level 'Month' has changed")
## order factors according to another variable
data(mtcars)
mtcars$model <- factor(row.names(mtcars))
class(mtcars$model)
## [1] "factor"
ggplot(mtcars, aes(mpg, model)) +
geom_point() +
ggtitle(label = "MPG vs. Car Model")
ggplot(mtcars, aes(mpg, fct_reorder(model, mpg))) +
geom_point() +
labs(y = "model") +
ggtitle(label = "Better comparison by reordering levels based on mpg values!") +
theme(plot.title = element_text(size = 10, face = 'bold'))
## ggplot2: Color schemes
colors()
http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
Color choices in R: great palette selections from RColorBrewer
Note: works in ggplot as well as base R
install.packages(“RColorBrewer”) ## Only have to do this once
library(RColorBrewer)
display.brewer.all()
When would each type of color class be appropriate?
Another great color palette: viridis Designed to be perfectly perceptually-uniform, both in color and black-and-white
Also designed to be perceived by people with the most common form of color blindness install.packages(“viridis”) ## Only have to do this once
library(viridis)
vignette(“intro-to-viridis”)
Think about what you’re trying to color (discrete vs continuous) and why!
Color schemes: scale_color_brewer(), scale_fill_brewer(), scale_color_gradient(), …
library(viridis)
## Loading required package: viridisLite
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=hwy)) +
scale_color_viridis()
## Why does this throw an error?
Error: Discrete value supplied to continuous scale
ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, color=drv)) + scale_color_viridis()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=drv)) +
scale_color_brewer(palette = "Accent") +
theme_bw()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=drv)) +
scale_color_viridis(discrete=TRUE) +
theme_bw()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=hwy)) +
scale_color_gradient() +
theme_bw()
Ewww, this plot looks terrible, don’t do red and green together!
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=hwy)) +
scale_color_gradient(low="green", high="red") +
theme_bw()
#
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=drv)) +
scale_color_manual(values=c("darkblue", "seagreen1", "deeppink2")) +
theme_bw()
x <- rnorm(1e2) x[between(x, -1, 1)]
diamonds %>% filter(between(depth, 60, 61)) %>% (view)
diamonds
starwars %>% dplyr::select(ends_with("color"))
starwars %>% dplyr::select(contains("_"))
starwars %>% dplyr::select(everything())
starwars %>% dplyr::select(gender, everything())
starwars %>% dplyr::rename(Name = name)
starwars %>% rename(Name = name)
grp <- diamonds %>% group_by(cut, color)
summarize(grp, avg_price = mean(price),
median_carat = median(carat),
count = n())
## `summarise()` has grouped output by 'cut'. You can override using the `.groups` argument.
grp
## And we can ungroup
grp %>%
ungroup() %>%
summarize(n())
grp
install.packages(“gapminder”) ## Only have to do this one time
library(gapminder)
my_gap <- data(gapminder)
# Answer the following questions:
# - How many observations are there in the data? How many observations per continent? How
# many observations are there per country within each continent? How many variables?
# What data type is each variable?
# - Calculate the average life expectancy per continent. Now calculate the median life
# expectancy per continent in 1952 and 2007.
# - Make a simple scatterplot of GDP per capita versus life expectancy for 2007, where the
# size of the points reflects the size of the population.
# - Now make the plot as ugly and uninformative as possible -- think bad color choices,
# truncated axes, overlapping points, ...
# - Now let's make it as informative as possible -- think x-axis on log10 scale, informative
# color/shape, etc....
# - Taking your nice looking plot, facet by continent. Using the country_colors object in
# gapminder, color the points by country.
# - Explore some other plots! Don't hesitate to use filter or select to focus on a subset.
#
# - Identify the countries with worst and best life expectancy in Asia for each year.
# Hint: use min_rank().
# - Identify which country had the sharpest 5-year drop in life expectancy in each continent.
# Hint: use lag() in a mutate() to create a new variable representing the lag.