#### Calculator
# Arithmetic
2 * 10
## [1] 20
1 + 2
## [1] 3
# Order of operations is preserved
1 + 5 * 10
## [1] 51
(1 + 5) * 10
## [1] 60
# Exponents use the ^ symbol
2^5
## [1] 32
9^(1/2)
## [1] 3
#### Vectors
# Create a vector with the c (short for combine) function
c(1, 4, 6, 7)
## [1] 1 4 6 7
c(1:5, 10)
## [1] 1 2 3 4 5 10
# or use a function
# (seq is short for sequence)
seq(1, 10, by = 2)
## [1] 1 3 5 7 9
seq(0, 50, length = 11)
## [1] 0 5 10 15 20 25 30 35 40 45 50
seq(1, 50, length = 11)
## [1] 1.0 5.9 10.8 15.7 20.6 25.5 30.4 35.3 40.2 45.1 50.0
1:10 # short for seq(1, 10, by = 1), or just
## [1] 1 2 3 4 5 6 7 8 9 10
seq(1, 10)
## [1] 1 2 3 4 5 6 7 8 9 10
5:1
## [1] 5 4 3 2 1
# non-integer sequences
# (Note: the [1] at the beginning of lines indicates
# the index of the first value in that row)
seq(0, 100*pi, by = pi)
## [1] 0.000000 3.141593 6.283185 9.424778 12.566371 15.707963
## [7] 18.849556 21.991149 25.132741 28.274334 31.415927 34.557519
## [13] 37.699112 40.840704 43.982297 47.123890 50.265482 53.407075
## [19] 56.548668 59.690260 62.831853 65.973446 69.115038 72.256631
## [25] 75.398224 78.539816 81.681409 84.823002 87.964594 91.106187
## [31] 94.247780 97.389372 100.530965 103.672558 106.814150 109.955743
## [37] 113.097336 116.238928 119.380521 122.522113 125.663706 128.805299
## [43] 131.946891 135.088484 138.230077 141.371669 144.513262 147.654855
## [49] 150.796447 153.938040 157.079633 160.221225 163.362818 166.504411
## [55] 169.646003 172.787596 175.929189 179.070781 182.212374 185.353967
## [61] 188.495559 191.637152 194.778745 197.920337 201.061930 204.203522
## [67] 207.345115 210.486708 213.628300 216.769893 219.911486 223.053078
## [73] 226.194671 229.336264 232.477856 235.619449 238.761042 241.902634
## [79] 245.044227 248.185820 251.327412 254.469005 257.610598 260.752190
## [85] 263.893783 267.035376 270.176968 273.318561 276.460154 279.601746
## [91] 282.743339 285.884931 289.026524 292.168117 295.309709 298.451302
## [97] 301.592895 304.734487 307.876080 311.017673 314.159265
#### Assign variables
# Assign a vector to a variable with <-
a <- 1:5
a
## [1] 1 2 3 4 5
b <- seq(15, 3, length = 5)
b
## [1] 15 12 9 6 3
c <- a * b
c
## [1] 15 24 27 24 15
#### Basic functions
# Lots of familiar functions work
a
## [1] 1 2 3 4 5
sum(a)
## [1] 15
prod(a)
## [1] 120
mean(a)
## [1] 3
sd(a)
## [1] 1.581139
var(a)
## [1] 2.5
min(a)
## [1] 1
median(a)
## [1] 3
max(a)
## [1] 5
range(a)
## [1] 1 5
#### Extracting subsets
# Specify the indices you want in the square brackets []
a <- seq(0, 100, by = 10)
# blank = include all
a
## [1] 0 10 20 30 40 50 60 70 80 90 100
a[]
## [1] 0 10 20 30 40 50 60 70 80 90 100
# integer +=include, 0=include none, -=exclude
a[5]
## [1] 40
a[c(2, 4, 6, 8)]
## [1] 10 30 50 70
a[0]
## numeric(0)
a[-c(2, 4, 6, 8)]
## [1] 0 20 40 60 80 90 100
a[c(1, 1, 1, 6, 6, 9)] # subsets can be bigger than the original set
## [1] 0 0 0 50 50 80
a[c(1,2)] <- c(333, 555) # update a subset
a
## [1] 333 555 20 30 40 50 60 70 80 90 100
#### Boolean
a
## [1] 333 555 20 30 40 50 60 70 80 90 100
(a > 50) # TRUE/FALSE for each element
## [1] TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
which(a > 50) # which indicies are TRUE
## [1] 1 2 7 8 9 10 11
a[(a > 50)]
## [1] 333 555 60 70 80 90 100
!(a > 50) # ! negates (flips) TRUE/FALSE values
## [1] FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
a[!(a > 50)]
## [1] 20 30 40 50
#### Comparison
# Here they are: < > <= >= != == %in%
a
## [1] 333 555 20 30 40 50 60 70 80 90 100
# equal to
a[(a == 50)]
## [1] 50
# equal to
a[(a == 55)]
## numeric(0)
# not equal to
a[(a != 50)]
## [1] 333 555 20 30 40 60 70 80 90 100
# greater than
a[(a > 50)]
## [1] 333 555 60 70 80 90 100
# less than
a[(a < 50)]
## [1] 20 30 40
# less than or equal to
a[(a <= 50)]
## [1] 20 30 40 50
# which values on left are in the vector on right
(c(10, 14, 40, 60, 99) %in% a)
## [1] FALSE FALSE TRUE TRUE FALSE
#### Boolean
# & and, | or, ! not
a
## [1] 333 555 20 30 40 50 60 70 80 90 100
a[(a >= 50) & (a <= 90)]
## [1] 50 60 70 80 90
a[(a < 50) | (a > 100)]
## [1] 333 555 20 30 40
a[(a < 50) | !(a > 100)]
## [1] 20 30 40 50 60 70 80 90 100
a[(a >= 50) & !(a <= 90)]
## [1] 333 555 100
#### Missing values
NA + 8
## [1] NA
3 * NA
## [1] NA
mean(c(1, 2, NA))
## [1] NA
# Many functions have an na.rm argument (NA remove)
mean(c(NA, 1, 2), na.rm = TRUE)
## [1] 1.5
sum(c(NA, 1, 2))
## [1] NA
sum(c(NA, 1, 2), na.rm = TRUE)
## [1] 3
# Or you can remove them yourself
a <- c(NA, 1:5, NA)
a
## [1] NA 1 2 3 4 5 NA
is.na(a) # which values are missing?
## [1] TRUE FALSE FALSE FALSE FALSE FALSE TRUE
!is.na(a) # which values are NOT missing?
## [1] FALSE TRUE TRUE TRUE TRUE TRUE FALSE
a[!is.na(a)] # return those which are NOT missing
## [1] 1 2 3 4 5
a # note, this did not change the variable a
## [1] NA 1 2 3 4 5 NA
# To save the results of removing the NAs,
# assign to another variable or reassign to the original variable
# Warning: if you write over variable a then the original version is gone forever!
a <- a[!is.na(a)]
a
## [1] 1 2 3 4 5
## #### Review
## <-
## + - * / ^
## c()
## seq() # by=, length=
## sum(), prod(), mean(), sd(), var(),
## min(), median(), max(), range()
## a[]
## (a > 1), ==, !=, >, <, >=, <=, %in%
## &, |, !
## NA, mean(a, na.rm = TRUE), !is.na()
## #### Installing
## # only needed once after installing or upgrading R
## install.packages("ggplot2")
#### Library
# each time you start R
# load package ggplot2 for its functions and datasets
library(ggplot2)
# ggplot2 includes a dataset "mpg"
# ? gives help on a function or dataset
?mpg
#### mpg dataset
# head() lists the first several rows of a data.frame
head(mpg)
## # A tibble: 6 × 11
## manufacturer model displ year cyl trans drv cty hwy fl
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p
## # ... with 1 more variables: class <chr>
# str() gives the structure of the object
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...
# summary() gives frequeny tables for categorical variables
# and mean and five-number summaries for continuous variables
summary(mpg)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
#### ggplot_mpg_displ_hwy
# specify the dataset and variables
p <- ggplot(mpg, aes(x = displ, y = hwy))
p <- p + geom_point() # add a plot layer with points
print(p)

#### ggplot_mpg_displ_hwy_colour_class
p <- ggplot(mpg, aes(x = displ, y = hwy))
p <- p + geom_point(aes(colour = class))
print(p)

#### ggplot_mpg_displ_hwy_colour_class_size_cyl_shape_drv
p <- ggplot(mpg, aes(x = displ, y = hwy))
p <- p + geom_point(aes(colour = class, size = cyl, shape = drv))
print(p)

#### ggplot_mpg_displ_hwy_colour_class_size_cyl_shape_drv_alpha
p <- ggplot(mpg, aes(x = displ, y = hwy))
p <- p + geom_point(aes(colour = class, size = cyl, shape = drv)
, alpha = 1/4) # alpha is the opacity
print(p)

#### ggplot_mpg_displ_hwy_facet
# start by creating a basic scatterplot
p <- ggplot(mpg, aes(x = displ, y = hwy))
p <- p + geom_point()
## two methods
# facet_grid(rows ~ cols) for 2D grid, "." for no split.
# facet_wrap(~ var) for 1D ribbon wrapped into 2D.
# examples of subsetting the scatterplot in facets
p1 <- p + facet_grid(. ~ cyl) # columns are cyl categories
p2 <- p + facet_grid(drv ~ .) # rows are drv categories
p3 <- p + facet_grid(drv ~ cyl) # both
p4 <- p + facet_wrap(~ class) # wrap plots by class category
# plot all four in one arrangement
library(gridExtra)
grid.arrange(p1, p2, p3, p4, ncol = 2)

#### ggplot_mpg_cty_hwy
p <- ggplot(mpg, aes(x = cty, y = hwy))
p <- p + geom_point()
print(p)

#### ggplot_mpg_cty_hwy_jitter
p <- ggplot(mpg, aes(x = cty, y = hwy))
p <- p + geom_point(position = "jitter", alpha = 1/2)
print(p)

#### ggplot_mpg_class_hwy
p <- ggplot(mpg, aes(x = class, y = hwy))
p <- p + geom_point()
print(p)

#### ggplot_mpg_reorder_class_hwy
p <- ggplot(mpg, aes(x = reorder(class, hwy), y = hwy))
p <- p + geom_point()
print(p)

#### ggplot_mpg_reorder_class_hwy_jitter
p <- ggplot(mpg, aes(x = reorder(class, hwy), y = hwy))
p <- p + geom_point(position = "jitter")
print(p)

#### ggplot_mpg_reorder_class_hwy_jitter_less
p <- ggplot(mpg, aes(x = reorder(class, hwy), y = hwy))
p <- p + geom_jitter(position = position_jitter(width = 0.1))
print(p)

#### ggplot_mpg_reorder_class_hwy_boxplot
p <- ggplot(mpg, aes(x = reorder(class, hwy), y = hwy))
p <- p + geom_boxplot()
print(p)

#### ggplot_mpg_reorder_class_hwy_jitter_boxplot
p <- ggplot(mpg, aes(x = reorder(class, hwy), y = hwy))
p <- p + geom_jitter(position = position_jitter(width = 0.1))
p <- p + geom_boxplot(alpha = 0.5)
print(p)

#### ggplot_mpg_reorder_class_hwy_boxplot_jitter
p <- ggplot(mpg, aes(x = reorder(class, hwy), y = hwy))
p <- p + geom_boxplot(alpha = 0.5)
p <- p + geom_jitter(position = position_jitter(width = 0.1))
print(p)

#### ggplot_mpg_reorder_class_hwy_boxplot_jitter_median
p <- ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy))
p <- p + geom_boxplot(alpha = 0.5)
p <- p + geom_jitter(position = position_jitter(width = 0.1))
print(p)

## #### Review
## library(ggplot2)
## ?help
## head()
## str()
## summary()
## ggplot(df)
## geom_point()
## aes()
## colour, size, shape, alpha
## position = "jitter"
## geom_jitter(position = position_jitter(width = 0.1))
## geom_boxplot()
## facet_grid()
## facet_wrap()
## reorder()
## median()