Annie
15 April 2016
This concept is really hard. It's totally normal to feel incredibly confused.
I also don't fully understand it myself (especially in R), so let's try to get our heads around this together.
A chair, an apple, a table, … also functions, vectors, and data.frames!
x <- c('hello', 'goodbye')
class(x)
[1] "character"
x <- 1
class(x)
[1] "numeric"
x <- data.frame(a = 1:10, b = 2:11)
class(x)
[1] "data.frame"
# Tiny example function: returns its input with one added to it.
testFunc <- function(x) {
  x + 1
}
testFunc(2)
[1] 3
class(testFunc)
[1] "function"
x <- table(c('M', 'M', 'F', 'F'),
c('White', 'Af-Am', 'Af-Am', 'Af-Am'))
class(x)
[1] "table"
data(iris)
head(iris, 2)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
mod <- glm(Sepal.Width ~ Sepal.Length, data = iris)
mod
Call: glm(formula = Sepal.Width ~ Sepal.Length, data = iris)
Coefficients:
(Intercept) Sepal.Length
3.41895 -0.06188
Degrees of Freedom: 149 Total (i.e. Null); 148 Residual
Null Deviance: 28.31
Residual Deviance: 27.92 AIC: 179.5
class(mod)
[1] "glm" "lm"
str(mod)
List of 30
$ coefficients : Named num [1:2] 3.4189 -0.0619
..- attr(*, "names")= chr [1:2] "(Intercept)" "Sepal.Length"
$ residuals : Named num [1:150] 0.3967 -0.1157 0.0719 -0.0343 0.4905 ...
..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
$ fitted.values : Named num [1:150] 3.1 3.12 3.13 3.13 3.11 ...
..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
$ effects : Named num [1:150] -37.4445 -0.6255 0.0564 -0.0485 0.471 ...
..- attr(*, "names")= chr [1:150] "(Intercept)" "Sepal.Length" "" "" ...
$ R : num [1:2, 1:2] -12.2 0 -71.6 10.1
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "Sepal.Length"
.. ..$ : chr [1:2] "(Intercept)" "Sepal.Length"
$ rank : int 2
$ qr :List of 5
..$ qr : num [1:150, 1:2] -12.2474 0.0816 0.0816 0.0816 0.0816 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:150] "1" "2" "3" "4" ...
.. .. ..$ : chr [1:2] "(Intercept)" "Sepal.Length"
..$ rank : int 2
..$ qraux: num [1:2] 1.08 1.09
..$ pivot: int [1:2] 1 2
..$ tol : num 1e-11
..- attr(*, "class")= chr "qr"
$ family :List of 11
..$ family : chr "gaussian"
..$ link : chr "identity"
..$ linkfun :function (mu)
..$ linkinv :function (eta)
..$ variance :function (mu)
..$ dev.resids:function (y, mu, wt)
..$ aic :function (y, n, mu, wt, dev)
..$ mu.eta :function (eta)
..$ initialize: expression({ n <- rep.int(1, nobs) if (is.null(etastart) && is.null(start) && is.null(mustart) && ((family$link == "inverse" && any(y == 0)) || (family$link == "log" && any(y <= 0)))) stop("cannot find valid starting values: please specify some") mustart <- y })
..$ validmu :function (mu)
..$ valideta :function (eta)
..- attr(*, "class")= chr "family"
$ linear.predictors: Named num [1:150] 3.1 3.12 3.13 3.13 3.11 ...
..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
$ deviance : num 27.9
$ aic : num 179
$ null.deviance : num 28.3
$ iter : int 2
$ weights : Named num [1:150] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
$ prior.weights : Named num [1:150] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
$ df.residual : int 148
$ df.null : int 149
$ y : Named num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
$ converged : logi TRUE
$ boundary : logi FALSE
$ model :'data.frame': 150 obs. of 2 variables:
..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
..- attr(*, "terms")=Classes 'terms', 'formula' length 3 Sepal.Width ~ Sepal.Length
.. .. ..- attr(*, "variables")= language list(Sepal.Width, Sepal.Length)
.. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. ..$ : chr [1:2] "Sepal.Width" "Sepal.Length"
.. .. .. .. ..$ : chr "Sepal.Length"
.. .. ..- attr(*, "term.labels")= chr "Sepal.Length"
.. .. ..- attr(*, "order")= int 1
.. .. ..- attr(*, "intercept")= int 1
.. .. ..- attr(*, "response")= int 1
.. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. .. ..- attr(*, "predvars")= language list(Sepal.Width, Sepal.Length)
.. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. .. ..- attr(*, "names")= chr [1:2] "Sepal.Width" "Sepal.Length"
$ call : language glm(formula = Sepal.Width ~ Sepal.Length, data = iris)
$ formula :Class 'formula' length 3 Sepal.Width ~ Sepal.Length
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
$ terms :Classes 'terms', 'formula' length 3 Sepal.Width ~ Sepal.Length
.. ..- attr(*, "variables")= language list(Sepal.Width, Sepal.Length)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "Sepal.Width" "Sepal.Length"
.. .. .. ..$ : chr "Sepal.Length"
.. ..- attr(*, "term.labels")= chr "Sepal.Length"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(Sepal.Width, Sepal.Length)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "Sepal.Width" "Sepal.Length"
$ data :'data.frame': 150 obs. of 5 variables:
..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
$ offset : NULL
$ control :List of 3
..$ epsilon: num 1e-08
..$ maxit : num 25
..$ trace : logi FALSE
$ method : chr "glm.fit"
$ contrasts : NULL
$ xlevels : Named list()
- attr(*, "class")= chr [1:2] "glm" "lm"
mod$coefficients
(Intercept) Sepal.Length
3.4189468 -0.0618848
mod$converged
[1] TRUE
mod$family
Family: gaussian
Link function: identity
data.frames are collections of vectors of different types.
glm objects store a bunch of attributes related to linear models.
R has special functions called generics that do different things depending on what type of object is passed to them.
print('hello')
[1] "hello"
print(mod)
Call: glm(formula = Sepal.Width ~ Sepal.Length, data = iris)
Coefficients:
(Intercept) Sepal.Length
3.41895 -0.06188
Degrees of Freedom: 149 Total (i.e. Null); 148 Residual
Null Deviance: 28.31
Residual Deviance: 27.92 AIC: 179.5
summary is another great example of a generic function.
Based on what type of object is passed in, it displays different summary statistics.
summary('hello')
Length Class Mode
1 character character
summary(c(1, 1, 0, 0, 0))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 0.0 0.4 1.0 1.0
summary(mod)
Call:
glm(formula = Sepal.Width ~ Sepal.Length, data = iris)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1095 -0.2454 -0.0167 0.2763 1.3338
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.41895 0.25356 13.48 <2e-16 ***
Sepal.Length -0.06188 0.04297 -1.44 0.152
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 0.1886193)
Null deviance: 28.307 on 149 degrees of freedom
Residual deviance: 27.916 on 148 degrees of freedom
AIC: 179.46
Number of Fisher Scoring iterations: 2
R has three types of objects: S3, S4, and Reference Classes (RC).
Read more about them on Hadley Wickham's website if you care, because we won't be covering all of them here.
For the rest of these slides, we'll be talking about S4, which is the strictest of these object types.
Okay, objects are cool. Generic functions operate on them, but how do we create them?
Classes are blueprints for creating objects. Creating a class doesn't create an object, but it tells us how to create the object.
Sort of like how creating a function doesn't run the function, but you can then call the function on other stuff.
# S4 class "pokemon": the blueprint shared by every pokemon object.
# Each slot is declared together with the type of value it may hold.
setClass(
  Class = "pokemon",
  slots = c(
    name    = "character",
    hp      = "numeric",
    attack  = "numeric",
    defense = "numeric",
    type    = "character"
  )
)
getSlots('pokemon')
name hp attack defense type
"character" "numeric" "numeric" "numeric" "character"
Remember, we haven't created any pokemon objects yet, but
now we know what attributes we need to create them.
Here's how we create an instance of the object.
# Build one instance of the "pokemon" class; each slot value is handed to
# new() as a named argument.
poke1 <- new(
  "pokemon",
  name    = "snorlax",
  hp      = 160,
  attack  = 110,
  defense = 65,
  type    = "normal"
)
print(poke1)
An object of class "pokemon"
Slot "name":
[1] "snorlax"
Slot "hp":
[1] 160
Slot "attack":
[1] 110
Slot "defense":
[1] 65
Slot "type":
[1] "normal"
S4 objects have slots, which you access through @
poke1@name
[1] "snorlax"
poke1@attack
[1] 110
A function that is associated with a generic function is
called a method. Let's write a method for the print
function to handle pokemon objects.
# print method for "pokemon" objects: writes a one-line battle announcement
# built from the object's name, type and attack slots.
#
# x   : a "pokemon" object.
# ... : accepted so the method's arguments match the print generic; unused.
#
# Returns x invisibly, as print methods conventionally do.
setMethod("print",
  # Means that it operates on the "pokemon" class
  signature = "pokemon",
  # What the print function should do
  function(x, ...) {
    # End the line with a newline so later console output starts on a
    # fresh line instead of running on after the announcement.
    cat(sprintf("%s, go! This %s pokemon has an attack strength of %s.\n",
                toupper(x@name),
                x@type,
                x@attack))
    invisible(x)
  })
print(poke1)
SNORLAX, go! This normal pokemon has an attack strength of 110.
Now we can go around creating more pokemon, but what if they have different attributes?
For example, we might want to store how long a Snorlax sleeps, or how poisonous a Bulbasaur is.
All of these pokemon have the same attributes as the base
pokemon class, but also have slightly different attributes.
To deal with this, we can create new classes that inherit things from a base class.
# S4 class "snorlax": a specialised pokemon.
setClass(
  Class = "snorlax",
  # Parent class whose slots are inherited
  contains = "pokemon",
  # Extra attribute that only a snorlax has
  slots = c(sleep = "numeric"),
  # Default slot values used when new() is not given them explicitly
  prototype = list(
    name    = "snorlax",
    hp      = 160,
    attack  = 110,
    defense = 65,
    type    = "normal"
  )
)
snor_snor <- new("snorlax", sleep = 12, attack = 0)
What do you think will happen if we run
print(snor_snor)
print(snor_snor)
SNORLAX, go! This normal pokemon has an attack strength of 0.
First, we're creating a new generic function fight.
# Declare the S4 generic "fight". It takes a single argument, `object`, and
# hands the call over to whichever method is registered for that argument.
setGeneric(
  "fight",
  def = function(object) standardGeneric("fight")
)
Now, we're going to create a method for fight that applies to the pokemon class.
# Simulate a single attack by `object`.
#
# One value is drawn from a normal distribution whose mean is the object's
# attack slot and whose standard deviation is a fifth of its defense slot.
# A status line describing the draw is written to the console, and the drawn
# value itself is returned.
fight.pokemon <- function(object) {
  outcome <- rnorm(1, mean = object@attack, sd = object@defense / 5)

  # Choose the status line that matches the drawn value
  status <- if (outcome > 100) {
    "It's super effective!\n"
  } else if (outcome < 0) {
    "You use splash. But nothing happened.\n"
  } else {
    "It's not really effective.\n"
  }
  cat(status)

  outcome
}
setMethod("fight", "pokemon", fight.pokemon)
fight(snor_snor)
You use splash. But nothing happened.
[1] -5.885751
We're going to move onto modeling in the next section, so things are going to get a bit more complicated.
Further references (I would read in this order):