Sources: The Art of R Programming (Norman Matloff); R Cookbook (Paul Teetor); http://www.cookbook-r.com/Manipulating_data/Changing_the_order_of_levels_of_a_factor/

Factors

factors - overview

# Build a factor from a numeric vector; the distinct values become the levels.
x <- c(5, 12, 13, 12)
xf <- factor(x)
xf
## [1] 5  12 13 12
## Levels: 5 12 13
# str() exposes the integer codes (1 2 3 2) that index into the levels.
str(xf)
##  Factor w/ 3 levels "5","12","13": 1 2 3 2
levels(xf)
## [1] "5"  "12" "13"
# length() counts the data elements, not the levels.
length(xf)
## [1] 4

# Levels can be declared up front, including ones not yet present in the data.
xff <- factor(x, levels = c(5, 12, 13, 88))
xff
## [1] 5  12 13 12
## Levels: 5 12 13 88
# 88 is one of the declared levels, so this assignment is allowed.
xff[2] <- 88
xff
## [1] 5  88 13 12
## Levels: 5 12 13 88
# xff[2] <- 28     # not valid: 28 is not a level, so R warns and stores NA

What is the default behavior when I load character data? How do I override this default behavior?

read.csv()

# Read the airline lookup table with its string columns converted to factors.
# read.csv() did this by default before R 4.0.0; from 4.0.0 on the default is
# stringsAsFactors = FALSE, so the argument is spelled out here to reproduce
# the factor columns shown below on any R version.
airlines <- read.csv("C:/Data/flights/airlines.csv", stringsAsFactors = TRUE)
airlines
##    carrier                        name
## 1       9E           Endeavor Air Inc.
## 2       AA      American Airlines Inc.
## 3       AS        Alaska Airlines Inc.
## 4       B6             JetBlue Airways
## 5       DL        Delta Air Lines Inc.
## 6       EV    ExpressJet Airlines Inc.
## 7       F9      Frontier Airlines Inc.
## 8       FL AirTran Airways Corporation
## 9       HA      Hawaiian Airlines Inc.
## 10      MQ                   Envoy Air
## 11      OO       SkyWest Airlines Inc.
## 12      UA       United Air Lines Inc.
## 13      US             US Airways Inc.
## 14      VX              Virgin America
## 15      WN      Southwest Airlines Co.
## 16      YV          Mesa Airlines Inc.
# View(airlines)
str(airlines)
## 'data.frame':    16 obs. of  2 variables:
##  $ carrier: Factor w/ 16 levels "9E","AA","AS",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ name   : Factor w/ 16 levels "AirTran Airways Corporation",..: 5 3 2 10 4 7 8 1 9 6 ...
# as.is = TRUE switches the conversion off: string columns stay character.
airlines2 <- read.csv("C:/Data/flights/airlines.csv", as.is = TRUE)
airlines2
##    carrier                        name
## 1       9E           Endeavor Air Inc.
## 2       AA      American Airlines Inc.
## 3       AS        Alaska Airlines Inc.
## 4       B6             JetBlue Airways
## 5       DL        Delta Air Lines Inc.
## 6       EV    ExpressJet Airlines Inc.
## 7       F9      Frontier Airlines Inc.
## 8       FL AirTran Airways Corporation
## 9       HA      Hawaiian Airlines Inc.
## 10      MQ                   Envoy Air
## 11      OO       SkyWest Airlines Inc.
## 12      UA       United Air Lines Inc.
## 13      US             US Airways Inc.
## 14      VX              Virgin America
## 15      WN      Southwest Airlines Co.
## 16      YV          Mesa Airlines Inc.
str(airlines2)
## 'data.frame':    16 obs. of  2 variables:
##  $ carrier: chr  "9E" "AA" "AS" "B6" ...
##  $ name   : chr  "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...
# some functions behave differently (or don't work at all) depending on whether you are working with character data or factor data

# nchar(airlines$carrier) # not valid when carrier is a factor: nchar() requires a character vector
nchar(airlines2$carrier)
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Convert explicitly between the two representations with
# as.character() and as.factor().
airlines2$carrier <- as.character(airlines2$carrier)
str(airlines2$carrier)
##  chr [1:16] "9E" "AA" "AS" "B6" "DL" "EV" "F9" "FL" ...
airlines2$carrier <- as.factor(airlines2$carrier)
str(airlines2$carrier)
##  Factor w/ 16 levels "9E","AA","AS",..: 1 2 3 4 5 6 7 8 9 10 ...

How do I change the order of factor levels?

# With no explicit levels, factor() uses the sorted unique values as levels.
sizes <- factor(x = c("small", "large", "large", "small", "medium"))
sizes
## [1] small  large  large  small  medium
## Levels: large medium small
# Rebuild the factor with the levels listed in the desired order. The data
# values are untouched; only the ordering of the levels changes.
sizes <- factor(
  x = sizes,
  levels = c("small", "medium", "large")
)
sizes
## [1] small  large  large  small  medium
## Levels: small medium large

How do I use the split() function to group data by factors?

# Parallel vectors describing six people, kept together in a named list.
d <- list(
  gender = c("M", "M", "F", "M", "F", "F"),
  age = c(47, 59, 21, 32, 33, 24),
  income = c(55000, 88000, 34250, 76500, 123000, 45465)
)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
## 
## $age
## [1] 47 59 21 32 33 24
## 
## $income
## [1]  55000  88000  34250  76500 123000  45465
# One grouping vector: the incomes are split into one vector per gender.
with(d, split(income, gender))
## $F
## [1]  34250 123000  45465
## 
## $M
## [1] 55000 88000 76500
# Flag everyone older than 25 with 1, everyone else with 0.
d$over25 <- as.numeric(d$age > 25)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
## 
## $age
## [1] 47 59 21 32 33 24
## 
## $income
## [1]  55000  88000  34250  76500 123000  45465
## 
## $over25
## [1] 1 1 0 1 1 0
# A list of grouping vectors splits on every combination of their values;
# a combination with no members (M.0 here) comes back as an empty vector.
with(d, split(income, list(gender, over25)))
## $F.0
## [1] 34250 45465
## 
## $M.0
## numeric(0)
## 
## $F.1
## [1] 123000
## 
## $M.1
## [1] 55000 88000 76500

How do I use tapply() to apply a function to every group?

tapply(x,f,g) where x is a vector, f is a factor or list of factors, and g is a function

# Six ages and a matching affiliation code for each.
ages <- c(25, 26, 55, 37, 21, 42)
affils <- c("R", "D", "D", "R", "U", "D")
# Mean age within each affiliation group.
tapply(X = ages, INDEX = affils, FUN = mean)
##  D  R  U 
## 41 31 21
# could also be done with base R aggregate() function
aggregate(x = ages, by = list(affils), FUN = mean)
##   Group.1  x
## 1       D 41
## 2       R 31
## 3       U 21
# As the two outputs show, tapply() returns a named array while aggregate()
# returns a data frame with one row per group.
# WHAT IS THE DIFFERENCE BETWEEN WHAT IS RETURNED BY tapply() AND BY aggregate()?  WHICH IS FASTER?

How do I use tapply() over more than one set of factors at once?

# Parallel vectors describing six people, kept together in a named list.
d <- list(
  gender = c("M", "M", "F", "M", "F", "F"),
  age = c(47, 59, 21, 32, 33, 24),
  income = c(55000, 88000, 34250, 76500, 123000, 45465)
)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
## 
## $age
## [1] 47 59 21 32 33 24
## 
## $income
## [1]  55000  88000  34250  76500 123000  45465
# Flag everyone older than 25 with 1, everyone else with 0.
d$over25 <- as.numeric(d$age > 25)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
## 
## $age
## [1] 47 59 21 32 33 24
## 
## $income
## [1]  55000  88000  34250  76500 123000  45465
## 
## $over25
## [1] 1 1 0 1 1 0
# Two grouping vectors give a gender-by-over25 table of mean incomes;
# the cell with no members (M, 0) is NA.
with(d, tapply(income, list(gender, over25), mean))
##         0         1
## F 39857.5 123000.00
## M      NA  73166.67

How do I create a factor from continuous data? Why would I want to?

binning - splitting data into groups according to intervals

# 100 draws from the standard normal. No seed is set, so the values (and the
# bin counts further down) are from one particular run and change on re-run.
x <- rnorm(n = 100)
x
##   [1] -1.26808842 -0.13203060  0.39843774 -0.95800618  0.85826208
##   [6]  2.16101171 -0.57480262  0.49989970  0.78778634 -1.45259631
##  [11] -0.52759822 -0.80981767  1.27361735  0.58545749 -0.17522700
##  [16]  2.40567748  0.71777319 -0.27504714 -1.41015238 -1.42428335
##  [21] -0.49639846 -0.51610122 -0.09130968  0.27185427 -0.13988893
##  [26]  2.64612744 -1.72083499  0.01722310 -0.35142115  0.11743942
##  [31]  0.90016515 -0.42941285 -1.17221618  0.70554912 -1.31431183
##  [36]  0.75521988  0.16030904  0.54753881 -0.15183530  0.92037087
##  [41] -1.12549817  0.52333792  0.26807497  0.38647836 -0.44028945
##  [46]  1.08325663  1.45267488  0.65364864 -1.18603251  0.07760420
##  [51] -0.44095417  0.48061650 -0.90407485  0.14822760  0.07738265
##  [56] -0.74575816 -0.51387533 -1.21794598  0.72124085  0.64918895
##  [61] -1.78674191 -1.39182596 -0.33100425  1.43720184 -1.09213236
##  [66]  0.81128596 -2.20872656 -0.01684372 -2.41917905 -1.61780077
##  [71] -1.22419697  1.84229310  1.09138132 -0.58828757 -0.52678009
##  [76]  0.14966843  0.17335177  0.83488263  0.35951259  0.24135203
##  [81] -2.21837296 -1.56057434  0.75595211  1.87916751 -1.50685371
##  [86] -0.32321218  0.02472768 -0.54243382  1.64338742  1.83265720
##  [91]  1.27262297  0.15082793 -1.32386418  0.48212255 -1.35441915
##  [96] -1.62325807  1.58356909 -0.19843085  0.06851890  1.10811122
# Unit-wide bin edges from -3 to 3. cut() places each value in a right-closed
# interval (a, b]; a value outside (-3, 3] would be returned as NA.
breaks <- seq(-3, 3, by = 1)
f <- cut(x, breaks = breaks)
f
##   [1] (-2,-1] (-1,0]  (0,1]   (-1,0]  (0,1]   (2,3]   (-1,0]  (0,1]  
##   [9] (0,1]   (-2,-1] (-1,0]  (-1,0]  (1,2]   (0,1]   (-1,0]  (2,3]  
##  [17] (0,1]   (-1,0]  (-2,-1] (-2,-1] (-1,0]  (-1,0]  (-1,0]  (0,1]  
##  [25] (-1,0]  (2,3]   (-2,-1] (0,1]   (-1,0]  (0,1]   (0,1]   (-1,0] 
##  [33] (-2,-1] (0,1]   (-2,-1] (0,1]   (0,1]   (0,1]   (-1,0]  (0,1]  
##  [41] (-2,-1] (0,1]   (0,1]   (0,1]   (-1,0]  (1,2]   (1,2]   (0,1]  
##  [49] (-2,-1] (0,1]   (-1,0]  (0,1]   (-1,0]  (0,1]   (0,1]   (-1,0] 
##  [57] (-1,0]  (-2,-1] (0,1]   (0,1]   (-2,-1] (-2,-1] (-1,0]  (1,2]  
##  [65] (-2,-1] (0,1]   (-3,-2] (-1,0]  (-3,-2] (-2,-1] (-2,-1] (1,2]  
##  [73] (1,2]   (-1,0]  (-1,0]  (0,1]   (0,1]   (0,1]   (0,1]   (0,1]  
##  [81] (-3,-2] (-2,-1] (0,1]   (1,2]   (-2,-1] (-1,0]  (0,1]   (-1,0] 
##  [89] (1,2]   (1,2]   (1,2]   (0,1]   (-2,-1] (0,1]   (-2,-1] (-2,-1]
##  [97] (1,2]   (-1,0]  (0,1]   (1,2]  
## Levels: (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3]
# summary() on a factor counts the observations in each level.
summary(f)
## (-3,-2] (-2,-1]  (-1,0]   (0,1]   (1,2]   (2,3] 
##       3      20      26      36      12       3
# Same bins, but with a readable label per interval instead of the
# default "(a,b]" names.
f <- cut(
  x,
  breaks = breaks,
  labels = c("Bottom", "Low", "Neg", "Pos", "High", "Top")
)
f
##   [1] Low    Neg    Pos    Neg    Pos    Top    Neg    Pos    Pos    Low   
##  [11] Neg    Neg    High   Pos    Neg    Top    Pos    Neg    Low    Low   
##  [21] Neg    Neg    Neg    Pos    Neg    Top    Low    Pos    Neg    Pos   
##  [31] Pos    Neg    Low    Pos    Low    Pos    Pos    Pos    Neg    Pos   
##  [41] Low    Pos    Pos    Pos    Neg    High   High   Pos    Low    Pos   
##  [51] Neg    Pos    Neg    Pos    Pos    Neg    Neg    Low    Pos    Pos   
##  [61] Low    Low    Neg    High   Low    Pos    Bottom Neg    Bottom Low   
##  [71] Low    High   High   Neg    Neg    Pos    Pos    Pos    Pos    Pos   
##  [81] Bottom Low    Pos    High   Low    Neg    Pos    Neg    High   High  
##  [91] High   Pos    Low    Pos    Low    Low    High   Neg    Pos    High  
## Levels: Bottom Low Neg Pos High Top
summary(f)
## Bottom    Low    Neg    Pos   High    Top 
##      3     20     26     36     12      3

How do I create a box plot by factor level?

# install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.3
data(Cars93, package = "MASS")
# View() opens an interactive data viewer, so it is only called when a user
# is present; batch or knitted runs skip it instead of failing.
if (interactive()) {
  View(Cars93)
}

# A single box summarising Horsepower across all cars.
boxplot(Cars93$Horsepower)

# Formula interface: one box per level of the Origin factor.
boxplot(Horsepower ~ Origin, data = Cars93)

How do I create multiple scatterplots, one for each factor level?

conditioning plot - scatterplot with 2+ numeric variables and a factor

# install.packages("MASS")
library(MASS)
data(Cars93, package = "MASS")
# View() opens an interactive data viewer, so it is only called when a user
# is present; batch or knitted runs skip it instead of failing.
if (interactive()) {
  View(Cars93)
}

# Horsepower against city MPG, drawn as one scatterplot panel per level of
# the Origin factor.
coplot(Horsepower ~ MPG.city | Origin, data = Cars93)

TO DO: For the diamonds dataset in the ggplot2 package, what is the average price of a diamond for each cut?

# install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3