Sources: The Art of R Programming (Norman Matloff); R Cookbook (Paul Teetor); http://www.cookbook-r.com/Manipulating_data/Changing_the_order_of_levels_of_a_factor/
# Turn a numeric vector into a factor: its distinct values become the levels.
x <- c(5, 12, 13, 12)
xf <- factor(x)
xf
## [1] 5 12 13 12
## Levels: 5 12 13
# str() shows the integer codes (1 2 3 2) stored alongside the level labels.
str(xf)
## Factor w/ 3 levels "5","12","13": 1 2 3 2
# The labels are kept as a character vector in the "levels" attribute.
attr(xf, "levels")
## [1] "5" "12" "13"
# length() counts the observations, not the levels.
length(xf)
## [1] 4
# Levels can be declared up front, including a value (88) that does not occur
# in the data yet.
x <- c(5, 12, 13, 12)
xff <- factor(x, levels = c(5, 12, 13, 88))
xff
## [1] 5 12 13 12
## Levels: 5 12 13 88
# Assigning a value that is one of the declared levels is fine.
xff[2] <- 88
xff
## [1] 5 88 13 12
## Levels: 5 12 13 88
# 28 is not among the declared levels, so this assignment is not valid:
# xff[2] <- 28
read.csv()
# Read the carrier lookup table with its text columns converted to factors.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads text columns
# as character by default; the Factor columns in the str() output below (and
# the factor/character contrast with airlines2) depend on the conversion.
airlines <- read.csv("C:/Data/flights/airlines.csv", stringsAsFactors = TRUE)
airlines
## carrier name
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
# View(airlines)
# Both columns are reported as factors with 16 levels each.
str(airlines)
## 'data.frame': 16 obs. of 2 variables:
## $ carrier: Factor w/ 16 levels "9E","AA","AS",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ name : Factor w/ 16 levels "AirTran Airways Corporation",..: 5 3 2 10 4 7 8 1 9 6 ...
# Same file again, but as.is = TRUE keeps the text columns as character
# vectors instead of converting them to factors.
airlines2 <- read.csv(
  file = "C:/Data/flights/airlines.csv",
  as.is = TRUE
)
airlines2
## carrier name
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
# This time both columns are reported as chr.
str(airlines2)
## 'data.frame': 16 obs. of 2 variables:
## $ carrier: chr "9E" "AA" "AS" "B6" ...
## $ name : chr "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...
# Some functions behave differently (or do not work at all) depending on
# whether they are given character data or factor data.
# nchar(airlines$carrier) # not valid (carrier is a factor in airlines)
nchar(airlines2$carrier)
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# as.character() and as.factor() convert a column between the two
# representations; str() confirms the type after each step.
airlines2$carrier <- as.character(airlines2$carrier)
str(airlines2$carrier)
## chr [1:16] "9E" "AA" "AS" "B6" "DL" "EV" "F9" "FL" ...
airlines2$carrier <- as.factor(airlines2$carrier)
str(airlines2$carrier)
## Factor w/ 16 levels "9E","AA","AS",..: 1 2 3 4 5 6 7 8 9 10 ...
# Without an explicit levels argument the levels come out in sorted order
# (large, medium, small).
sizes <- factor(c("small", "large", "large", "small", "medium"))
sizes
## [1] small large large small medium
## Levels: large medium small
# Rebuild the factor with the levels listed in their natural order; the
# observations themselves are unchanged.
sizes <- factor(sizes, levels = c("small", "medium", "large"))
sizes
## [1] small large large small medium
## Levels: small medium large
# A small record set held as a list of parallel vectors.
d <- list(
  gender = c("M", "M", "F", "M", "F", "F"),
  age = c(47, 59, 21, 32, 33, 24),
  income = c(55000, 88000, 34250, 76500, 123000, 45465)
)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
##
## $age
## [1] 47 59 21 32 33 24
##
## $income
## [1] 55000 88000 34250 76500 123000 45465
# split() partitions the incomes into one element per gender.
split(d$income, d$gender)
## $F
## [1] 34250 123000 45465
##
## $M
## [1] 55000 88000 76500
# Flag everyone older than 25 with 1 and everyone else with 0.
d$over25 <- as.numeric(d$age > 25)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
##
## $age
## [1] 47 59 21 32 33 24
##
## $income
## [1] 55000 88000 34250 76500 123000 45465
##
## $over25
## [1] 1 1 0 1 1 0
# Grouping on a list of two variables gives one element per combination,
# including the combination with no observations (M.0).
split(d$income, list(d$gender, d$over25))
## $F.0
## [1] 34250 45465
##
## $M.0
## numeric(0)
##
## $F.1
## [1] 123000
##
## $M.1
## [1] 55000 88000 76500
tapply(x, f, g) applies the function g to each group of x, where x is a vector and f is a factor (or list of factors) that defines the groups
ages <- c(25, 26, 55, 37, 21, 42)
affils <- c("R", "D", "D", "R", "U", "D")
# Mean age within each affiliation.
tapply(X = ages, INDEX = affils, FUN = mean)
## D R U
## 41 31 21
# could also be done with base R aggregate() function
aggregate(ages, by = list(affils), FUN = mean)
## Group.1 x
## 1 D 41
## 2 R 31
## 3 U 21
# WHAT IS THE DIFFERENCE BETWEEN WHAT IS RETURNED BY tapply() AND BY aggregate()? WHICH IS FASTER?
# Rebuild the record set so this example runs on its own.
d <- list(
  gender = c("M", "M", "F", "M", "F", "F"),
  age = c(47, 59, 21, 32, 33, 24),
  income = c(55000, 88000, 34250, 76500, 123000, 45465)
)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
##
## $age
## [1] 47 59 21 32 33 24
##
## $income
## [1] 55000 88000 34250 76500 123000 45465
# 1 for everyone older than 25, 0 otherwise.
d$over25 <- as.numeric(d$age > 25)
d
## $gender
## [1] "M" "M" "F" "M" "F" "F"
##
## $age
## [1] 47 59 21 32 33 24
##
## $income
## [1] 55000 88000 34250 76500 123000 45465
##
## $over25
## [1] 1 1 0 1 1 0
# With two grouping variables tapply() returns a gender-by-over25 table of
# mean incomes; the cell with no observations (M, 0) is NA.
tapply(d$income, list(d$gender, d$over25), mean)
## 0 1
## F 39857.5 123000.00
## M NA 73166.67
binning - splitting data into groups according to intervals
# Draw 100 values from the standard normal distribution. No seed is set, so
# the values differ from run to run.
x <- rnorm(n = 100, mean = 0, sd = 1)
x
## [1] -1.26808842 -0.13203060 0.39843774 -0.95800618 0.85826208
## [6] 2.16101171 -0.57480262 0.49989970 0.78778634 -1.45259631
## [11] -0.52759822 -0.80981767 1.27361735 0.58545749 -0.17522700
## [16] 2.40567748 0.71777319 -0.27504714 -1.41015238 -1.42428335
## [21] -0.49639846 -0.51610122 -0.09130968 0.27185427 -0.13988893
## [26] 2.64612744 -1.72083499 0.01722310 -0.35142115 0.11743942
## [31] 0.90016515 -0.42941285 -1.17221618 0.70554912 -1.31431183
## [36] 0.75521988 0.16030904 0.54753881 -0.15183530 0.92037087
## [41] -1.12549817 0.52333792 0.26807497 0.38647836 -0.44028945
## [46] 1.08325663 1.45267488 0.65364864 -1.18603251 0.07760420
## [51] -0.44095417 0.48061650 -0.90407485 0.14822760 0.07738265
## [56] -0.74575816 -0.51387533 -1.21794598 0.72124085 0.64918895
## [61] -1.78674191 -1.39182596 -0.33100425 1.43720184 -1.09213236
## [66] 0.81128596 -2.20872656 -0.01684372 -2.41917905 -1.61780077
## [71] -1.22419697 1.84229310 1.09138132 -0.58828757 -0.52678009
## [76] 0.14966843 0.17335177 0.83488263 0.35951259 0.24135203
## [81] -2.21837296 -1.56057434 0.75595211 1.87916751 -1.50685371
## [86] -0.32321218 0.02472768 -0.54243382 1.64338742 1.83265720
## [91] 1.27262297 0.15082793 -1.32386418 0.48212255 -1.35441915
## [96] -1.62325807 1.58356909 -0.19843085 0.06851890 1.10811122
# Unit-wide bins from -3 to 3; cut() maps each value of x to the interval
# (left-open, right-closed) it falls in and returns the result as a factor.
breaks <- seq(from = -3, to = 3, by = 1)
f <- cut(x, breaks = breaks)
f
## [1] (-2,-1] (-1,0] (0,1] (-1,0] (0,1] (2,3] (-1,0] (0,1]
## [9] (0,1] (-2,-1] (-1,0] (-1,0] (1,2] (0,1] (-1,0] (2,3]
## [17] (0,1] (-1,0] (-2,-1] (-2,-1] (-1,0] (-1,0] (-1,0] (0,1]
## [25] (-1,0] (2,3] (-2,-1] (0,1] (-1,0] (0,1] (0,1] (-1,0]
## [33] (-2,-1] (0,1] (-2,-1] (0,1] (0,1] (0,1] (-1,0] (0,1]
## [41] (-2,-1] (0,1] (0,1] (0,1] (-1,0] (1,2] (1,2] (0,1]
## [49] (-2,-1] (0,1] (-1,0] (0,1] (-1,0] (0,1] (0,1] (-1,0]
## [57] (-1,0] (-2,-1] (0,1] (0,1] (-2,-1] (-2,-1] (-1,0] (1,2]
## [65] (-2,-1] (0,1] (-3,-2] (-1,0] (-3,-2] (-2,-1] (-2,-1] (1,2]
## [73] (1,2] (-1,0] (-1,0] (0,1] (0,1] (0,1] (0,1] (0,1]
## [81] (-3,-2] (-2,-1] (0,1] (1,2] (-2,-1] (-1,0] (0,1] (-1,0]
## [89] (1,2] (1,2] (1,2] (0,1] (-2,-1] (0,1] (-2,-1] (-2,-1]
## [97] (1,2] (-1,0] (0,1] (1,2]
## Levels: (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3]
# For a factor, summary() reports the number of observations per level.
summary(f)
## (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3]
## 3 20 26 36 12 3
# Same bins, with descriptive labels in place of the interval notation.
f <- cut(
  x,
  breaks = breaks,
  labels = c("Bottom", "Low", "Neg", "Pos", "High", "Top")
)
f
## [1] Low Neg Pos Neg Pos Top Neg Pos Pos Low
## [11] Neg Neg High Pos Neg Top Pos Neg Low Low
## [21] Neg Neg Neg Pos Neg Top Low Pos Neg Pos
## [31] Pos Neg Low Pos Low Pos Pos Pos Neg Pos
## [41] Low Pos Pos Pos Neg High High Pos Low Pos
## [51] Neg Pos Neg Pos Pos Neg Neg Low Pos Pos
## [61] Low Low Neg High Low Pos Bottom Neg Bottom Low
## [71] Low High High Neg Neg Pos Pos Pos Pos Pos
## [81] Bottom Low Pos High Low Neg Pos Neg High High
## [91] High Pos Low Pos Low Low High Neg Pos High
## Levels: Bottom Low Neg Pos High Top
# The counts are unchanged; only the level names differ.
summary(f)
## Bottom Low Neg Pos High Top
## 3 20 26 36 12 3
#install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.3
# Load the Cars93 data set from the MASS package.
data(Cars93, package = "MASS")
# View() opens a spreadsheet-style viewer, which only makes sense in an
# interactive session; skip it when the script is run in batch or knitted.
if (interactive()) {
  View(Cars93)
}
# Horsepower over all cars, then one box per level of Origin.
boxplot(Cars93$Horsepower)
boxplot(Horsepower ~ Origin, data = Cars93)
conditioning plot - scatterplot with 2+ numeric variables and a factor
#install.packages("MASS")
library(MASS)
# Load the Cars93 data set from the MASS package.
data(Cars93, package = "MASS")
# View() opens a spreadsheet-style viewer, which only makes sense in an
# interactive session; skip it when the script is run in batch or knitted.
if (interactive()) {
  View(Cars93)
}
# Horsepower against city MPG, drawn as one panel per level of Origin.
coplot(Horsepower ~ MPG.city | Origin, data = Cars93)
# install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3