Contents
- Matrices
- Factors
- Lists
- Data frames
- Reading dataframes from file (first iteration)
- Plotting with dataframes
november 2015
Contents
m <- matrix(1:10, nrow = 2, ncol = 5); m
[,1] [,2] [,3] [,4] [,5] [1,] 1 3 5 7 9 [2,] 2 4 6 8 10
v <- 1:10; dim(v) <- c(2, 5); v
[,1] [,2] [,3] [,4] [,5] [1,] 1 3 5 7 9 [2,] 2 4 6 8 10
factor(x), as.factor(x), factor(x, levels = my_levels)
eye_colors <- c("green", "blue", "brown", "brown", "blue", "brown", "brown", "brown", "blue", "brown", "green", "brown", "brown", "blue", "blue", "brown")
plot(eye_colors)
Error in plot.window(...): need finite 'ylim' values
eye_colors <- as.factor(eye_colors)
plot(eye_colors)
table(eye_colors)
eye_colors
blue brown green
5 9 2
sum(eye_colors == "blue")
[1] 5
classSizes <- factor(
c("big","small","huge","huge","small","big","small","big"),
levels = c("small", "normal", "big", "huge"), ordered = TRUE)
plot(classSizes)
classSizes < "big"
[1] FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE
sum(classSizes == "huge")
[1] 2
When you already have an unordered factor, you can make it ordered by using the function ordered() together with the levels vector
classSizes <- factor(c("big","small","huge","huge","small","big","small","big"))
classSizes <- ordered(classSizes, levels = c("small", "big", "huge"))
classSizes
[1] big small huge huge small big small big Levels: small < big < huge
Factors are used all the time e.g. for defining treated/untreated. That's why R knows how to deal with them so well:
with(ChickWeight, plot(weight ~ Diet))
[[]]
x <- c(2, 3, 1)
y <- c("foo", "bar")
l <- list(x, y); l
[[1]] [1] 2 3 1 [[2]] [1] "foo" "bar"
l[[2]]
[1] "foo" "bar"
l[[1]][2]
[1] 3
Lists can also have named elements
x <- c(2, 3, 1)
y <- c("foo", "bar")
l <- list("numbers" = x, "words" = y)
l
$numbers [1] 2 3 1 $words [1] "foo" "bar"
Accessing named elements can be done in three ways
l[[2]] # index
[1] "foo" "bar"
l[["words"]] # name of element with double brackets
[1] "foo" "bar"
l$words # name of element with dollar sign
[1] "foo" "bar"
Accessing named elements has its limitations
select <- "words"
l[[select]] ## OK
[1] "foo" "bar"
l$select ##fails - no element with name "select"
NULL
Single brackets on a list return a list; double brackets return the element itself (here, a vector)
l[[2]]
[1] "foo" "bar"
l[2]
$words [1] "foo" "bar"
l["words"]
$words [1] "foo" "bar"
This behaviour can become awkward
l["words"]$words
[1] "foo" "bar"
l[2]["words"][1]$words ## mind****
[1] "foo" "bar"
dim) attribute
array() function
x <- 1:10
dim(x) <- c(2, 5)
x
[,1] [,2] [,3] [,4] [,5] [1,] 1 3 5 7 9 [2,] 2 4 6 8 10
class(x)
[1] "matrix"
a <- array(data = 1:12, dim = c(2, 3, 2))
# same as "a <- 1:12; dim(a) <- c(2, 3, 2)"
rownames(a) <- c("foo", "bar")
a
, , 1
[,1] [,2] [,3]
foo 1 3 5
bar 2 4 6
, , 2
[,1] [,2] [,3]
foo 7 9 11
bar 8 10 12
class(a)
[1] "array"
geneNames <- c("P53","BRCA1","VAMP1", "FHIT")
sig <- c(TRUE, TRUE, FALSE, FALSE)
meanExp <- c(4.5, 7.3, 5.4, 2.4)
genes <- data.frame(
"name" = geneNames,
"significant" = sig,
"expression" = meanExp)
genes
name significant expression 1 P53 TRUE 4.5 2 BRCA1 TRUE 7.3 3 VAMP1 FALSE 5.4 4 FHIT FALSE 2.4
genes[2,1] #row 2, element 1
[1] BRCA1 Levels: BRCA1 FHIT P53 VAMP1
genes[, 1:2] #columns 1 and 2
name significant 1 P53 TRUE 2 BRCA1 TRUE 3 VAMP1 FALSE 4 FHIT FALSE
genes[1:2] #columns 1 and 2 (!)
name significant 1 P53 TRUE 2 BRCA1 TRUE 3 VAMP1 FALSE 4 FHIT FALSE
genes[1:2,] #row 1 and 2
name significant expression 1 P53 TRUE 4.5 2 BRCA1 TRUE 7.3
genes[c("name", "expression")] #columns "name" and "expression"
name expression 1 P53 4.5 2 BRCA1 7.3 3 VAMP1 5.4 4 FHIT 2.4
genes$name #column "name"
[1] P53 BRCA1 VAMP1 FHIT Levels: BRCA1 FHIT P53 VAMP1
my_data[row_sel, col_sel]
row_sel and col_sel can be numeric indices, logical vectors, or row/column names
genes[["name"]] ## select column w. double brackets like list
[1] P53 BRCA1 VAMP1 FHIT Levels: BRCA1 FHIT P53 VAMP1
class(genes) ## it is NOT a list though
[1] "data.frame"
str(genes)
'data.frame': 4 obs. of 3 variables: $ name : Factor w/ 4 levels "BRCA1","FHIT",..: 3 1 4 2 $ significant: logi TRUE TRUE FALSE FALSE $ expression : num 4.5 7.3 5.4 2.4
whale liver.Se tooth.Se 1 6.23 140.16 2 6.79 133.32 3 7.92 135.34 ... 19 41.23 206.30 20 45.47 141.31
whale.selenium <- read.table("data/whale_selenium.txt")
head(whale.selenium)
V1 V2 V3 1 whale liver.Se tooth.Se 2 1 6.23 140.16 3 2 6.79 133.32 4 3 7.92 135.34 5 4 8.02 127.82 6 5 9.34 108.67
whale.selenium <- read.table(
file = "data/whale_selenium.txt",
header = TRUE,
row.names = 1)
summary(whale.selenium)
liver.Se tooth.Se Min. : 6.230 Min. :108.7 1st Qu.: 9.835 1st Qu.:134.8 Median :14.905 Median :143.4 Mean :20.685 Mean :156.6 3rd Qu.:33.633 3rd Qu.:175.1 Max. :45.470 Max. :245.1
plot(
whale.selenium$liver.Se, whale.selenium$tooth.Se,
xlab = "liver Selenium", ylab = "tooth Selenium")
abline(lm(whale.selenium$tooth.Se ~ whale.selenium$liver.Se))
or, with a smoother:
scatter.smooth(
whale.selenium$liver.Se, whale.selenium$tooth.Se,
xlab = "liver Selenium", ylab = "tooth Selenium")
abline(lm(whale.selenium$tooth.Se ~ whale.selenium$liver.Se))
More advanced file reading will be dealt with in a later presentation.
names(whale.selenium) <- c("liver", "tooth")
head(whale.selenium, n=2)
liver tooth 1 6.23 140.16 2 6.79 133.32
##or
colnames(whale.selenium) <- c("brrrr", "gross")
head(whale.selenium, n=2)
brrrr gross 1 6.23 140.16 2 6.79 133.32
You can add columns to an existing dataframe
## add simulated stomach data
whale.selenium$stomach <- rnorm(nrow(whale.selenium), 42, 6)
head(whale.selenium, n=2)
liver tooth stomach 1 6.23 140.16 39.81500 2 6.79 133.32 29.05594
# or
# (use length.out so the code vector has exactly one value per row; rep(1:2, nrow(...)) would be twice as long and silently duplicate every row)
cbind(whale.selenium, "a_code" = rep(1:2, length.out = nrow(whale.selenium)))
liver tooth stomach a_code 1 6.23 140.16 39.81500 1 2 6.79 133.32 29.05594 2 3 7.92 135.34 48.58592 1 4 8.02 127.82 42.10717 2 5 9.34 108.67 41.56941 1 6 10.00 146.22 42.19763 2 7 10.57 131.18 57.02038 1 8 11.04 145.51 41.67049 2 9 12.36 163.24 34.87155 1 10 14.53 136.55 40.26866 2 11 15.28 112.63 42.58383 1 12 18.68 245.07 39.74285 2 13 22.08 140.48 40.61575 1 14 27.55 177.93 37.94400 2 15 32.83 160.73 47.22081 1 16 36.04 227.60 44.88895 2 17 37.74 177.69 36.34232 1 18 40.00 174.23 45.49380 2 19 41.23 206.30 39.41568 1 20 45.47 141.31 46.95745 2
rbind(): Adding rows is similar (continued on next slide)
myData1 <- data.frame(colA = 1:3, colB = c("a", "b", "c")); myData1
colA colB 1 1 a 2 2 b 3 3 c
myData2 <- data.frame(colA = 4:5, colB = c("d", "e")); myData2
colA colB 1 4 d 2 5 e
myDataComplete <- rbind(myData1, myData2)
myDataComplete
colA colB 1 1 a 2 2 b 3 3 c 4 4 d 5 5 e
Note that the column names of both dataframes need to match for this operation to succeed!
summary(whale.selenium) ## gives a 5-number summary of each column
liver tooth stomach Min. : 6.230 Min. :108.7 Min. :29.06 1st Qu.: 9.835 1st Qu.:134.8 1st Qu.:39.66 Median :14.905 Median :143.4 Median :41.62 Mean :20.685 Mean :156.6 Mean :41.92 3rd Qu.:33.633 3rd Qu.:175.1 3rd Qu.:45.04 Max. :45.470 Max. :245.1 Max. :57.02
dim(whale.selenium)
[1] 20 3
subset() to make both column and row selections
[ , ]
## select rows for which Solar.R is available
head(subset(airquality, subset = !is.na(Solar.R)))
Ozone Solar.R Wind Temp Month Day 1 41 190 7.4 67 5 1 2 36 118 8.0 72 5 2 3 12 149 12.6 74 5 3 4 18 313 11.5 62 5 4 7 23 299 8.6 65 5 7 8 19 99 13.8 59 5 8
subset() cont.
## select two columns only
head(subset(airquality, select = c(Ozone, Solar.R)))
Ozone Solar.R 1 41 190 2 36 118 3 12 149 4 18 313 5 NA NA 6 28 NA
subset() cont.
## combine row and column selection
head(subset(airquality, subset = !is.na(Solar.R), select = c(Ozone, Solar.R)))
Ozone Solar.R 1 41 190 2 36 118 3 12 149 4 18 313 7 23 299 8 19 99
subset() cont.
## shorthand
subset(airquality, Day == 1, select = -Temp)
Ozone Solar.R Wind Month Day 1 41 190 7.4 5 1 32 NA 286 8.6 6 1 62 135 269 4.1 7 1 93 39 83 6.9 8 1 124 96 167 6.9 9 1
subset() can be used in more sophisticated ways; see ?subset or just search online (GIYF)