Week 1
setwd('~/desktop/coursera')
data <- read.csv("coursera.csv", header=TRUE)
names(data) # Extract names of dataset
## [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day"
head(data) # First 6 rows of dataset
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
tail(data) # Last 6 rows of dataset
## Ozone Solar.R Wind Temp Month Day
## 148 14 20 16.6 63 9 25
## 149 30 193 6.9 70 9 26
## 150 NA 145 13.2 77 9 27
## 151 14 191 14.3 75 9 28
## 152 18 131 8.0 76 9 29
## 153 20 223 11.5 68 9 30
nrow(data) # Number of observations
## [1] 153
data[47,1] # Find the value of ozone in the 47th row
## [1] 21
sum(is.na(data[,1])) # Find number of missing values in Ozone column
## [1] 37
mean(na.omit(data[,1])) # Find mean of Ozone column excluding missing values
## [1] 42.12931
ozone <- subset(data, !is.na(Ozone), select = Ozone) # We will consider this approach instead
apply(ozone, 2, mean) # For a data.frame, a margin of 2 indicates columns
## Ozone
## 42.12931
# Extract the subset of rows of the data frame where Ozone values are above 31 and Temp values are above 90. What is the mean of Solar.R in this subset?
solar <- subset(data, Temp>90 & Ozone>31, select=Solar.R)
apply(solar, 2, mean)
## Solar.R
## 212.8
# Find the mean of "Temp" when "Month" is equal to 6.
temp <- subset(data, Month==6, select=Temp)
apply(temp, 2, mean)
## Temp
## 79.1
# Find the maximum ozone value in the month of May (i.e. Month = 5)
ozone2 <- subset(data, Month==5 & !is.na(Ozone), select=Ozone)
apply(ozone2, 2, max)
## Ozone
## 115
## Factors
x <- factor(c("yes", "no", "yes", "no", "no", "no"))
table(x) ## Frequency table
## x
## no yes
## 4 2
unclass(x) ## See how factors are represented in R. Stripping the class and left with integers.
## [1] 2 1 2 1 1 1
## attr(,"levels")
## [1] "no" "yes"
Missing values
## is.na() Used to test objects if they're NA
## is.nan() Used to test if they're NaN
x <- c(1,2,3,NA,10)
is.na(x)
## [1] FALSE FALSE FALSE TRUE FALSE
is.nan(x)
## [1] FALSE FALSE FALSE FALSE FALSE
y <- c(1,2,3,NaN,10)
is.na(y)
## [1] FALSE FALSE FALSE TRUE FALSE
is.nan(y)
## [1] FALSE FALSE FALSE TRUE FALSE
data.frame
z <- data.frame(Dog=c(1,2,3,4), Cat=c("a", "b", "c", "d"))
z
## Dog Cat
## 1 1 a
## 2 2 b
## 3 3 c
## 4 4 d
nrow(z)
## [1] 4
ncol(z)
## [1] 2
Names Attribute
x <- 1:3
names(x)
## NULL
names(x) <- c("a", "b", "c")
m <- matrix(1:4, nrow=2, ncol=2, byrow=TRUE)
dimnames(m) <- list(c("a", "b"), c("c", "d"))
m
## c d
## a 1 2
## b 3 4
Reading tabular data
read.csv("coursera.csv", header=TRUE, sep=",") ## If your data is separated by commas
Reading in Large Datasets
setwd('~/desktop/Ostats')
gestation <- read.table("gestation.txt", nrow=10) ## Read in data of the first 10 rows to get classes
classes <- sapply(gestation, class) ## Find classes of each column
tabAll <- read.table("gestation.txt", colClasses=classes) ## Tell R the classes. The intuition behind this is that if you can tell R the types of data that are in each column, then R won't spend extra time and memory figuring them out on its own.
## Let's say we have a data.frame with 1,500,000 rows and 120 columns. How many gigs of memory will it take up?
1500000*120*8 ## There are 8 bytes per numeric
## [1] 1.44e+09
(1500000*120*8)/(2^20) ## There are 2^20 bytes per MB
## [1] 1373.291
((1500000*120*8)/(2^20))/1000 ## There are 1000 MBs per GB
## [1] 1.373291
Subsetting
# [always returns an object of the same class as the original]
# [[used to extract elements of a list of data.frame; it can only be used to extract a single element and the class of the returned object will not necessarily be a list or data.frame]]
# $ used to extract element from list or data.frame by name
x <- c("a", "b", "c", "c", "d", "a")
x[1]
## [1] "a"
x[1:4]
## [1] "a" "b" "c" "c"
x[x > "a"] ## This will return a character vector
## [1] "b" "c" "c" "d"
u <- x > "a" ## This will return a logical vector
u
## [1] FALSE TRUE TRUE TRUE TRUE FALSE
Lists
x <- list(dog=c(1,2,3,4), cat=3.14, squirrel="Hello")
x[1] ## Returns a list ALWAYS
## $dog
## [1] 1 2 3 4
x[[1]] ## Returns a vector
## [1] 1 2 3 4
x[c(1,3)] ## Extract multiple elements, returns a list since it's [ ]
## $dog
## [1] 1 2 3 4
##
## $squirrel
## [1] "Hello"
x[[c(1,3)]] ## Returns a single element; same as below
## [1] 3
x[[1]][[3]]
## [1] 3
name <- "dog" ## Same as below
x[[name]]
## [1] 1 2 3 4
x[["dog"]]
## [1] 1 2 3 4
x$dog
## [1] 1 2 3 4
Matrices
x <- matrix(1:6, 2, 3) ## A matrix from 1 to 6 with 2 rows and 3 columns
x[1,2] ## First row 2nd column
## [1] 3
x[1,] ## First row
## [1] 1 3 5
x[,2] ## Second column
## [1] 3 4
## By default, when a single element is retrieved from a matrix, it is returned as a vector of length 1 rather than a 1x1 matrix. This can be turned off with the following:
x[1,2, drop=FALSE]
## [,1]
## [1,] 3
x[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 3 5
Removing NA
z <- c(1, 2, NA, 3, NA, 4)
bad <- is.na(z)
z[!bad] ## ! means not
## [1] 1 2 3 4
w <- c("a", "b", NA, "c", NA, "d")
good <- complete.cases(z,w) ## Tells us which combinations are not true
good ## i.e., 1 and "a" are good, 2 and "b" are good, and so forth...
## [1] TRUE TRUE FALSE TRUE FALSE TRUE
z[good]
## [1] 1 2 3 4
w[good]
## [1] "a" "b" "c" "d"
## Consider the coursera dataset
head(data)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
good <- complete.cases(data) ## The logical vector here tells me which rows are complete
data[good,][1:6,]
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
Vectorized matrix operations
x <- matrix(1:4,2,2); y<-matrix(rep(10,4),2,2) ## y is a matrix of all 10's, i.e., 4 10's
x; y
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
## [,1] [,2]
## [1,] 10 10
## [2,] 10 10
x*y ## Element-wise multiplication
## [,1] [,2]
## [1,] 10 30
## [2,] 20 40
x %*% y ## True matrix multiplication
## [,1] [,2]
## [1,] 40 40
## [2,] 60 60