March 22nd, 2018
#
.#this is an R comment
?function
into your RStudio console to search for a function.??"stringOrRegEx"
to search for any pattern among the documentation for packages in your library.#?class #??"linear model"
(7 + 14) * 12
## [1] 252
1:5 #also works in reverse
## [1] 1 2 3 4 5
sum(1:5)
## [1] 15
class
function.<-
or =
(<-
is generally preferred).num <- 3 char <- 'c' #use "" or '' to denote character class(num) #methods never called on objects - num.class() is wrong!
## [1] "numeric"
class(char) #all strings are characters in R
## [1] "character"
as.character(4)
## [1] "4"
as.numeric("4")
## [1] 4
#build an integer vector holding 1 through 5
dat <- c(1:5)
dat #prints contents of dat to console
## [1] 1 2 3 4 5
class(dat) #vector is an object, not a class!
## [1] "integer"
#two parallel vectors become the columns of a data frame
employee <- c('John Doe','Peter Gynn','Jolie Hope')
salary <- c(21000, 23400, 26800)
dat <- data.frame(employee, salary)
dat
## employee salary ## 1 John Doe 21000 ## 2 Peter Gynn 23400 ## 3 Jolie Hope 26800
class(dat) #dataframe is a class, however
## [1] "data.frame"
mtcars
dataset here.mtcars <- mtcars mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb ## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 ## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 ## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 ## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 ## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 ## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 ## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 ## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 ## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 ## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 ## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 ## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 ## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 ## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 ## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 ## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 ## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 ## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 ## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 ## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 ## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 ## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 ## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 ## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 ## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 ## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 ## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 ## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 ## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 ## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 ## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 ## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
#View(mtcars) opens up data in RStudio console
#data <- read.csv("path/to/file/data.csv")
fread
command for reading in large datasets (will be especially useful during DataFest).#setwd("your/desired/directory") #data <- data.table::fread("data.csv")
class(mtcars)
## [1] "data.frame"
dim(mtcars) #also nrow(mtcars) and ncol(mtcars)
## [1] 32 11
names(mtcars) #also rownames(mtcars) colnames(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear" ## [11] "carb"
str(mtcars)
## 'data.frame': 32 obs. of 11 variables: ## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... ## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... ## $ disp: num 160 160 108 258 360 ... ## $ hp : num 110 110 93 110 175 105 245 62 95 123 ... ## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ... ## $ wt : num 2.62 2.88 2.32 3.21 3.44 ... ## $ qsec: num 16.5 17 18.6 19.4 17 ... ## $ vs : num 0 0 1 1 0 1 0 1 1 1 ... ## $ am : num 1 1 1 0 0 0 0 0 0 0 ... ## $ gear: num 4 4 4 3 3 3 3 4 4 4 ... ## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
gender <- c("Male", "Female")
gender <- as.factor(gender) #convert to factor from character
class(gender)
## [1] "factor"
levels
command to get or change the names assigned to the different levels of your factor.levels(gender)
## [1] "Female" "Male"
#rename the factor levels in place, then show the new names
levels(gender) <- c("F","M")
levels(gender)
## [1] "F" "M"
ordered = T
in the factor
command to create an ordinal factor.rank <- c(1:5) rank <- factor(rank, levels = c(1:5), ordered = T) rank
## [1] 1 2 3 4 5 ## Levels: 1 < 2 < 3 < 4 < 5
levels(rank)
## [1] "1" "2" "3" "4" "5"
mtcars
dataset are in fact categorical, even though they are coded as numeric right now. Now, with knowledge of how factors work in R, we can change them.mtcars$cyl <- as.factor(mtcars$cyl) class(mtcars$cyl)
## [1] "factor"
summary(mtcars)
## mpg cyl disp hp drat ## Min. :10.40 4:11 Min. : 71.1 Min. : 52.0 Min. :2.760 ## 1st Qu.:15.43 6: 7 1st Qu.:120.8 1st Qu.: 96.5 1st Qu.:3.080 ## Median :19.20 8:14 Median :196.3 Median :123.0 Median :3.695 ## Mean :20.09 Mean :230.7 Mean :146.7 Mean :3.597 ## 3rd Qu.:22.80 3rd Qu.:326.0 3rd Qu.:180.0 3rd Qu.:3.920 ## Max. :33.90 Max. :472.0 Max. :335.0 Max. :4.930 ## wt qsec vs am ## Min. :1.513 Min. :14.50 Min. :0.0000 Min. :0.0000 ## 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000 1st Qu.:0.0000 ## Median :3.325 Median :17.71 Median :0.0000 Median :0.0000 ## Mean :3.217 Mean :17.85 Mean :0.4375 Mean :0.4062 ## 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000 3rd Qu.:1.0000 ## Max. :5.424 Max. :22.90 Max. :1.0000 Max. :1.0000 ## gear carb ## Min. :3.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:2.000 ## Median :4.000 Median :2.000 ## Mean :3.688 Mean :2.812 ## 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :8.000
NA
or NaN
.na <- c(4, 5, NA, 7, NaN) mean(na)
## [1] NA
is.na(na) #check for missingness
## [1] FALSE FALSE TRUE FALSE TRUE
#drop the missing entries, then check what is.na() reports for the remainder
na.not <- na.omit(na)
is.na(na.not)
## [1] FALSE FALSE FALSE
mean(na.not)
## [1] 5.333333
mtcars[2,4]
## [1] 110
mtcars[,3]
## [1] 160.0 160.0 108.0 258.0 360.0 225.0 360.0 146.7 140.8 167.6 167.6 ## [12] 275.8 275.8 275.8 472.0 460.0 440.0 78.7 75.7 71.1 120.1 318.0 ## [23] 304.0 350.0 400.0 79.0 120.3 95.1 351.0 145.0 301.0 121.0
mtcars[1:3,]
## mpg cyl disp hp drat wt qsec vs am gear carb ## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 ## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 ## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
mtcars$mpg
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 ## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 ## [29] 15.8 19.7 15.0 21.4
mean(mtcars$mpg)
## [1] 20.09062
median(mtcars$hp)
## [1] 123
sd(mtcars$disp)
## [1] 123.9387
cor(mtcars$mpg, mtcars$hp)
## [1] -0.7761684
cor.test(mtcars$mpg, mtcars$hp)
## ## Pearson's product-moment correlation ## ## data: mtcars$mpg and mtcars$hp ## t = -6.7424, df = 30, p-value = 1.788e-07 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## -0.8852686 -0.5860994 ## sample estimates: ## cor ## -0.7761684
#arithmetic on vectors is applied element by element
a <- c(1:3)
b <- c(4:6)
a + b
## [1] 5 7 9
mean(a + b)
## [1] 7
sd(a + b)
## [1] 2
sapply(1:10, function(x) x^2)
## [1] 1 4 9 16 25 36 49 64 81 100
vapply(c("a", "b", "c", "d"), function(x) x=="d", logical(1))
## a b c d ## FALSE FALSE FALSE TRUE
#columns 8:11 (vs, am, gear, carb) are recoded from numeric to factor
mtcars[,8:11] <- lapply(mtcars[,8:11], as.factor)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables: ## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... ## $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ... ## $ disp: num 160 160 108 258 360 ... ## $ hp : num 110 110 93 110 175 105 245 62 95 123 ... ## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ... ## $ wt : num 2.62 2.88 2.32 3.21 3.44 ... ## $ qsec: num 16.5 17 18.6 19.4 17 ... ## $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ... ## $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ... ## $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ... ## $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...
plot(mtcars) #pairs plot
plot(mtcars$cyl)
hist(mtcars$mpg)
plot(mtcars$mpg~mtcars$hp)
tidyverse
dplyr
and tidyr
, both part of Hadley Wickham's tidyverse
.ggplot2
, later.tidyverse
package.tidyverse
lubridate
is another package in the tidyverse
we are not covering, but could come in handy during DataFest for working with dates in R. install.packages("packageName")
.library(packageName)
command.#install.packages("tidyverse") library(tidyverse) #library(dplyr) for only data manipulation #library(tidyr) for only data cleaning
dplyr
dplyr
is a library used for data manipulation that contains six main functions - select
, filter
, mutate
, summarise
, group_by
, and arrange
.glimpse
function in dplyr
is essentially a better version of base R's str
function - use it when first loading a dataset to get a sense of its types and values.data(iris)
glimpse
glimpse(iris)
## Observations: 150 ## Variables: 5 ## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,... ## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,... ## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,... ## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,... ## $ Species <fctr> setosa, setosa, setosa, setosa, setosa, setosa, ...
select
select
to select specific columns of the data to work with.iris.petal <- select(iris, Petal.Length, Petal.Width, Species) colnames(iris.petal)
## [1] "Petal.Length" "Petal.Width" "Species"
iris.mat <- as.matrix(select(iris, -Species))
filter
filter
to select only certain rows of the dataset, based on a supplied Boolean conditional statement.iris.setosa <- filter(iris, Species == "setosa") iris.not.setosa <- filter(iris, Species != "setosa") iris.large <- filter(iris, Petal.Length > 5.5) iris.large.setosa <- filter(iris, Species=="setosa", Petal.Length > 5.5)
filter
continued%in%
function to test for equality between a vector and the specific observation in your dataset.&
), or (|
), and not (!
) are also allowed.iris.not.versicolor <- filter(iris, Species %in% c("setosa", "virginica")) iris.not.versicolor2 <- filter(iris, !(Species == "versicolor")) identical(iris.not.versicolor, iris.not.versicolor2)
## [1] TRUE
mutate
mutate
to create new variables in the dataset based on the values of your existing ones.ifelse
function takes logical condition, then what to output if the statement evaluates to true, and finally what to output if the statement is false.iris <- mutate(iris, Petal.Size = Petal.Length + Petal.Width) iris <- mutate(iris, large = ifelse(Petal.Size > 5, 1, 0)) iris <- mutate(iris, Species = factor(Species)) #modify in-place
summarise
summarise
to collapse your entire table into summary statistics.summarise(iris, avg.petalSize = mean(Petal.Size), sd.petalSize = sd(Petal.Size))
## avg.petalSize sd.petalSize ## 1 4.957333 2.507689
#piped form: each verb hands its result to the next one
iris %>%
  select(Petal.Size, Sepal.Length, Sepal.Width) %>%
  filter(Petal.Size <= 5) %>%
  summarise(mean.sepalLength = mean(Sepal.Length),
            mean.sepalWidth = mean(Sepal.Width))
##   mean.sepalLength mean.sepalWidth
## 1             5.07            3.26
#the same computation as nested calls, read from the inside out
summarise(filter(select(iris, Petal.Size, Sepal.Length, Sepal.Width),
                 Petal.Size <= 5),
          mean.sepalLength = mean(Sepal.Length),
          mean.sepalWidth = mean(Sepal.Width))
##   mean.sepalLength mean.sepalWidth
## 1             5.07            3.26
summarise
with group_by
group_by
partitions your data into groups (usually the different levels of a factor variable) then carries out operations by group.iris %>% group_by(Species) %>% summarise(n = n(), mean.petalSize = mean(Petal.Size), sd.petalSize = sd(Petal.Size))
## # A tibble: 3 x 4 ## Species n mean.petalSize sd.petalSize ## <fctr> <int> <dbl> <dbl> ## 1 setosa 50 1.708 0.2310932 ## 2 versicolor 50 5.586 0.6372806 ## 3 virginica 50 7.578 0.6911363
arrange
arrange
with summarise
to specify the ordering of your resulting table.iris %>% group_by(Species) %>% summarise(n = n(), mean.petalSize = mean(Petal.Size), sd.petalSize = sd(Petal.Size)) %>% arrange(desc(mean.petalSize))
## # A tibble: 3 x 4 ## Species n mean.petalSize sd.petalSize ## <fctr> <int> <dbl> <dbl> ## 1 virginica 50 7.578 0.6911363 ## 2 versicolor 50 5.586 0.6372806 ## 3 setosa 50 1.708 0.2310932
tidyr
dplyr
, tidyr
is used to organize data in a "tidy" way.mtcars <- mtcars %>% mutate(car = rownames(mtcars)) mtcars[,6:12]
## wt qsec vs am gear carb car ## 1 2.620 16.46 0 1 4 4 Mazda RX4 ## 2 2.875 17.02 0 1 4 4 Mazda RX4 Wag ## 3 2.320 18.61 1 1 4 1 Datsun 710 ## 4 3.215 19.44 1 0 3 1 Hornet 4 Drive ## 5 3.440 17.02 0 0 3 2 Hornet Sportabout ## 6 3.460 20.22 1 0 3 1 Valiant ## 7 3.570 15.84 0 0 3 4 Duster 360 ## 8 3.190 20.00 1 0 4 2 Merc 240D ## 9 3.150 22.90 1 0 4 2 Merc 230 ## 10 3.440 18.30 1 0 4 4 Merc 280 ## 11 3.440 18.90 1 0 4 4 Merc 280C ## 12 4.070 17.40 0 0 3 3 Merc 450SE ## 13 3.730 17.60 0 0 3 3 Merc 450SL ## 14 3.780 18.00 0 0 3 3 Merc 450SLC ## 15 5.250 17.98 0 0 3 4 Cadillac Fleetwood ## 16 5.424 17.82 0 0 3 4 Lincoln Continental ## 17 5.345 17.42 0 0 3 4 Chrysler Imperial ## 18 2.200 19.47 1 1 4 1 Fiat 128 ## 19 1.615 18.52 1 1 4 2 Honda Civic ## 20 1.835 19.90 1 1 4 1 Toyota Corolla ## 21 2.465 20.01 1 0 3 1 Toyota Corona ## 22 3.520 16.87 0 0 3 2 Dodge Challenger ## 23 3.435 17.30 0 0 3 2 AMC Javelin ## 24 3.840 15.41 0 0 3 4 Camaro Z28 ## 25 3.845 17.05 0 0 3 2 Pontiac Firebird ## 26 1.935 18.90 1 1 4 1 Fiat X1-9 ## 27 2.140 16.70 0 1 5 2 Porsche 914-2 ## 28 1.513 16.90 1 1 5 2 Lotus Europa ## 29 3.170 14.50 0 1 5 4 Ford Pantera L ## 30 2.770 15.50 0 1 5 6 Ferrari Dino ## 31 3.570 14.60 0 1 5 8 Maserati Bora ## 32 2.780 18.60 1 1 4 2 Volvo 142E
gather
gather
is used to transform your data from "wide" to "long" format.
#all equivalent - each call gathers the 11 measurement columns and keeps car as the id
mtcarsLong <- mtcars %>% gather(attribute, value, -car)
mtcarsLong2 <- mtcars %>% gather(attribute, value, mpg:carb)
#car is column 12, so the measurement columns mpg..carb are positions 1:11
mtcarsLong3 <- mtcars %>% gather(attribute, value, 1:11)
mtcarsLong
## car attribute value ## 1 Mazda RX4 mpg 21 ## 2 Mazda RX4 Wag mpg 21 ## 3 Datsun 710 mpg 22.8 ## 4 Hornet 4 Drive mpg 21.4 ## 5 Hornet Sportabout mpg 18.7 ## 6 Valiant mpg 18.1 ## 7 Duster 360 mpg 14.3 ## 8 Merc 240D mpg 24.4 ## 9 Merc 230 mpg 22.8 ## 10 Merc 280 mpg 19.2 ## 11 Merc 280C mpg 17.8 ## 12 Merc 450SE mpg 16.4 ## 13 Merc 450SL mpg 17.3 ## 14 Merc 450SLC mpg 15.2 ## 15 Cadillac Fleetwood mpg 10.4 ## 16 Lincoln Continental mpg 10.4 ## 17 Chrysler Imperial mpg 14.7 ## 18 Fiat 128 mpg 32.4 ## 19 Honda Civic mpg 30.4 ## 20 Toyota Corolla mpg 33.9 ## 21 Toyota Corona mpg 21.5 ## 22 Dodge Challenger mpg 15.5 ## 23 AMC Javelin mpg 15.2 ## 24 Camaro Z28 mpg 13.3 ## 25 Pontiac Firebird mpg 19.2 ## 26 Fiat X1-9 mpg 27.3 ## 27 Porsche 914-2 mpg 26 ## 28 Lotus Europa mpg 30.4 ## 29 Ford Pantera L mpg 15.8 ## 30 Ferrari Dino mpg 19.7 ## 31 Maserati Bora mpg 15 ## 32 Volvo 142E mpg 21.4 ## 33 Mazda RX4 cyl 6 ## 34 Mazda RX4 Wag cyl 6 ## 35 Datsun 710 cyl 4 ## 36 Hornet 4 Drive cyl 6 ## 37 Hornet Sportabout cyl 8 ## 38 Valiant cyl 6 ## 39 Duster 360 cyl 8 ## 40 Merc 240D cyl 4 ## 41 Merc 230 cyl 4 ## 42 Merc 280 cyl 6 ## 43 Merc 280C cyl 6 ## 44 Merc 450SE cyl 8 ## 45 Merc 450SL cyl 8 ## 46 Merc 450SLC cyl 8 ## 47 Cadillac Fleetwood cyl 8 ## 48 Lincoln Continental cyl 8 ## 49 Chrysler Imperial cyl 8 ## 50 Fiat 128 cyl 4 ## 51 Honda Civic cyl 4 ## 52 Toyota Corolla cyl 4 ## 53 Toyota Corona cyl 4 ## 54 Dodge Challenger cyl 8 ## 55 AMC Javelin cyl 8 ## 56 Camaro Z28 cyl 8 ## 57 Pontiac Firebird cyl 8 ## 58 Fiat X1-9 cyl 4 ## 59 Porsche 914-2 cyl 4 ## 60 Lotus Europa cyl 4 ## 61 Ford Pantera L cyl 8 ## 62 Ferrari Dino cyl 6 ## 63 Maserati Bora cyl 8 ## 64 Volvo 142E cyl 4 ## 65 Mazda RX4 disp 160 ## 66 Mazda RX4 Wag disp 160 ## 67 Datsun 710 disp 108 ## 68 Hornet 4 Drive disp 258 ## 69 Hornet Sportabout disp 360 ## 70 Valiant disp 225 ## 71 Duster 360 disp 360 ## 72 Merc 240D disp 146.7 ## 73 Merc 230 disp 140.8 ## 74 Merc 280 disp 167.6 ## 75 Merc 280C disp 167.6 ## 76 
Merc 450SE disp 275.8 ## 77 Merc 450SL disp 275.8 ## 78 Merc 450SLC disp 275.8 ## 79 Cadillac Fleetwood disp 472 ## 80 Lincoln Continental disp 460 ## 81 Chrysler Imperial disp 440 ## 82 Fiat 128 disp 78.7 ## 83 Honda Civic disp 75.7 ## 84 Toyota Corolla disp 71.1 ## 85 Toyota Corona disp 120.1 ## 86 Dodge Challenger disp 318 ## 87 AMC Javelin disp 304 ## 88 Camaro Z28 disp 350 ## 89 Pontiac Firebird disp 400 ## 90 Fiat X1-9 disp 79 ## 91 Porsche 914-2 disp 120.3 ## 92 Lotus Europa disp 95.1 ## 93 Ford Pantera L disp 351 ## 94 Ferrari Dino disp 145 ## 95 Maserati Bora disp 301 ## 96 Volvo 142E disp 121 ## 97 Mazda RX4 hp 110 ## 98 Mazda RX4 Wag hp 110 ## 99 Datsun 710 hp 93 ## 100 Hornet 4 Drive hp 110 ## 101 Hornet Sportabout hp 175 ## 102 Valiant hp 105 ## 103 Duster 360 hp 245 ## 104 Merc 240D hp 62 ## 105 Merc 230 hp 95 ## 106 Merc 280 hp 123 ## 107 Merc 280C hp 123 ## 108 Merc 450SE hp 180 ## 109 Merc 450SL hp 180 ## 110 Merc 450SLC hp 180 ## 111 Cadillac Fleetwood hp 205 ## 112 Lincoln Continental hp 215 ## 113 Chrysler Imperial hp 230 ## 114 Fiat 128 hp 66 ## 115 Honda Civic hp 52 ## 116 Toyota Corolla hp 65 ## 117 Toyota Corona hp 97 ## 118 Dodge Challenger hp 150 ## 119 AMC Javelin hp 150 ## 120 Camaro Z28 hp 245 ## 121 Pontiac Firebird hp 175 ## 122 Fiat X1-9 hp 66 ## 123 Porsche 914-2 hp 91 ## 124 Lotus Europa hp 113 ## 125 Ford Pantera L hp 264 ## 126 Ferrari Dino hp 175 ## 127 Maserati Bora hp 335 ## 128 Volvo 142E hp 109 ## 129 Mazda RX4 drat 3.9 ## 130 Mazda RX4 Wag drat 3.9 ## 131 Datsun 710 drat 3.85 ## 132 Hornet 4 Drive drat 3.08 ## 133 Hornet Sportabout drat 3.15 ## 134 Valiant drat 2.76 ## 135 Duster 360 drat 3.21 ## 136 Merc 240D drat 3.69 ## 137 Merc 230 drat 3.92 ## 138 Merc 280 drat 3.92 ## 139 Merc 280C drat 3.92 ## 140 Merc 450SE drat 3.07 ## 141 Merc 450SL drat 3.07 ## 142 Merc 450SLC drat 3.07 ## 143 Cadillac Fleetwood drat 2.93 ## 144 Lincoln Continental drat 3 ## 145 Chrysler Imperial drat 3.23 ## 146 Fiat 128 drat 4.08 ## 147 Honda 
Civic drat 4.93 ## 148 Toyota Corolla drat 4.22 ## 149 Toyota Corona drat 3.7 ## 150 Dodge Challenger drat 2.76 ## 151 AMC Javelin drat 3.15 ## 152 Camaro Z28 drat 3.73 ## 153 Pontiac Firebird drat 3.08 ## 154 Fiat X1-9 drat 4.08 ## 155 Porsche 914-2 drat 4.43 ## 156 Lotus Europa drat 3.77 ## 157 Ford Pantera L drat 4.22 ## 158 Ferrari Dino drat 3.62 ## 159 Maserati Bora drat 3.54 ## 160 Volvo 142E drat 4.11 ## 161 Mazda RX4 wt 2.62 ## 162 Mazda RX4 Wag wt 2.875 ## 163 Datsun 710 wt 2.32 ## 164 Hornet 4 Drive wt 3.215 ## 165 Hornet Sportabout wt 3.44 ## 166 Valiant wt 3.46 ## 167 Duster 360 wt 3.57 ## 168 Merc 240D wt 3.19 ## 169 Merc 230 wt 3.15 ## 170 Merc 280 wt 3.44 ## 171 Merc 280C wt 3.44 ## 172 Merc 450SE wt 4.07 ## 173 Merc 450SL wt 3.73 ## 174 Merc 450SLC wt 3.78 ## 175 Cadillac Fleetwood wt 5.25 ## 176 Lincoln Continental wt 5.424 ## 177 Chrysler Imperial wt 5.345 ## 178 Fiat 128 wt 2.2 ## 179 Honda Civic wt 1.615 ## 180 Toyota Corolla wt 1.835 ## 181 Toyota Corona wt 2.465 ## 182 Dodge Challenger wt 3.52 ## 183 AMC Javelin wt 3.435 ## 184 Camaro Z28 wt 3.84 ## 185 Pontiac Firebird wt 3.845 ## 186 Fiat X1-9 wt 1.935 ## 187 Porsche 914-2 wt 2.14 ## 188 Lotus Europa wt 1.513 ## 189 Ford Pantera L wt 3.17 ## 190 Ferrari Dino wt 2.77 ## 191 Maserati Bora wt 3.57 ## 192 Volvo 142E wt 2.78 ## 193 Mazda RX4 qsec 16.46 ## 194 Mazda RX4 Wag qsec 17.02 ## 195 Datsun 710 qsec 18.61 ## 196 Hornet 4 Drive qsec 19.44 ## 197 Hornet Sportabout qsec 17.02 ## 198 Valiant qsec 20.22 ## 199 Duster 360 qsec 15.84 ## 200 Merc 240D qsec 20 ## 201 Merc 230 qsec 22.9 ## 202 Merc 280 qsec 18.3 ## 203 Merc 280C qsec 18.9 ## 204 Merc 450SE qsec 17.4 ## 205 Merc 450SL qsec 17.6 ## 206 Merc 450SLC qsec 18 ## 207 Cadillac Fleetwood qsec 17.98 ## 208 Lincoln Continental qsec 17.82 ## 209 Chrysler Imperial qsec 17.42 ## 210 Fiat 128 qsec 19.47 ## 211 Honda Civic qsec 18.52 ## 212 Toyota Corolla qsec 19.9 ## 213 Toyota Corona qsec 20.01 ## 214 Dodge Challenger qsec 16.87 ## 215 AMC 
Javelin qsec 17.3 ## 216 Camaro Z28 qsec 15.41 ## 217 Pontiac Firebird qsec 17.05 ## 218 Fiat X1-9 qsec 18.9 ## 219 Porsche 914-2 qsec 16.7 ## 220 Lotus Europa qsec 16.9 ## 221 Ford Pantera L qsec 14.5 ## 222 Ferrari Dino qsec 15.5 ## 223 Maserati Bora qsec 14.6 ## 224 Volvo 142E qsec 18.6 ## 225 Mazda RX4 vs 0 ## 226 Mazda RX4 Wag vs 0 ## 227 Datsun 710 vs 1 ## 228 Hornet 4 Drive vs 1 ## 229 Hornet Sportabout vs 0 ## 230 Valiant vs 1 ## 231 Duster 360 vs 0 ## 232 Merc 240D vs 1 ## 233 Merc 230 vs 1 ## 234 Merc 280 vs 1 ## 235 Merc 280C vs 1 ## 236 Merc 450SE vs 0 ## 237 Merc 450SL vs 0 ## 238 Merc 450SLC vs 0 ## 239 Cadillac Fleetwood vs 0 ## 240 Lincoln Continental vs 0 ## 241 Chrysler Imperial vs 0 ## 242 Fiat 128 vs 1 ## 243 Honda Civic vs 1 ## 244 Toyota Corolla vs 1 ## 245 Toyota Corona vs 1 ## 246 Dodge Challenger vs 0 ## 247 AMC Javelin vs 0 ## 248 Camaro Z28 vs 0 ## 249 Pontiac Firebird vs 0 ## 250 Fiat X1-9 vs 1 ## 251 Porsche 914-2 vs 0 ## 252 Lotus Europa vs 1 ## 253 Ford Pantera L vs 0 ## 254 Ferrari Dino vs 0 ## 255 Maserati Bora vs 0 ## 256 Volvo 142E vs 1 ## 257 Mazda RX4 am 1 ## 258 Mazda RX4 Wag am 1 ## 259 Datsun 710 am 1 ## 260 Hornet 4 Drive am 0 ## 261 Hornet Sportabout am 0 ## 262 Valiant am 0 ## 263 Duster 360 am 0 ## 264 Merc 240D am 0 ## 265 Merc 230 am 0 ## 266 Merc 280 am 0 ## 267 Merc 280C am 0 ## 268 Merc 450SE am 0 ## 269 Merc 450SL am 0 ## 270 Merc 450SLC am 0 ## 271 Cadillac Fleetwood am 0 ## 272 Lincoln Continental am 0 ## 273 Chrysler Imperial am 0 ## 274 Fiat 128 am 1 ## 275 Honda Civic am 1 ## 276 Toyota Corolla am 1 ## 277 Toyota Corona am 0 ## 278 Dodge Challenger am 0 ## 279 AMC Javelin am 0 ## 280 Camaro Z28 am 0 ## 281 Pontiac Firebird am 0 ## 282 Fiat X1-9 am 1 ## 283 Porsche 914-2 am 1 ## 284 Lotus Europa am 1 ## 285 Ford Pantera L am 1 ## 286 Ferrari Dino am 1 ## 287 Maserati Bora am 1 ## 288 Volvo 142E am 1 ## 289 Mazda RX4 gear 4 ## 290 Mazda RX4 Wag gear 4 ## 291 Datsun 710 gear 4 ## 292 Hornet 4 Drive gear 3 ## 293 
Hornet Sportabout gear 3 ## 294 Valiant gear 3 ## 295 Duster 360 gear 3 ## 296 Merc 240D gear 4 ## 297 Merc 230 gear 4 ## 298 Merc 280 gear 4 ## 299 Merc 280C gear 4 ## 300 Merc 450SE gear 3 ## 301 Merc 450SL gear 3 ## 302 Merc 450SLC gear 3 ## 303 Cadillac Fleetwood gear 3 ## 304 Lincoln Continental gear 3 ## 305 Chrysler Imperial gear 3 ## 306 Fiat 128 gear 4 ## 307 Honda Civic gear 4 ## 308 Toyota Corolla gear 4 ## 309 Toyota Corona gear 3 ## 310 Dodge Challenger gear 3 ## 311 AMC Javelin gear 3 ## 312 Camaro Z28 gear 3 ## 313 Pontiac Firebird gear 3 ## 314 Fiat X1-9 gear 4 ## 315 Porsche 914-2 gear 5 ## 316 Lotus Europa gear 5 ## 317 Ford Pantera L gear 5 ## 318 Ferrari Dino gear 5 ## 319 Maserati Bora gear 5 ## 320 Volvo 142E gear 4 ## 321 Mazda RX4 carb 4 ## 322 Mazda RX4 Wag carb 4 ## 323 Datsun 710 carb 1 ## 324 Hornet 4 Drive carb 1 ## 325 Hornet Sportabout carb 2 ## 326 Valiant carb 1 ## 327 Duster 360 carb 4 ## 328 Merc 240D carb 2 ## 329 Merc 230 carb 2 ## 330 Merc 280 carb 4 ## 331 Merc 280C carb 4 ## 332 Merc 450SE carb 3 ## 333 Merc 450SL carb 3 ## 334 Merc 450SLC carb 3 ## 335 Cadillac Fleetwood carb 4 ## 336 Lincoln Continental carb 4 ## 337 Chrysler Imperial carb 4 ## 338 Fiat 128 carb 1 ## 339 Honda Civic carb 2 ## 340 Toyota Corolla carb 1 ## 341 Toyota Corona carb 1 ## 342 Dodge Challenger carb 2 ## 343 AMC Javelin carb 2 ## 344 Camaro Z28 carb 4 ## 345 Pontiac Firebird carb 2 ## 346 Fiat X1-9 carb 1 ## 347 Porsche 914-2 carb 2 ## 348 Lotus Europa carb 2 ## 349 Ford Pantera L carb 4 ## 350 Ferrari Dino carb 6 ## 351 Maserati Bora carb 8 ## 352 Volvo 142E carb 2
spread
spread
is the complement of gather
- it takes a key-value pair and spreads it across multiple columns, effectively transforming the data from "long" to "wide" format.mtcarsWide <- mtcarsLong %>% spread(attribute, value)
mtcarsWide #back to normal
## car am carb cyl disp drat gear hp mpg qsec vs wt ## 1 AMC Javelin 0 2 8 304 3.15 3 150 15.2 17.3 0 3.435 ## 2 Cadillac Fleetwood 0 4 8 472 2.93 3 205 10.4 17.98 0 5.25 ## 3 Camaro Z28 0 4 8 350 3.73 3 245 13.3 15.41 0 3.84 ## 4 Chrysler Imperial 0 4 8 440 3.23 3 230 14.7 17.42 0 5.345 ## 5 Datsun 710 1 1 4 108 3.85 4 93 22.8 18.61 1 2.32 ## 6 Dodge Challenger 0 2 8 318 2.76 3 150 15.5 16.87 0 3.52 ## 7 Duster 360 0 4 8 360 3.21 3 245 14.3 15.84 0 3.57 ## 8 Ferrari Dino 1 6 6 145 3.62 5 175 19.7 15.5 0 2.77 ## 9 Fiat 128 1 1 4 78.7 4.08 4 66 32.4 19.47 1 2.2 ## 10 Fiat X1-9 1 1 4 79 4.08 4 66 27.3 18.9 1 1.935 ## 11 Ford Pantera L 1 4 8 351 4.22 5 264 15.8 14.5 0 3.17 ## 12 Honda Civic 1 2 4 75.7 4.93 4 52 30.4 18.52 1 1.615 ## 13 Hornet 4 Drive 0 1 6 258 3.08 3 110 21.4 19.44 1 3.215 ## 14 Hornet Sportabout 0 2 8 360 3.15 3 175 18.7 17.02 0 3.44 ## 15 Lincoln Continental 0 4 8 460 3 3 215 10.4 17.82 0 5.424 ## 16 Lotus Europa 1 2 4 95.1 3.77 5 113 30.4 16.9 1 1.513 ## 17 Maserati Bora 1 8 8 301 3.54 5 335 15 14.6 0 3.57 ## 18 Mazda RX4 1 4 6 160 3.9 4 110 21 16.46 0 2.62 ## 19 Mazda RX4 Wag 1 4 6 160 3.9 4 110 21 17.02 0 2.875 ## 20 Merc 230 0 2 4 140.8 3.92 4 95 22.8 22.9 1 3.15 ## 21 Merc 240D 0 2 4 146.7 3.69 4 62 24.4 20 1 3.19 ## 22 Merc 280 0 4 6 167.6 3.92 4 123 19.2 18.3 1 3.44 ## 23 Merc 280C 0 4 6 167.6 3.92 4 123 17.8 18.9 1 3.44 ## 24 Merc 450SE 0 3 8 275.8 3.07 3 180 16.4 17.4 0 4.07 ## 25 Merc 450SL 0 3 8 275.8 3.07 3 180 17.3 17.6 0 3.73 ## 26 Merc 450SLC 0 3 8 275.8 3.07 3 180 15.2 18 0 3.78 ## 27 Pontiac Firebird 0 2 8 400 3.08 3 175 19.2 17.05 0 3.845 ## 28 Porsche 914-2 1 2 4 120.3 4.43 5 91 26 16.7 0 2.14 ## 29 Toyota Corolla 1 1 4 71.1 4.22 4 65 33.9 19.9 1 1.835 ## 30 Toyota Corona 0 1 4 120.1 3.7 3 97 21.5 20.01 1 2.465 ## 31 Valiant 0 1 6 225 2.76 3 105 18.1 20.22 1 3.46 ## 32 Volvo 142E 1 2 4 121 4.11 4 109 21.4 18.6 1 2.78
separate
separate
takes a single variable and splits it into two, usually based on a specified delimiting character.mtcarsSep <- mtcars %>% separate(car, c("brand", "make"), sep = " ") mtcarsSep[,12:13]
## brand make ## 1 Mazda RX4 ## 2 Mazda RX4 ## 3 Datsun 710 ## 4 Hornet 4 ## 5 Hornet Sportabout ## 6 Valiant <NA> ## 7 Duster 360 ## 8 Merc 240D ## 9 Merc 230 ## 10 Merc 280 ## 11 Merc 280C ## 12 Merc 450SE ## 13 Merc 450SL ## 14 Merc 450SLC ## 15 Cadillac Fleetwood ## 16 Lincoln Continental ## 17 Chrysler Imperial ## 18 Fiat 128 ## 19 Honda Civic ## 20 Toyota Corolla ## 21 Toyota Corona ## 22 Dodge Challenger ## 23 AMC Javelin ## 24 Camaro Z28 ## 25 Pontiac Firebird ## 26 Fiat X1-9 ## 27 Porsche 914-2 ## 28 Lotus Europa ## 29 Ford Pantera ## 30 Ferrari Dino ## 31 Maserati Bora ## 32 Volvo 142E
unite
unite
is the complement to separate
and instead combines two columns into one in a similar fashion.mtcarsUnite <- mtcarsSep %>% unite(brand, make, col = car, sep = " ") mtcarsUnite[,12]
## [1] "Mazda RX4" "Mazda RX4" "Datsun 710" ## [4] "Hornet 4" "Hornet Sportabout" "Valiant NA" ## [7] "Duster 360" "Merc 240D" "Merc 230" ## [10] "Merc 280" "Merc 280C" "Merc 450SE" ## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood" ## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128" ## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona" ## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28" ## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2" ## [28] "Lotus Europa" "Ford Pantera" "Ferrari Dino" ## [31] "Maserati Bora" "Volvo 142E"
#some mock data - rep function used to
#repeat supplied object given number of times
df1 <- data.frame(CustomerId = c(1:6),
                  Product = c(rep("Toaster", 3), rep("Radio", 3)))
#only customers 2, 4 and 6 appear in the second table
df2 <- data.frame(CustomerId = c(2, 4, 6),
                  State = c(rep("Alabama", 2), rep("Ohio", 1)))
df1
## CustomerId Product ## 1 1 Toaster ## 2 2 Toaster ## 3 3 Toaster ## 4 4 Radio ## 5 5 Radio ## 6 6 Radio
df2
## CustomerId State ## 1 2 Alabama ## 2 4 Alabama ## 3 6 Ohio
merge
function.by
argument (takes multiple inputs as well).merge(df1, df2, by="CustomerId")
## CustomerId Product State ## 1 2 Toaster Alabama ## 2 4 Radio Alabama ## 3 6 Radio Ohio
dplyr
dplyr
provides more robust options for joining inspired by SQL (these also take multiple keys, like merge
).merge
can accomplish many similar joins if the parameters are specified correctly.merge
perform by default? inner_join
returns all rows where the value of the key in df1
matches the one in df2
. Notice how this omits certain values that didn't have a match in df2
.inner_join(df1, df2, by = "CustomerId")
## CustomerId Product State ## 1 2 Toaster Alabama ## 2 4 Radio Alabama ## 3 6 Radio Ohio
left_join
returns all rows in df1
, as well as any values in df2
that have a match.df2
that don't have a match will return NA
.left_join(df1, df2, by = "CustomerId")
## CustomerId Product State ## 1 1 Toaster <NA> ## 2 2 Toaster Alabama ## 3 3 Toaster <NA> ## 4 4 Radio Alabama ## 5 5 Radio <NA> ## 6 6 Radio Ohio
right_join
returns all rows in df2
, as well as any values in df1
that have a match.df1
that don't have a match will return NA
.inner_join
. Can you see why?right_join(df1, df2, by = "CustomerId")
## CustomerId Product State ## 1 2 Toaster Alabama ## 2 4 Radio Alabama ## 3 6 Radio Ohio
full_join
returns all rows in df1
and df2
.NA
.full_join(df1, df2, by = "CustomerId")
## CustomerId Product State ## 1 1 Toaster <NA> ## 2 2 Toaster Alabama ## 3 3 Toaster <NA> ## 4 4 Radio Alabama ## 5 5 Radio <NA> ## 6 6 Radio Ohio
rbind
#rbind stacks the two vectors as the rows of a matrix, named after the variables
a <- c("A","B","C")
b <- c("D", "E", "F")
ab <- rbind(a,b)
ab
## [,1] [,2] [,3] ## a "A" "B" "C" ## b "D" "E" "F"
ggplot2
ggplot2
.qplot
qplot
(meaning quick plot) is an easy way to create nice-looking graphs through the ggplot2
package.qplot(var1, var2, data = myData, aesthetic = var3)
qplot
exampleqplot(Sepal.Length, Sepal.Width, data=iris)
qplot(Sepal.Length, Sepal.Width, data=iris, col = Species)
qplot(Sepal.Length, Sepal.Width, data=iris, shape = Species)
qplot(Sepal.Length, Sepal.Width, data=iris, size = Petal.Width)
ggplot2
ggplot
to specify the dataset and any axis/parameter values, then add additional geoms as necessary.geom_point
is used for dot plots.p <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) + geom_point()
p
+
operator.p + aes(col = Species)
geom_smooth
adds a regression line.p + geom_smooth()
p + geom_smooth() + aes(col = Species)
ggplot(iris, aes(x = Species, y = Sepal.Length)) + geom_boxplot()
p <- ggplot(iris, aes(x = Sepal.Width)) + geom_histogram() p
p + aes(fill = Species)
ggplot2
can be combined with piping to first create a subset of interest from your dataframe, then creating a plot based on that.p <- iris %>% mutate(Sepal.Size = Sepal.Length + Sepal.Width) %>% filter(Sepal.Size > 8) %>% ggplot(aes(x = Petal.Length, y = Sepal.Size, shape = Species, col = Species)) + geom_point() + geom_smooth(method = "lm", se = F) + theme_bw()
p
p <- iris %>% ggplot(aes(x = Petal.Length, fill = Species)) + geom_density(alpha = 0.75)
p
p <- iris %>% filter(Petal.Size > 5) %>% ggplot(aes(x = Species, fill = Species)) + geom_bar()
p
p <- iris %>% ggplot(aes(x = Petal.Length, y = Petal.Width, alpha = Species)) + geom_point() + facet_grid(Species~.)
p
ggpairs
function offers a nice way to visualize relationships in your data at a larger scale.gather
in tidyr
.library(GGally) p1 <- ggpairs(iris) p2 <- ggpairs(iris, 2:4) p3 <- ggpairs(iris, columns = c("Species", "Sepal.Length", "Sepal.Width"))
p1
p2
p3
p <- ggplot(iris, aes(x = Petal.Length, y = Sepal.Length)) + geom_point() + geom_smooth(se = F, method="lm")
p
lm
command to fit linear regression in R.mod <- lm(Petal.Length~Sepal.Length, data=iris)
summary(mod)
## ## Call: ## lm(formula = Petal.Length ~ Sepal.Length, data = iris) ## ## Residuals: ## Min 1Q Median 3Q Max ## -2.47747 -0.59072 -0.00668 0.60484 2.49512 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -7.10144 0.50666 -14.02 <2e-16 *** ## Sepal.Length 1.85843 0.08586 21.65 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.8678 on 148 degrees of freedom ## Multiple R-squared: 0.76, Adjusted R-squared: 0.7583 ## F-statistic: 468.6 on 1 and 148 DF, p-value: < 2.2e-16
plot(mod, 1)
plot(mod, 2) #our model does pretty well!
#hold out 20% of the rows as a test set; the remaining 80% train the model
data.size <- nrow(iris)
train.size <- 0.80
#sample() draws at random - call set.seed() beforehand for a reproducible split
train.row.nums <- sample(seq_len(data.size), data.size * train.size, replace = FALSE)
train.data <- iris[train.row.nums, ]
test.row.nums <- setdiff(seq_len(data.size), train.row.nums)
test.data <- iris[test.row.nums, ]

#fit on the training rows only, then compare in-sample and out-of-sample error
fit <- lm(Petal.Length ~ Sepal.Length, data = train.data)
mse.train <- mean(fit$residuals^2)
#ask predict() for point predictions only: with an interval= argument it returns
#a matrix (fit, lwr, upr), and averaging squared differences over all three
#columns would inflate the test error
fit.predicted <- predict(fit, newdata = test.data)
mse.test <- mean((test.data$Petal.Length - fit.predicted)^2)
#NOTE(review): any previously recorded mse.test output likely reflects the
#inflated matrix-based value - re-run to refresh
mse.train
## [1] 0.7329038
mse.test
## [1] 2.766055
glm
function to access a wide range of generalized linear models (where errors are not assumed to be normally distributed), which includes tools for predicting categorical variables, such as logistic regression.caret
, e1071
, and randomForest
contain more machine-learning focused methods like Naive Bayes classifiers and random forest models, as well as tools for evaluating them.tidyverse
for data manipulation and plotting before covering simple linear regression to highlight R's modeling capabilities.library(help = "datasets")
into your RStudio console to get a list of R's pre-included datasets to practice on.