The following is the code for class:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# First practice: joining
# a tibble is a type of a data frame
superheroes <- tibble::tribble(
~name, ~alignment, ~gender, ~publisher,
"Magneto", "bad", "male", "Marvel",
"Storm", "good", "female", "Marvel",
"Mystique", "bad", "female", "Marvel",
"Batman", "good", "male", "DC",
"Joker", "bad", "male", "DC",
"Catwoman", "bad", "female", "DC",
"Hellboy", "good", "male", "Dark Horse Comics"
)
publishers <- tibble::tribble(
~publisher, ~yr_founded,
"DC", 1934L,
"Marvel", 1939L,
"Image", 1992L
)
# inner join super hero and publisher
insp<-inner_join(superheroes, publishers)
## Joining, by = "publisher"
insp
## # A tibble: 6 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
# left join super hero and publisher
ljsp<-left_join(superheroes, publishers)
## Joining, by = "publisher"
ljsp
## # A tibble: 7 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
# using dplyr with the iris dataset
data(iris)
?iris
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
ggplot(iris, aes(y=Sepal.Length, x=Species, fill=Species))+
geom_boxplot()
# group by species and summarise septal length info
iris%>%
group_by(Species)%>%
summarise(n=n(),
meanSL=mean(Sepal.Length, na.rm=TRUE),
sdSL=sd(Sepal.Length, na.rm=TRUE))
## # A tibble: 3 x 4
## Species n meanSL sdSL
## <fct> <int> <dbl> <dbl>
## 1 setosa 50 5.01 0.352
## 2 versicolor 50 5.94 0.516
## 3 virginica 50 6.59 0.636
# group_by (for all variables)
iris%>%
group_by(Species)%>%
summarise(n=n(),
meanSL=mean(Sepal.Length, na.rm=TRUE),
sdSL=sd(Sepal.Length, na.rm=TRUE),
meanSW=mean(Sepal.Width, na.rm=TRUE),
sdSW=sd(Sepal.Width, na.rm=TRUE),
meanPL=mean(Petal.Length, na.rm=TRUE),
sdPL=sd(Petal.Length, na.rm=TRUE),
meanPW=mean(Petal.Width, na.rm=TRUE),
sdPW=sd(Petal.Width, na.rm=TRUE))
## # A tibble: 3 x 10
## Species n meanSL sdSL meanSW sdSW meanPL sdPL meanPW sdPW
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 50 5.01 0.352 3.43 0.379 1.46 0.174 0.246 0.105
## 2 versicolor 50 5.94 0.516 2.77 0.314 4.26 0.470 1.33 0.198
## 3 virginica 50 6.59 0.636 2.97 0.322 5.55 0.552 2.03 0.275
# filter for setosa
setosa<-iris%>%
filter(Species=="setosa")
ggplot(setosa, aes(Sepal.Length))+
geom_histogram(bins=10)
# filter for small
small<-iris%>%
filter(Sepal.Length<6)
ggplot(small, aes(Species, fill=Species))+
geom_bar()
# mutate to create ratio between sepal length and petal length
r_iris<-iris%>%
mutate(ratio=Sepal.Length/Petal.Length)
head(r_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ratio
## 1 5.1 3.5 1.4 0.2 setosa 3.642857
## 2 4.9 3.0 1.4 0.2 setosa 3.500000
## 3 4.7 3.2 1.3 0.2 setosa 3.615385
## 4 4.6 3.1 1.5 0.2 setosa 3.066667
## 5 5.0 3.6 1.4 0.2 setosa 3.571429
## 6 5.4 3.9 1.7 0.4 setosa 3.176471
ggplot(r_iris, aes(y=ratio, x=Species, fill=Species))+
geom_boxplot()
# select only certain columns
iris_trim<-iris%>%
select(Sepal.Length, Sepal.Width, Species)
head(iris_trim)
## Sepal.Length Sepal.Width Species
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
## 6 5.4 3.9 setosa
# tidyr examples
# generate stock market data
set.seed(1)
stocks <- data.frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 20, 1),
Y = rnorm(10, 20, 2),
Z = rnorm(10, 20, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
## 7 2009-01-07 20.48743 19.96762 19.37682
## 8 2009-01-08 20.73832 21.88767 14.11699
## 9 2009-01-09 20.57578 21.64244 18.08740
## 10 2009-01-10 19.69461 21.18780 21.67177
# gather
stocksG<-stocks%>%
gather(key=stock, value=price, -time )
head(stocksG)
## time stock price
## 1 2009-01-01 X 19.37355
## 2 2009-01-02 X 20.18364
## 3 2009-01-03 X 19.16437
## 4 2009-01-04 X 21.59528
## 5 2009-01-05 X 20.32951
## 6 2009-01-06 X 19.17953
ggplot(stocksG, aes(time, price, color=stock))+
geom_line()
# spread
stocksS<-stocksG%>%
spread(key=stock, value=price)
head(stocksS)
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
# lets create more fake data for dates and times
set.seed(1)
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
data
## date hour min second event
## 1 2016-01-01 7 30 29 u
## 2 2016-01-02 9 43 36 a
## 3 2016-01-03 13 58 60 l
## 4 2016-01-04 20 22 11 q
## 5 2016-01-05 5 44 47 p
## 6 2016-01-06 18 52 37 k
## 7 2016-01-07 19 12 43 r
## 8 2016-01-08 12 35 6 i
## 9 2016-01-09 11 7 38 e
## 10 2016-01-10 1 14 21 b
## 11 2016-01-11 3 20 42 w
## 12 2016-01-12 14 1 32 t
## 13 2016-01-13 23 19 52 h
## 14 2016-01-14 21 41 26 s
## 15 2016-01-15 8 16 25 o
dataUnite <- data %>%
unite(datehour, date, hour, sep = ' ') %>%
unite(datetime, datehour, min, second, sep = ':')
str(dataUnite)
## 'data.frame': 15 obs. of 2 variables:
## $ datetime: chr "2016-01-01 7:30:29" "2016-01-02 9:43:36" "2016-01-03 13:58:60" "2016-01-04 20:22:11" ...
## $ event : Factor w/ 15 levels "a","b","e","h",..: 14 1 7 10 9 6 11 5 3 2 ...
# seperate
dataSep <- dataUnite %>%
separate(datetime, c('date', 'time'), sep = ' ') %>%
separate(time, c('hour', 'min', 'second'), sep = ':')
dataSep
## date hour min second event
## 1 2016-01-01 7 30 29 u
## 2 2016-01-02 9 43 36 a
## 3 2016-01-03 13 58 60 l
## 4 2016-01-04 20 22 11 q
## 5 2016-01-05 5 44 47 p
## 6 2016-01-06 18 52 37 k
## 7 2016-01-07 19 12 43 r
## 8 2016-01-08 12 35 6 i
## 9 2016-01-09 11 7 38 e
## 10 2016-01-10 1 14 21 b
## 11 2016-01-11 3 20 42 w
## 12 2016-01-12 14 1 32 t
## 13 2016-01-13 23 19 52 h
## 14 2016-01-14 21 41 26 s
## 15 2016-01-15 8 16 25 o