Data Wrangling

The following is the code for class:

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# First practice: joining 

# a tibble is a type of a data frame

superheroes <- tibble::tribble(
  ~name, ~alignment,  ~gender,          ~publisher,
  "Magneto",      "bad",   "male",            "Marvel",
  "Storm",     "good", "female",            "Marvel",
  "Mystique",      "bad", "female",            "Marvel",
  "Batman",     "good",   "male",                "DC",
  "Joker",      "bad",   "male",                "DC",
  "Catwoman",      "bad", "female",                "DC",
  "Hellboy",     "good",   "male", "Dark Horse Comics"
)

publishers <- tibble::tribble(
  ~publisher, ~yr_founded,
  "DC",       1934L,
  "Marvel",       1939L,
  "Image",       1992L
)

# inner join super hero and publisher
insp<-inner_join(superheroes, publishers)
## Joining, by = "publisher"
insp
## # A tibble: 6 x 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <int>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934
# left join super hero and publisher 
ljsp<-left_join(superheroes, publishers)
## Joining, by = "publisher"
ljsp
## # A tibble: 7 x 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <int>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
# using dplyr with the iris dataset
data(iris)
?iris

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
ggplot(iris, aes(y=Sepal.Length, x=Species, fill=Species))+
  geom_boxplot()

# group by species and summarise septal length info
iris%>%
  group_by(Species)%>%
  summarise(n=n(),
            meanSL=mean(Sepal.Length, na.rm=TRUE),
            sdSL=sd(Sepal.Length, na.rm=TRUE))
## # A tibble: 3 x 4
##   Species        n meanSL  sdSL
##   <fct>      <int>  <dbl> <dbl>
## 1 setosa        50   5.01 0.352
## 2 versicolor    50   5.94 0.516
## 3 virginica     50   6.59 0.636
# group_by (for all variables)
iris%>%
  group_by(Species)%>%
  summarise(n=n(),
            meanSL=mean(Sepal.Length, na.rm=TRUE),
            sdSL=sd(Sepal.Length, na.rm=TRUE),
            meanSW=mean(Sepal.Width, na.rm=TRUE),
            sdSW=sd(Sepal.Width, na.rm=TRUE),
            meanPL=mean(Petal.Length, na.rm=TRUE),
            sdPL=sd(Petal.Length, na.rm=TRUE),
            meanPW=mean(Petal.Width, na.rm=TRUE),
            sdPW=sd(Petal.Width, na.rm=TRUE))
## # A tibble: 3 x 10
##   Species        n meanSL  sdSL meanSW  sdSW meanPL  sdPL meanPW  sdPW
##   <fct>      <int>  <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl>
## 1 setosa        50   5.01 0.352   3.43 0.379   1.46 0.174  0.246 0.105
## 2 versicolor    50   5.94 0.516   2.77 0.314   4.26 0.470  1.33  0.198
## 3 virginica     50   6.59 0.636   2.97 0.322   5.55 0.552  2.03  0.275
# filter for setosa 
setosa<-iris%>%
  filter(Species=="setosa")

ggplot(setosa, aes(Sepal.Length))+
  geom_histogram(bins=10)

# filter for small
small<-iris%>%
  filter(Sepal.Length<6)

ggplot(small, aes(Species, fill=Species))+
  geom_bar()

# mutate to create ratio between sepal length and petal length
r_iris<-iris%>%
  mutate(ratio=Sepal.Length/Petal.Length)

head(r_iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species    ratio
## 1          5.1         3.5          1.4         0.2  setosa 3.642857
## 2          4.9         3.0          1.4         0.2  setosa 3.500000
## 3          4.7         3.2          1.3         0.2  setosa 3.615385
## 4          4.6         3.1          1.5         0.2  setosa 3.066667
## 5          5.0         3.6          1.4         0.2  setosa 3.571429
## 6          5.4         3.9          1.7         0.4  setosa 3.176471
ggplot(r_iris, aes(y=ratio, x=Species, fill=Species))+
  geom_boxplot()

# select only certain columns 
iris_trim<-iris%>%
  select(Sepal.Length, Sepal.Width, Species)

head(iris_trim)
##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa
# tidyr examples 

# generate stock market data
set.seed(1)
stocks <- data.frame(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(10, 20, 1),
  Y = rnorm(10, 20, 2),
  Z = rnorm(10, 20, 4)
)
stocks 
##          time        X        Y        Z
## 1  2009-01-01 19.37355 23.02356 23.67591
## 2  2009-01-02 20.18364 20.77969 23.12855
## 3  2009-01-03 19.16437 18.75752 20.29826
## 4  2009-01-04 21.59528 15.57060 12.04259
## 5  2009-01-05 20.32951 22.24986 22.47930
## 6  2009-01-06 19.17953 19.91013 19.77549
## 7  2009-01-07 20.48743 19.96762 19.37682
## 8  2009-01-08 20.73832 21.88767 14.11699
## 9  2009-01-09 20.57578 21.64244 18.08740
## 10 2009-01-10 19.69461 21.18780 21.67177
# gather 
stocksG<-stocks%>%
  gather(key=stock, value=price, -time )

head(stocksG)
##         time stock    price
## 1 2009-01-01     X 19.37355
## 2 2009-01-02     X 20.18364
## 3 2009-01-03     X 19.16437
## 4 2009-01-04     X 21.59528
## 5 2009-01-05     X 20.32951
## 6 2009-01-06     X 19.17953
ggplot(stocksG, aes(time, price, color=stock))+
  geom_line()

# spread
stocksS<-stocksG%>%
  spread(key=stock, value=price)

head(stocksS)
##         time        X        Y        Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
# lets create more fake data for dates and times
set.seed(1)
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
data
##          date hour min second event
## 1  2016-01-01    7  30     29     u
## 2  2016-01-02    9  43     36     a
## 3  2016-01-03   13  58     60     l
## 4  2016-01-04   20  22     11     q
## 5  2016-01-05    5  44     47     p
## 6  2016-01-06   18  52     37     k
## 7  2016-01-07   19  12     43     r
## 8  2016-01-08   12  35      6     i
## 9  2016-01-09   11   7     38     e
## 10 2016-01-10    1  14     21     b
## 11 2016-01-11    3  20     42     w
## 12 2016-01-12   14   1     32     t
## 13 2016-01-13   23  19     52     h
## 14 2016-01-14   21  41     26     s
## 15 2016-01-15    8  16     25     o
dataUnite <- data %>%
  unite(datehour, date, hour, sep = ' ') %>%
  unite(datetime, datehour, min, second, sep = ':')

str(dataUnite)
## 'data.frame':    15 obs. of  2 variables:
##  $ datetime: chr  "2016-01-01 7:30:29" "2016-01-02 9:43:36" "2016-01-03 13:58:60" "2016-01-04 20:22:11" ...
##  $ event   : Factor w/ 15 levels "a","b","e","h",..: 14 1 7 10 9 6 11 5 3 2 ...
# seperate 
dataSep <- dataUnite %>% 
  separate(datetime, c('date', 'time'), sep = ' ') %>% 
  separate(time, c('hour', 'min', 'second'), sep = ':')

dataSep
##          date hour min second event
## 1  2016-01-01    7  30     29     u
## 2  2016-01-02    9  43     36     a
## 3  2016-01-03   13  58     60     l
## 4  2016-01-04   20  22     11     q
## 5  2016-01-05    5  44     47     p
## 6  2016-01-06   18  52     37     k
## 7  2016-01-07   19  12     43     r
## 8  2016-01-08   12  35      6     i
## 9  2016-01-09   11   7     38     e
## 10 2016-01-10    1  14     21     b
## 11 2016-01-11    3  20     42     w
## 12 2016-01-12   14   1     32     t
## 13 2016-01-13   23  19     52     h
## 14 2016-01-14   21  41     26     s
## 15 2016-01-15    8  16     25     o