Paquete dplyr -> manipulacion de datos Argumentos: set de datos, ¿que hacer con el set?. Resultado: set de datos modificado
mutate() adds new variables that are functions of existing variables.
select() picks variables based on their names.
filter() picks cases based on their values.
summarise() reduces multiple values down to a single summary.
arrange() changes the ordering of the rows.
Picks cases based on their values.(set data, criteria)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data("mtcars")
mt4.carbs <- filter(mtcars, carb >= 4) #cars whit 4 or more carburetors
mt4.carbs
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
mt4.carbs.aut.trans <- filter(mtcars, carb >= 4, am == 0)
#cars whit 4 or more carburetors and automatic transmission
mt4.carbs.aut.trans
## mpg cyl disp hp drat wt qsec vs am gear carb
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
disp.l250.or.mpg.m20 <- filter(mtcars, disp < 250|mpg > 20)
#cars whit diplacement less than 250 or mpg more than 20
disp.l250.or.mpg.m20
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
max.mpg <- filter(mtcars, mpg == max(mpg))#car whit mpg max
max.mpg
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
data("airquality")
no.miss.solarrad <- filter(airquality, !is.na(Solar.R))
head(no.miss.solarrad)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 23 299 8.6 65 5 7
## 6 19 99 13.8 59 5 8
airqual.no.na <- filter(airquality[1:10,],complete.cases(airquality[1:10,]))
airqual.no.na
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 23 299 8.6 65 5 7
## 6 19 99 13.8 59 5 8
## 7 8 19 20.1 61 5 9
data("iris")
table(iris$Species) #counts of species
##
## setosa versicolor virginica
## 50 50 50
iris.two.species <- filter(iris, Species %in% c("setosa", "virginica"))
head(iris.two.species)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
table(iris.two.species$Species)
##
## setosa versicolor virginica
## 50 0 50
nrow(iris)
## [1] 150
nrow(iris.two.species)
## [1] 100
airqual.3c <- filter(airquality, Ozone < 20) [,1:3]
head((airqual.3c))
## Ozone Solar.R Wind
## 1 12 149 12.6
## 2 18 313 11.5
## 3 19 99 13.8
## 4 8 19 20.1
## 5 7 NA 6.9
## 6 16 256 9.7
#counting of rows based on number of gears (group_by argument)
table(mtcars$gear) #Table: tablas de frecuencia
##
## 3 4 5
## 15 12 5
more.frequent.no.gears <- mtcars %>%
group_by(gear)%>%
filter(n()>10)
table(more.frequent.no.gears$gear)
##
## 3 4
## 15 12
#example group_by
mtcars%>%
group_by(gear)%>%
summarise(mean=mean(mpg), sum=sum(mpg),n=n())
## # A tibble: 3 x 4
## gear mean sum n
## <dbl> <dbl> <dbl> <int>
## 1 3 16.1 242. 15
## 2 4 24.5 294. 12
## 3 5 21.4 107. 5
more.frequent.no.gears.horsepower <- mtcars %>%
group_by(gear)%>%
filter(n()>10, hp < 105)
table(more.frequent.no.gears.horsepower$gear)
##
## 3 4
## 1 7
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
iris.startswhitS <- iris %>%
## dplyr::select(starts_with("S"))
select(starts_with("S"))
head(iris.startswhitS)
## Sepal.Length Sepal.Width Species
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
## 6 5.4 3.9 setosa
new.mtcars <- mtcars %>%
filter_at(vars(cyl, hp),
all_vars(. == max(.)))
new.mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
#vars -> equivalent semantics to "select" (vector of column names) PREGUNTAR
nmtcars <- mtcars %>%
summarise(across(c(cyl,hp), max))
nmtcars
## cyl hp
## 1 8 335
head(msleep)
## # A tibble: 6 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl mo~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mounta~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Greate~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three-~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
head(arrange(msleep, name))
## # A tibble: 6 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Africa~ Loxo~ herbi Prob~ vu 3.3 NA NA 20.7
## 2 Africa~ Cric~ omni Rode~ <NA> 8.3 2 NA 15.7
## 3 Africa~ Rhab~ omni Rode~ <NA> 8.7 NA NA 15.3
## 4 Arctic~ Vulp~ carni Carn~ <NA> 12.5 NA NA 11.5
## 5 Arctic~ Sper~ herbi Rode~ lc 16.6 NA NA 7.4
## 6 Asian ~ Elep~ herbi Prob~ en 3.9 NA NA 20.1
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
animal.name.sequence <- arrange(msleep, vore, order)
animal.name.sequence [1:10,1:4]
## # A tibble: 10 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Cheetah Acinonyx carni Carnivora
## 2 Northern fur seal Callorhinus carni Carnivora
## 3 Dog Canis carni Carnivora
## 4 Domestic cat Felis carni Carnivora
## 5 Gray seal Haliochoerus carni Carnivora
## 6 Tiger Panthera carni Carnivora
## 7 Jaguar Panthera carni Carnivora
## 8 Lion Panthera carni Carnivora
## 9 Caspian seal Phoca carni Carnivora
## 10 Genet Genetta carni Carnivora
animal.name.sequence.desc <- arrange(msleep, vore, desc(order))
animal.name.sequence.desc[1:10,1:4]
## # A tibble: 10 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Northern grasshopper mouse Onychomys carni Rodentia
## 2 Slow loris Nyctibeus carni Primates
## 3 Thick-tailed opposum Lutreolina carni Didelphimorphia
## 4 Long-nosed armadillo Dasypus carni Cingulata
## 5 Pilot whale Globicephalus carni Cetacea
## 6 Common porpoise Phocoena carni Cetacea
## 7 Bottle-nosed dolphin Tursiops carni Cetacea
## 8 Cheetah Acinonyx carni Carnivora
## 9 Northern fur seal Callorhinus carni Carnivora
## 10 Dog Canis carni Carnivora
names(msleep)
## [1] "name" "genus" "vore" "order" "conservation"
## [6] "sleep_total" "sleep_rem" "sleep_cycle" "awake" "brainwt"
## [11] "bodywt"
renamed.msleep <- rename(msleep, habito_alimento = vore, conservation_status = conservation)
names(renamed.msleep)
## [1] "name" "genus" "habito_alimento"
## [4] "order" "conservation_status" "sleep_total"
## [7] "sleep_rem" "sleep_cycle" "awake"
## [10] "brainwt" "bodywt"
data("ChickWeight")
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
ChickWeight.with.log <- mutate(ChickWeight, log.of.weight= log10(weight))
head(ChickWeight.with.log)
## weight Time Chick Diet log.of.weight
## 1 42 0 1 1 1.623249
## 2 51 2 1 1 1.707570
## 3 59 4 1 1 1.770852
## 4 64 6 1 1 1.806180
## 5 76 8 1 1 1.880814
## 6 93 10 1 1 1.968483
names(msleep)
## [1] "name" "genus" "vore" "order" "conservation"
## [6] "sleep_total" "sleep_rem" "sleep_cycle" "awake" "brainwt"
## [11] "bodywt"
msleep.raiz.sqrt <- mutate_all(msleep[,6:11],list(sqrt_root= sqrt))
#columns names changed "_sqrt_root"
#mutate_all(msleep[,6:11],funs("square.root"=sqrt(.)))
#funs () was deprecated, use: list (name = function)
names(msleep.raiz.sqrt)
## [1] "sleep_total" "sleep_rem" "sleep_cycle"
## [4] "awake" "brainwt" "bodywt"
## [7] "sleep_total_sqrt_root" "sleep_rem_sqrt_root" "sleep_cycle_sqrt_root"
## [10] "awake_sqrt_root" "brainwt_sqrt_root" "bodywt_sqrt_root"
data("Titanic")
Titanic <- as.data.frame(Titanic) #converto to data frame
head(Titanic)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
titanic.ranks<- mutate_at(Titanic, vars(Class,Age,Survived), list(Rank=min_rank))
titanic.with.ranks <- mutate_at(Titanic, vars(Class,Age,Survived),
funs(Rank = min_rank(desc(.))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
head(titanic.ranks,10)
## Class Sex Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1 1st Male Child No 0 1 1 1
## 2 2nd Male Child No 0 9 1 1
## 3 3rd Male Child No 35 17 1 1
## 4 Crew Male Child No 0 25 1 1
## 5 1st Female Child No 0 1 1 1
## 6 2nd Female Child No 0 9 1 1
## 7 3rd Female Child No 17 17 1 1
## 8 Crew Female Child No 0 25 1 1
## 9 1st Male Adult No 118 1 17 1
## 10 2nd Male Adult No 154 9 17 1
head(titanic.with.ranks,10)
## Class Sex Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1 1st Male Child No 0 25 17 17
## 2 2nd Male Child No 0 17 17 17
## 3 3rd Male Child No 35 9 17 17
## 4 Crew Male Child No 0 1 17 17
## 5 1st Female Child No 0 25 17 17
## 6 2nd Female Child No 0 17 17 17
## 7 3rd Female Child No 17 9 17 17
## 8 Crew Female Child No 0 1 17 17
## 9 1st Male Adult No 118 25 1 17
## 10 2nd Male Adult No 154 17 1 17
#Example 1
divide.by.10 <- function(a.number)(a.number/10)
head(CO2)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
new.df <- CO2 %>%
mutate_if(is.numeric, divide.by.10)
head(new.df)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 9.5 1.60
## 2 Qn1 Quebec nonchilled 17.5 3.04
## 3 Qn1 Quebec nonchilled 25.0 3.48
## 4 Qn1 Quebec nonchilled 35.0 3.72
## 5 Qn1 Quebec nonchilled 50.0 3.53
## 6 Qn1 Quebec nonchilled 67.5 3.92
#Example 2
#Reemplazar NAs por 0
df <- data.frame(
alpha = c(22, 1, NA),
almond = c(0, 5, 10),
grape = c(0, 2, 2),
apple = c(NA, 5, 10))
df
## alpha almond grape apple
## 1 22 0 0 NA
## 2 1 5 2 5
## 3 NA 10 2 10
df.fix.alpha <- df %>% mutate_if(is.numeric, coalesce, ... = 0)
df.fix.alpha
## alpha almond grape apple
## 1 22 0 0 0
## 2 1 5 2 5
## 3 0 10 2 10
table(msleep$vore)
##
## carni herbi insecti omni
## 19 32 5 20
msleep.sin.c.o.a <- filter(msleep, !str_detect(vore, paste(c("c","h"), collapse = "|")))
head(msleep.sin.c.o.a)
## # A tibble: 6 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Owl mo~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 2 Greate~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 3 Grivet Cerc~ omni Prim~ lc 10 0.7 NA 14
## 4 Star-n~ Cond~ omni Sori~ lc 10.3 2.2 NA 13.7
## 5 Africa~ Cric~ omni Rode~ <NA> 8.3 2 NA 15.7
## 6 Lesser~ Cryp~ omni Sori~ lc 9.1 1.4 0.15 14.9
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
table(msleep.sin.c.o.a$vore)
##
## omni
## 20
msleep.with.dup.indicator <- mutate(msleep, duplicate.indicator= duplicated(conservation))
head(msleep.with.dup.indicator, 20)
## # A tibble: 20 x 12
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet~ Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl m~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mount~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Great~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## 7 North~ Call~ carni Carn~ vu 8.7 1.4 0.383 15.3
## 8 Vespe~ Calo~ <NA> Rode~ <NA> 7 NA NA 17
## 9 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 10 Roe d~ Capr~ herbi Arti~ lc 3 NA NA 21
## 11 Goat Capri herbi Arti~ lc 5.3 0.6 NA 18.7
## 12 Guine~ Cavis herbi Rode~ domesticated 9.4 0.8 0.217 14.6
## 13 Grivet Cerc~ omni Prim~ lc 10 0.7 NA 14
## 14 Chinc~ Chin~ herbi Rode~ domesticated 12.5 1.5 0.117 11.5
## 15 Star-~ Cond~ omni Sori~ lc 10.3 2.2 NA 13.7
## 16 Afric~ Cric~ omni Rode~ <NA> 8.3 2 NA 15.7
## 17 Lesse~ Cryp~ omni Sori~ lc 9.1 1.4 0.15 14.9
## 18 Long-~ Dasy~ carni Cing~ lc 17.4 3.1 0.383 6.6
## 19 Tree ~ Dend~ herbi Hyra~ lc 5.3 0.5 NA 18.7
## 20 North~ Dide~ omni Dide~ lc 18 4.9 0.333 6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## # duplicate.indicator <lgl>
#duplicated.indicator = FLASE/TRUE
msleep.with.dup.indicator[1:6,c(1,5,12)]
## # A tibble: 6 x 3
## name conservation duplicate.indicator
## <chr> <chr> <lgl>
## 1 Cheetah lc FALSE
## 2 Owl monkey <NA> FALSE
## 3 Mountain beaver nt FALSE
## 4 Greater short-tailed shrew lc TRUE
## 5 Cow domesticated FALSE
## 6 Three-toed sloth <NA> TRUE
#conservation (major key), genus (minor,key)
msleep.with.dup.indicator2 <- mutate(msleep,duplicate.indicator = duplicated(conservation)) %>%
arrange(conservation, genus)
msleep.with.dup.indicator2 [1:20, c(1,2,5,12)]
## # A tibble: 20 x 4
## name genus conservation duplicate.indicator
## <chr> <chr> <chr> <lgl>
## 1 Giraffe Giraffa cd FALSE
## 2 Pilot whale Globicephalus cd TRUE
## 3 Cow Bos domesticated FALSE
## 4 Dog Canis domesticated TRUE
## 5 Guinea pig Cavis domesticated TRUE
## 6 Chinchilla Chinchilla domesticated TRUE
## 7 Horse Equus domesticated TRUE
## 8 Donkey Equus domesticated TRUE
## 9 Domestic cat Felis domesticated TRUE
## 10 Rabbit Oryctolagus domesticated TRUE
## 11 Sheep Ovis domesticated TRUE
## 12 Pig Sus domesticated TRUE
## 13 Asian elephant Elephas en FALSE
## 14 Golden hamster Mesocricetus en TRUE
## 15 Tiger Panthera en TRUE
## 16 Giant armadillo Priodontes en TRUE
## 17 Cheetah Acinonyx lc FALSE
## 18 Greater short-tailed shrew Blarina lc TRUE
## 19 Roe deer Capreolus lc TRUE
## 20 Goat Capri lc TRUE
msleepdup <- msleep %>%
mutate(dup.indicator.gen = duplicated(genus), du.indicator.cons = duplicated(conservation))%>%
arrange(conservation, genus)
msleepdup [1:10,c(1,2,5,12,13)]
## # A tibble: 10 x 5
## name genus conservation dup.indicator.gen du.indicator.cons
## <chr> <chr> <chr> <lgl> <lgl>
## 1 Giraffe Giraffa cd FALSE FALSE
## 2 Pilot whale Globicephalus cd FALSE TRUE
## 3 Cow Bos domesticated FALSE FALSE
## 4 Dog Canis domesticated FALSE TRUE
## 5 Guinea pig Cavis domesticated FALSE TRUE
## 6 Chinchilla Chinchilla domesticated FALSE TRUE
## 7 Horse Equus domesticated FALSE TRUE
## 8 Donkey Equus domesticated TRUE TRUE
## 9 Domestic cat Felis domesticated FALSE TRUE
## 10 Rabbit Oryctolagus domesticated FALSE TRUE
###PREGUNTAR
msleepdup2 <- msleep%>%
mutate(across(c(genus,conservation), duplicated))%>%
arrange(conservation, genus)
msleepdup2
## # A tibble: 83 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <lgl> <chr> <chr> <lgl> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet~ FALSE carni Carn~ FALSE 12.1 NA NA 11.9
## 2 Owl m~ FALSE omni Prim~ FALSE 17 1.8 NA 7
## 3 Mount~ FALSE herbi Rode~ FALSE 14.4 2.4 NA 9.6
## 4 Cow FALSE herbi Arti~ FALSE 4 0.7 0.667 20
## 5 North~ FALSE carni Carn~ FALSE 8.7 1.4 0.383 15.3
## 6 Asian~ FALSE herbi Prob~ FALSE 3.9 NA NA 20.1
## 7 Giraf~ FALSE herbi Arti~ FALSE 1.9 0.4 NA 22.1
## 8 Great~ FALSE omni Sori~ TRUE 14.9 2.3 0.133 9.1
## 9 Three~ FALSE herbi Pilo~ TRUE 14.4 2.2 0.767 9.6
## 10 Vespe~ FALSE <NA> Rode~ TRUE 7 NA NA 17
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
w <- c(2,2,2,4,5,6)
df <- data.frame(fruit,x,y,z,w)
df
## fruit x y z w
## 1 apple 1 22 3 2
## 2 pear 2 3 1 2
## 3 orange 4 4 4 2
## 4 grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6 9 8 6
df.show.dup <- df %>%
mutate(duplicate.indicator = duplicated(fruit))
df.show.dup
## fruit x y z w duplicate.indicator
## 1 apple 1 22 3 2 FALSE
## 2 pear 2 3 1 2 FALSE
## 3 orange 4 4 4 2 FALSE
## 4 grape 9 55 10 4 FALSE
## 5 orange 4 15 12 5 TRUE
## 6 orange 6 9 8 6 TRUE
df.sin.z <- df%>%
mutate(z=NULL)
df.sin.z
## fruit x y w
## 1 apple 1 22 2
## 2 pear 2 3 2
## 3 orange 4 4 2
## 4 grape 9 55 4
## 5 orange 4 15 5
## 6 orange 6 9 6
if (!require("nycflights13")) install.packages("nycflights13")
## Loading required package: nycflights13
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
## Not recommended but works
flights1 <- mutate(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours,
gain_per_minute = 60 * gain_per_hour)
names(flights1)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour" "gain"
## [21] "hours" "gain_per_hour" "gain_per_minute"
## Recommended
flights2 <- flights%>%
mutate(gain = arr_delay - dep_delay,
hours = air_time / 60) %>%
mutate(gain_per_hour = gain / hours) %>%
mutate(gain_per_minute = 60 * gain_per_hour)
flights2[1:6,c(1:2,20:23)]
## # A tibble: 6 x 6
## year month gain hours gain_per_hour gain_per_minute
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 9 3.78 2.38 143.
## 2 2013 1 16 3.78 4.23 254.
## 3 2013 1 31 2.67 11.6 698.
## 4 2013 1 -17 3.05 -5.57 -334.
## 5 2013 1 -19 1.93 -9.83 -590.
## 6 2013 1 16 2.5 6.4 384
fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
df <- df %>%
mutate(new.variable = x+y+z)
df
## fruit x y z new.variable
## 1 apple 1 22 3 26
## 2 pear 2 3 1 6
## 3 orange 4 4 4 12
## 4 grape 9 55 10 74
## 5 orange 4 15 12 31
## 6 orange 6 9 8 23
df1 <- transmute(df, new.variable = x+y+z)
df1
## new.variable
## 1 26
## 2 6
## 3 12
## 4 74
## 5 31
## 6 23
double.it <- function(x) x*2
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
iris %>%
mutate(across(where(is.numeric), double.it))%>%
head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 10.2 7.0 2.8 0.4 setosa
## 2 9.8 6.0 2.8 0.4 setosa
## 3 9.4 6.4 2.6 0.4 setosa
## 4 9.2 6.2 3.0 0.4 setosa
## 5 10.0 7.2 2.8 0.4 setosa
## 6 10.8 7.8 3.4 0.8 setosa
row1 <- c("a","b","c","d","e","f","column.to.be.changed")
row2 <- c(1,1,1,6,6,1,2)
row3 <- c(3,4,4,6,4,4,4)
row4 <- c(4,6,25,5,5,2,9)
row5 <- c(5,3,6,3,3,6,2)
df <- as.data.frame(rbind(row2,row3,row4,row5))
names(df) <- row1
df
## a b c d e f column.to.be.changed
## row2 1 1 1 6 6 1 2
## row3 3 4 4 6 4 4 4
## row4 4 6 25 5 5 2 9
## row5 5 3 6 3 3 6 2
new.df <- df %>%
mutate(column.to.be.changed = case_when(a==2 | a==5 | a==7 | (a==1 & b==4)~2, a==0 | a==1 | a==4 |
a==3 | c==4~3, TRUE ~ NA_real_ ))
new.df
## a b c d e f column.to.be.changed
## row2 1 1 1 6 6 1 3
## row3 3 4 4 6 4 4 3
## row4 4 6 25 5 5 2 3
## row5 5 3 6 3 3 6 2
fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z) #before select
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
df.no.fruit <- dplyr::select(df, -fruit)
df.no.fruit
## x y z
## 1 1 22 3
## 2 2 3 1
## 3 4 4 4
## 4 9 55 10
## 5 4 15 12
## 6 6 9 8
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
mtcar.sin.startd.endt <- mtcars%>%
select(-starts_with("d"), -ends_with("t"))
names(mtcar.sin.startd.endt)
## [1] "mpg" "cyl" "hp" "qsec" "vs" "am" "gear" "carb"
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
top.3.states <- data.frame(state, income, median.us,
life.expectancy)
top.3.states
## state income median.us life.expectancy
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
#Capitalize column names con toupper()
new.top.states <- top.3.states %>%
select_all(toupper)
new.top.states
## STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
#pull function: acts like dataframe$variable syntax
pull.first.column <- pull(top.3.states,1)
pull.first.column
## [1] "Maryland" "Alaska" "New Jersey"
top.3.states$state
## [1] "Maryland" "Alaska" "New Jersey"
pull.last.colum <- pull(top.3.states, -1)
pull.last.colum
## [1] 78.8 78.3 80.3
nrow(mtcars)
## [1] 32
mtcars.more.than.200 <- filter_all(mtcars, any_vars(. > 200))
nrow(mtcars.more.than.200)
## [1] 16
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
subset.mtcars <- mtcars %>%
select(matches("pg|gea"))
names(subset.mtcars)
## [1] "mpg" "gear"
#cbind: take data-frame arguments and combine by columns
us.state.areas <- as.data.frame(cbind(state.abb, state.area))
us.state.areas[1:3,]
## state.abb state.area
## 1 AL 51609
## 2 AK 589757
## 3 AZ 113909
us.state.abb.name <- as.data.frame(cbind(state.abb,state.name))
us.state.abb.name[1:3,]
## state.abb state.name
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
state.info.abb.area.name <- us.state.areas %>%
left_join(us.state.abb.name, by = "state.abb")
head(state.info.abb.area.name)
## state.abb state.area state.name
## 1 AL 51609 Alabama
## 2 AK 589757 Alaska
## 3 AZ 113909 Arizona
## 4 AR 53104 Arkansas
## 5 CA 158693 California
## 6 CO 104247 Colorado
#rows that do not match are dropped
names <- c("Sally","Tom","Frieda","Alfonzo")
team.scores <- c(3,5,2,7)
team.league <- c("alpha","beta","gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Frieda 2 gamma
## 4 Alfonzo 7 omicron
names = c("Sally","Tom", "Bill", "Alfonzo")
school.grades <- c("A","B","C","B")
school.info <- data.frame(names, school.grades)
school.info
## names school.grades
## 1 Sally A
## 2 Tom B
## 3 Bill C
## 4 Alfonzo B
school.and.team <- inner_join(team.info, school.info, by = "names")
school.and.team
## names team.scores team.league school.grades
## 1 Sally 3 alpha A
## 2 Tom 5 beta B
## 3 Alfonzo 7 omicron B
#Keeps values that do not match
names <- c("Sally","Tom","Frieda","Alfonzo")
team.scores <- c(3,5,2,7)
team.league <- c("alpha","beta","gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Frieda 2 gamma
## 4 Alfonzo 7 omicron
names <- c("Sally","Tom", "Bill", "Alfonzo")
school.grades <- c("A","B","C","B")
school.info <- data.frame(names, school.grades)
school.info
## names school.grades
## 1 Sally A
## 2 Tom B
## 3 Bill C
## 4 Alfonzo B
team.info.but.no.grades <- anti_join(team.info, school.info, by = "names")
team.info.but.no.grades
## names team.scores team.league
## 1 Frieda 2 gamma
#Keeps all values from both dataframes
team.info.and.or.grades <- full_join(team.info, school.info, by = "names")
team.info.and.or.grades
## names team.scores team.league school.grades
## 1 Sally 3 alpha A
## 2 Tom 5 beta B
## 3 Frieda 2 gamma <NA>
## 4 Alfonzo 7 omicron B
## 5 Bill NA <NA> C
#keeps all observations in dataset1 which match dataset2
team.info.with.grades <- semi_join(team.info, school.info)
## Joining, by = "names"
team.info.with.grades
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Alfonzo 7 omicron
#returns all rows of y and columns of x and y
#right_join(x, y, by= common_key)
us.state.areas<- as.data.frame(cbind(state.abb,state.area))
us.state.areas[1:3,]
## state.abb state.area
## 1 AL 51609
## 2 AK 589757
## 3 AZ 113909
us.state.abb.name <- as.data.frame(cbind(state.abb,state.name))
us.state.abb.name[1:3,]
## state.abb state.name
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
us.state.abb.name[1,1] <- "Intentional Mismatch"
us.state.with.abbreviation.and.name.and.area <- right_join(us.state.areas,us.state.abb.name, by = "state.abb")
us.state.with.abbreviation.and.name.and.area [1:3,]
## state.abb state.area state.name
## 1 AK 589757 Alaska
## 2 AZ 113909 Arizona
## 3 AR 53104 Arkansas
nrow(msleep)
## [1] 83
msleep.only.first.5 <- slice(msleep, -6:-n())
nrow(msleep.only.first.5)
## [1] 5
msleep.20.rows <- msleep %>%
slice(20:39)
nrow(msleep.20.rows)
## [1] 20
nrow(msleep) - nrow(msleep.20.rows)
## [1] 63
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data(gehan)
gehan2 <- gehan
library(tidyverse)
gehan2 %>% summarise(kount = n())
## kount
## 1 42
gehan2 %>%
group_by(treat)%>%
summarise(kount = n())
## # A tibble: 2 x 2
## treat kount
## <fct> <int>
## 1 6-MP 21
## 2 control 21
gehan2 %>%
group_by(treat) %>%
summarise(average.remiss.time = mean(time),
median.remiss.time = median(time),
std.dev.remiss.time = sd(time),
median.abs.deviation = mad(time),
IQR.remiss.time = IQR(time))
## # A tibble: 2 x 6
## treat average.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
## <fct> <dbl> <int> <dbl> <dbl>
## 1 6-MP 17.1 16 10.0 10.4
## 2 control 8.67 8 6.47 5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>
gehan2 %>%
group_by(treat) %>%
summarise(minimum.remission = min(time),
max.remission = max(time))
## # A tibble: 2 x 3
## treat minimum.remission max.remission
## <fct> <int> <int>
## 1 6-MP 6 35
## 2 control 1 23
subset.survey <- survey [1:10,]
library (dplyr)
names(subset.survey)
## [1] "Sex" "Wr.Hnd" "NW.Hnd" "W.Hnd" "Fold" "Pulse" "Clap" "Exer"
## [9] "Smoke" "Height" "M.I" "Age"
subset.survey %>%
na.omit() %>%
group_by(Sex) %>%
summarise(across(where(is.numeric), mean, .names = "mean_{col}")) %>%
head()
## # A tibble: 2 x 6
## Sex mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 17.8 17.7 76.7 168. 25.0
## 2 Male 19.1 19.2 76.8 174. 20.3
View(msleep)
new.sleep <- msleep %>%
group_by(vore, order)
s <- summarise(new.sleep, n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
s
## # A tibble: 32 x 3
## # Groups: vore [5]
## vore order `n()`
## <chr> <chr> <int>
## 1 carni Carnivora 12
## 2 carni Cetacea 3
## 3 carni Cingulata 1
## 4 carni Didelphimorphia 1
## 5 carni Primates 1
## 6 carni Rodentia 1
## 7 herbi Artiodactyla 5
## 8 herbi Diprotodontia 1
## 9 herbi Hyracoidea 2
## 10 herbi Lagomorpha 1
## # ... with 22 more rows
new.sleep.totals <- msleep %>%
group_by(vore, order) %>%
summarise(n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
new.sleep.totals
## # A tibble: 32 x 3
## # Groups: vore [5]
## vore order `n()`
## <chr> <chr> <int>
## 1 carni Carnivora 12
## 2 carni Cetacea 3
## 3 carni Cingulata 1
## 4 carni Didelphimorphia 1
## 5 carni Primates 1
## 6 carni Rodentia 1
## 7 herbi Artiodactyla 5
## 8 herbi Diprotodontia 1
## 9 herbi Hyracoidea 2
## 10 herbi Lagomorpha 1
## # ... with 22 more rows
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
teen.birth.rate.2015 <- c(17,29.3,12.1)
teen.birth.rate.2007 <- c(34.3,42.9,24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
top.3.states <- data.frame(state, income, median.us,
life.expectancy,
teen.birth.rate.2015, teen.birth.rate.2007,
teen.birth.rate.1991)
names(top.3.states) <- c("state", "income", "median.us",
"life.expectancy","2015","2007","1991")
top.3.states
## state income median.us life.expectancy 2015 2007 1991
## 1 Maryland 76067 61372 78.8 17.0 34.3 54.1
## 2 Alaska 74444 61372 78.3 29.3 42.9 66.0
## 3 New Jersey 73702 61372 80.3 12.1 24.9 41.3
#gather to put all three years in one column
new.top.3.states <- top.3.states %>%
gather("2015", "2007", "1991", key = "year", value = "cases")
new.top.3.states
## state income median.us life.expectancy year cases
## 1 Maryland 76067 61372 78.8 2015 17.0
## 2 Alaska 74444 61372 78.3 2015 29.3
## 3 New Jersey 73702 61372 80.3 2015 12.1
## 4 Maryland 76067 61372 78.8 2007 34.3
## 5 Alaska 74444 61372 78.3 2007 42.9
## 6 New Jersey 73702 61372 80.3 2007 24.9
## 7 Maryland 76067 61372 78.8 1991 54.1
## 8 Alaska 74444 61372 78.3 1991 66.0
## 9 New Jersey 73702 61372 80.3 1991 41.3
#key and value -> new columns
df_1 <- data_frame(Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
Answer = c("Yes", "No", NA, "No"), n = 1:4)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
df_1
## # A tibble: 4 x 3
## Type Answer n
## <chr> <chr> <int>
## 1 TypeA Yes 1
## 2 TypeA No 2
## 3 TypeB <NA> 3
## 4 TypeB No 4
df_2 <- df_1 %>%
filter(!is.na(Answer)) %>%
spread(key=Answer, value=n)
df_2
## # A tibble: 2 x 3
## Type No Yes
## <chr> <int> <int>
## 1 TypeA 2 1
## 2 TypeB 4 NA
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
teen.birth <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")
top.3.states <- data.frame(state, income, median.us,
life.expectancy,teen.birth)
top.3.states
## state income median.us life.expectancy teen.birth
## 1 Maryland 76067 61372 78.8 17//34.3//54.1
## 2 Alaska 74444 61372 78.3 29.0//42.9//66.0
## 3 New Jersey 73702 61372 80.3 12.1//24.9//41.3
top.3.states.separated.years <- top.3.states %>%
separate(teen.birth,
into = c("2015", "2007","1991"), sep = "//")
top.3.states.separated.years
## state income median.us life.expectancy 2015 2007 1991
## 1 Maryland 76067 61372 78.8 17 34.3 54.1
## 2 Alaska 74444 61372 78.3 29.0 42.9 66.0
## 3 New Jersey 73702 61372 80.3 12.1 24.9 41.3
new.sleep
## # A tibble: 83 x 11
## # Groups: vore, order [32]
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet~ Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl m~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mount~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Great~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## 7 North~ Call~ carni Carn~ vu 8.7 1.4 0.383 15.3
## 8 Vespe~ Calo~ <NA> Rode~ <NA> 7 NA NA 17
## 9 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 10 Roe d~ Capr~ herbi Arti~ lc 3 NA NA 21
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
m <- mutate(new.sleep, kount = n())
m[1:6,c(1:4,10:12)]
## # A tibble: 6 x 7
## # Groups: vore, order [6]
## name genus vore order brainwt bodywt kount
## <chr> <chr> <chr> <chr> <dbl> <dbl> <int>
## 1 Cheetah Acinonyx carni Carnivora NA 50 12
## 2 Owl monkey Aotus omni Primates 0.0155 0.48 10
## 3 Mountain beaver Aplodontia herbi Rodentia NA 1.35 16
## 4 Greater short-tailed shrew Blarina omni Soricomorp~ 0.00029 0.019 3
## 5 Cow Bos herbi Artiodacty~ 0.423 600 5
## 6 Three-toed sloth Bradypus herbi Pilosa NA 3.85 1
f <- filter(new.sleep, n() > 14)
f[1:5,c(1:4,10:11)]
## # A tibble: 5 x 6
## # Groups: vore, order [1]
## name genus vore order brainwt bodywt
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Mountain beaver Aplodontia herbi Rodentia NA 1.35
## 2 Guinea pig Cavis herbi Rodentia 0.0055 0.728
## 3 Chinchilla Chinchilla herbi Rodentia 0.0064 0.42
## 4 Western american chipmunk Eutamias herbi Rodentia NA 0.071
## 5 Mongolian gerbil Meriones herbi Rodentia NA 0.053
salary.description <- c("Golden parachute type","Well to do",
"Average","Below average", "bring date seeds instead of flowers")
first(salary.description)
## [1] "Golden parachute type"
last(salary.description)
## [1] "bring date seeds instead of flowers"
nth(salary.description, -3)
## [1] "Average"
nth(salary.description,2)
## [1] "Well to do"
a.vector <- c(22,33,44,1,2,3,3,3,4)
original.length <- length(a.vector)
original.length
## [1] 9
distinct.a.vector <- n_distinct(a.vector)
distinct.a.vector
## [1] 7
test1 <- if_else(original.length == distinct.a.vector, "all values
unique","some duplicate values in vector")
test1
## [1] "some duplicate values in vector"
b.vector <- c(1,2,3,4,5,6)
length(b.vector)
## [1] 6
distinct.b.vector <- n_distinct(b.vector)
distinct.b.vector
## [1] 6
test2 <- if_else(length(b.vector) == distinct.b.vector, "all values
unique", "duplicates")
test2
## [1] "all values\nunique"
test <- c(100, 0, 999)
x <- 5000/test
x
## [1] 50.000000 Inf 5.005005
x <- 5000/na_if(test,0)
x
## [1] 50.000000 NA 5.005005
#Replace missing values with zero
x <- c(33,4,11,NA,9)
x
## [1] 33 4 11 NA 9
x <- coalesce(x,0)
x
## [1] 33 4 11 0 9
#Ranking via index
y <- c(100,4,12,6,8,3)
rank1 <-row_number(y)
rank1
## [1] 6 2 5 3 4 1
y[rank1[1]]
## [1] 3
y[rank1[6]]
## [1] 100
#Minimum rank
rank2 <- min_rank(y)
rank2
## [1] 6 2 5 3 4 1
#Dense rank
rank3 <- dense_rank(y)
rank3
## [1] 6 2 5 3 4 1
#Percent rank
rank4 <- percent_rank(y)
rank4
## [1] 1.0 0.2 0.8 0.4 0.6 0.0
#Cumulative distribution function
y <- c(100,4,12,6,8,3)
rank5 <- cume_dist(y)
rank5
## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667
rank6 = ntile(y, 3)
rank6
## [1] 3 1 3 2 2 1
test.vector <- c(2,22,33,44,77,89,99)
quantile(test.vector, prob = seq(0,1,length = 11),type = 5)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 2.0 6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0
#Randomly sample 5 rows
data("ChickWeight")
my.sample <- sample_n(ChickWeight, 5)
my.sample
## weight Time Chick Diet
## 1 92 14 9 1
## 2 124 10 25 2
## 3 116 10 36 3
## 4 205 18 26 2
## 5 136 12 26 2
set.seed(833)
my.sample <- sample_n(ChickWeight, 10, replace = TRUE)
my.sample
## weight Time Chick Diet
## 1 98 8 45 4
## 2 42 0 17 1
## 3 98 8 36 3
## 4 51 2 11 1
## 5 198 20 3 1
## 6 237 21 49 4
## 7 205 16 50 4
## 8 170 16 39 3
## 9 332 18 35 3
## 10 144 14 33 3
my.sample <- sample_n(mtcars, 12, weight = cyl)
my.sample[,1:5]
## mpg cyl disp hp drat
## AMC Javelin 15.2 8 304.0 150 3.15
## Porsche 914-2 26.0 4 120.3 91 4.43
## Merc 280 19.2 6 167.6 123 3.92
## Cadillac Fleetwood 10.4 8 472.0 205 2.93
## Merc 240D 24.4 4 146.7 62 3.69
## Datsun 710 22.8 4 108.0 93 3.85
## Merc 280C 17.8 6 167.6 123 3.92
## Mazda RX4 Wag 21.0 6 160.0 110 3.90
## Merc 450SLC 15.2 8 275.8 180 3.07
## Chrysler Imperial 14.7 8 440.0 230 3.23
## Maserati Bora 15.0 8 301.0 335 3.54
## Valiant 18.1 6 225.0 105 2.76
#to obtain a sample equal to a specific percentage
test1 <- sample_frac(ChickWeight, 0.02)
test1
## weight Time Chick Diet
## 1 48 2 13 1
## 2 62 6 12 1
## 3 197 20 45 4
## 4 234 18 42 4
## 5 58 4 28 2
## 6 163 16 3 1
## 7 103 8 41 4
## 8 103 8 42 4
## 9 120 18 19 1
## 10 48 2 36 3
## 11 80 6 48 4
## 12 137 12 33 3
View(starwars)
by_hair_color <- starwars %>%
group_by(hair_color) %>%
sample_frac(.07,replace=TRUE)
by_hair_color
## # A tibble: 5 x 14
## # Groups: hair_color [3]
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Eeth Koth 171 NA black brown brown NA male mascu~
## 2 Dormé 165 NA brown light brown NA fema~ femin~
## 3 Sebulba 112 40 none grey, red orange NA male mascu~
## 4 Shaak Ti 178 57 none red, blue~ black NA fema~ femin~
## 5 Tion Medon 206 80 none grey black NA male mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
row.kount.only <- ChickWeight %>% tally()
row.kount.only
## n
## 1 578
diet.kount <- ChickWeight %>% count(Diet)
diet.kount
## Diet n
## 1 1 220
## 2 2 120
## 3 3 120
## 4 4 118
single.species.kount <- starwars %>%
add_count(species) %>%
filter(n == 1)
single.species.kount[,1:6]
## # A tibble: 29 x 6
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Greedo 173 74 <NA> green black
## 2 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown orange
## 3 Yoda 66 17 white green brown
## 4 Bossk 190 113 none green red
## 5 Ackbar 180 83 none brown mottle orange
## 6 Wicket Systri Warrick 88 20 brown brown brown
## 7 Nien Nunb 160 68 none grey black
## 8 Nute Gunray 191 90 none mottled green red
## 9 Watto 137 NA black blue, grey yellow
## 10 Sebulba 112 40 none grey, red orange
## # ... with 19 more rows
mtcars <- rename(mtcars, spam_mpg = mpg)
names(mtcars)
## [1] "spam_mpg" "cyl" "disp" "hp" "drat" "wt"
## [7] "qsec" "vs" "am" "gear" "carb"
new.starwars <- starwars %>%
dplyr::select(name, mass, gender, species, height) %>%
mutate(type = case_when(height > 200 | mass > 200 ~ "large",
species == "Droid" ~ "robot", TRUE ~ "other"))
new.starwars
## # A tibble: 87 x 6
## name mass gender species height type
## <chr> <dbl> <chr> <chr> <int> <chr>
## 1 Luke Skywalker 77 masculine Human 172 other
## 2 C-3PO 75 masculine Droid 167 robot
## 3 R2-D2 32 masculine Droid 96 robot
## 4 Darth Vader 136 masculine Human 202 large
## 5 Leia Organa 49 feminine Human 150 other
## 6 Owen Lars 120 masculine Human 178 other
## 7 Beru Whitesun lars 75 feminine Human 165 other
## 8 R5-D4 32 masculine Droid 97 robot
## 9 Biggs Darklighter 84 masculine Human 183 other
## 10 Obi-Wan Kenobi 77 masculine Human 182 other
## # ... with 77 more rows