This is my evolving and very incomplete collection of R How To tips and snippets. I put it together for my benefit only but am publishing it here in case it helps others. Some of it might be “borrowed” or adapted from others but done with good intentions. Please don’t sue me.
I hope to add to it regularly so it will change and expand over time.
library(gapminder) # install.packages("gapminder")
library(purrrlyr) # install.packages("purrrlyr")
library(janitor) #
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# install.packages("SnowballC")
library(widyr) # install.packages("widyr")
a <- 1:5
d <- c("Tom", "Dick", "Harry", "Jane", "Alice")
(DF0 <- tibble::tibble(a, b = a * 2, d)) # create and show data frame
## # A tibble: 5 × 3
## a b d
## <int> <dbl> <chr>
## 1 1 2 Tom
## 2 2 4 Dick
## 3 3 6 Harry
## 4 4 8 Jane
## 5 5 10 Alice
DF0$z <- 1 # add a column called z with a value of 1
DF0 # show the data frame
## # A tibble: 5 × 4
## a b d z
## <int> <dbl> <chr> <dbl>
## 1 1 2 Tom 1
## 2 2 4 Dick 1
## 3 3 6 Harry 1
## 4 4 8 Jane 1
## 5 5 10 Alice 1
(DF0 <- cbind(1, DF0)) # add a first column with a value of 1
## 1 a b d z
## 1 1 1 2 Tom 1
## 2 1 2 4 Dick 1
## 3 1 3 6 Harry 1
## 4 1 4 8 Jane 1
## 5 1 5 10 Alice 1
DF0 <- dplyr::rename(DF0, c = 1) # rename the column to / from
DF0
## c a b d z
## 1 1 1 2 Tom 1
## 2 1 2 4 Dick 1
## 3 1 3 6 Harry 1
## 4 1 4 8 Jane 1
## 5 1 5 10 Alice 1
col1 = c(2, 3, 5)
col2 = c("aa", "bb", "cc")
col3 = c(TRUE, FALSE, TRUE)
df = data.frame(col1, col2, col3) # df is a data frame
df # show the data frame
## col1 col2 col3
## 1 2 aa TRUE
## 2 3 bb FALSE
## 3 5 cc TRUE
index <- c(1, 2, 3, 4)
values <- c("G", "H", "J", "K")
df$col4 <- values[match(df$col1, index)]
df
## col1 col2 col3 col4
## 1 2 aa TRUE H
## 2 3 bb FALSE J
## 3 5 cc TRUE <NA>
suppressPackageStartupMessages(library(tidyverse))
mtcars %>%
mutate(vs_new = if_else(
vs == 1,
"one",
"zero",
NA_character_)) %>%
head()
## mpg cyl disp hp drat wt qsec vs am gear carb vs_new
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 zero
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 zero
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 one
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 one
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 zero
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 one
mtcars %>%
mutate(carb_new = case_when(.$carb == 1 ~ "one",
.$carb == 2 ~ "two",
.$carb == 4 ~ "four",
TRUE ~ "other")) %>%
head(15)
## mpg cyl disp hp drat wt qsec vs am gear carb carb_new
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 four
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 four
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 one
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 one
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 two
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 one
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 four
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 two
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 two
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 four
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 four
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 other
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 other
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 other
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 four
Apply a function to certain columns only, by rows
library(purrr)
library(purrrlyr)
mtcars %>%
select(am, gear, carb) %>%
purrrlyr::by_row(sum, .collate = "cols", .to = "sum_am_gear_carb") -> mtcars2
head(mtcars2)
## # A tibble: 6 × 4
## am gear carb sum_am_gear_carb
## <dbl> <dbl> <dbl> <dbl>
## 1 1 4 4 9
## 2 1 4 4 9
## 3 1 4 1 6
## 4 0 3 1 4
## 5 0 3 2 5
## 6 0 3 1 4
# then you can add it into the original data frame
mtcars <- cbind(mtcars, "sum_am_gear_carb" = mtcars2$sum_am_gear_carb)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## sum_am_gear_carb
## Mazda RX4 9
## Mazda RX4 Wag 9
## Datsun 710 6
## Hornet 4 Drive 4
## Hornet Sportabout 5
## Valiant 4
library(tidyverse)
data(mtcars)
mtcars_x <- mtcars
mtcars_y <- mtcars_x %>%
mutate(group = sample(LETTERS[1:3], nrow(.), replace = TRUE))
head(mtcars_y)
## mpg cyl disp hp drat wt qsec vs am gear carb group
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 A
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 B
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 A
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 A
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 B
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 A
library(tidyverse)
mtcars_x <- mtcars
head(mtcars_x)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
mtcars_x$am_factor <- factor(mtcars_x$am,
levels = c(0,1),
labels = c("Automatic","Manual"))
tibble::glimpse(mtcars_x)
## Rows: 32
## Columns: 12
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, …
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, …
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.…
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, …
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.15…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.9…
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, …
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, …
## $ am_factor <fct> Manual, Manual, Manual, Automatic, Automatic, Automatic, Aut…
iris[c("Sepal.Length","Sepal.Width")] / iris$Petal.Width
## Sepal.Length Sepal.Width
## 1 25.500000 17.500000
## 2 24.500000 15.000000
## 3 23.500000 16.000000
## 4 23.000000 15.500000
## 5 25.000000 18.000000
## 6 13.500000 9.750000
## 7 15.333333 11.333333
## 8 25.000000 17.000000
## 9 22.000000 14.500000
## 10 49.000000 31.000000
## 11 27.000000 18.500000
## 12 24.000000 17.000000
## 13 48.000000 30.000000
## 14 43.000000 30.000000
## 15 29.000000 20.000000
## 16 14.250000 11.000000
## 17 13.500000 9.750000
## 18 17.000000 11.666667
## 19 19.000000 12.666667
## 20 17.000000 12.666667
## 21 27.000000 17.000000
## 22 12.750000 9.250000
## 23 23.000000 18.000000
## 24 10.200000 6.600000
## 25 24.000000 17.000000
## 26 25.000000 15.000000
## 27 12.500000 8.500000
## 28 26.000000 17.500000
## 29 26.000000 17.000000
## 30 23.500000 16.000000
## 31 24.000000 15.500000
## 32 13.500000 8.500000
## 33 52.000000 41.000000
## 34 27.500000 21.000000
## 35 24.500000 15.500000
## 36 25.000000 16.000000
## 37 27.500000 17.500000
## 38 49.000000 36.000000
## 39 22.000000 15.000000
## 40 25.500000 17.000000
## 41 16.666667 11.666667
## 42 15.000000 7.666667
## 43 22.000000 16.000000
## 44 8.333333 5.833333
## 45 12.750000 9.500000
## 46 16.000000 10.000000
## 47 25.500000 19.000000
## 48 23.000000 16.000000
## 49 26.500000 18.500000
## 50 25.000000 16.500000
## 51 5.000000 2.285714
## 52 4.266667 2.133333
## 53 4.600000 2.066667
## 54 4.230769 1.769231
## 55 4.333333 1.866667
## 56 4.384615 2.153846
## 57 3.937500 2.062500
## 58 4.900000 2.400000
## 59 5.076923 2.230769
## 60 3.714286 1.928571
## 61 5.000000 2.000000
## 62 3.933333 2.000000
## 63 6.000000 2.200000
## 64 4.357143 2.071429
## 65 4.307692 2.230769
## 66 4.785714 2.214286
## 67 3.733333 2.000000
## 68 5.800000 2.700000
## 69 4.133333 1.466667
## 70 5.090909 2.272727
## 71 3.277778 1.777778
## 72 4.692308 2.153846
## 73 4.200000 1.666667
## 74 5.083333 2.333333
## 75 4.923077 2.230769
## 76 4.714286 2.142857
## 77 4.857143 2.000000
## 78 3.941176 1.764706
## 79 4.000000 1.933333
## 80 5.700000 2.600000
## 81 5.000000 2.181818
## 82 5.500000 2.400000
## 83 4.833333 2.250000
## 84 3.750000 1.687500
## 85 3.600000 2.000000
## 86 3.750000 2.125000
## 87 4.466667 2.066667
## 88 4.846154 1.769231
## 89 4.307692 2.307692
## 90 4.230769 1.923077
## 91 4.583333 2.166667
## 92 4.357143 2.142857
## 93 4.833333 2.166667
## 94 5.000000 2.300000
## 95 4.307692 2.076923
## 96 4.750000 2.500000
## 97 4.384615 2.230769
## 98 4.769231 2.230769
## 99 4.636364 2.272727
## 100 4.384615 2.153846
## 101 2.520000 1.320000
## 102 3.052632 1.421053
## 103 3.380952 1.428571
## 104 3.500000 1.611111
## 105 2.954545 1.363636
## 106 3.619048 1.428571
## 107 2.882353 1.470588
## 108 4.055556 1.611111
## 109 3.722222 1.388889
## 110 2.880000 1.440000
## 111 3.250000 1.600000
## 112 3.368421 1.421053
## 113 3.238095 1.428571
## 114 2.850000 1.250000
## 115 2.416667 1.166667
## 116 2.782609 1.391304
## 117 3.611111 1.666667
## 118 3.500000 1.727273
## 119 3.347826 1.130435
## 120 4.000000 1.466667
## 121 3.000000 1.391304
## 122 2.800000 1.400000
## 123 3.850000 1.400000
## 124 3.500000 1.500000
## 125 3.190476 1.571429
## 126 4.000000 1.777778
## 127 3.444444 1.555556
## 128 3.388889 1.666667
## 129 3.047619 1.333333
## 130 4.500000 1.875000
## 131 3.894737 1.473684
## 132 3.950000 1.900000
## 133 2.909091 1.272727
## 134 4.200000 1.866667
## 135 4.357143 1.857143
## 136 3.347826 1.304348
## 137 2.625000 1.416667
## 138 3.555556 1.722222
## 139 3.333333 1.666667
## 140 3.285714 1.476190
## 141 2.791667 1.291667
## 142 3.000000 1.347826
## 143 3.052632 1.421053
## 144 2.956522 1.391304
## 145 2.680000 1.320000
## 146 2.913043 1.304348
## 147 3.315789 1.315789
## 148 3.250000 1.500000
## 149 2.695652 1.478261
## 150 3.277778 1.666667
# create mydata
col1 = c(2, 3, 5)
col2 = c("aa", "bb", "cc")
col3 = c(TRUE, FALSE, TRUE)
col4 = c(123, 456, 789)
col5 = c("Mary", "Joan", "Denis")
col6 = c(111,333,444)
mydata = data.frame(col1, col2, col3, col4, col5, col6) # df is a data frame
mydata # show the data frame
## col1 col2 col3 col4 col5 col6
## 1 2 aa TRUE 123 Mary 111
## 2 3 bb FALSE 456 Joan 333
## 3 5 cc TRUE 789 Denis 444
myvars <- c("col1", "col3")
newdata <- mydata[myvars]
newdata
## col1 col3
## 1 2 TRUE
## 2 3 FALSE
## 3 5 TRUE
myvars <- paste("col", 1:2, sep="")
newdata <- mydata[myvars]
newdata
## col1 col2
## 1 2 aa
## 2 3 bb
## 3 5 cc
newdata <- mydata[c(1, 4:6)]
newdata
## col1 col4 col5 col6
## 1 2 123 Mary 111
## 2 3 456 Joan 333
## 3 5 789 Denis 444
mtcars %>%
select(-disp) %>%
head()
## mpg cyl hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 105 2.76 3.460 20.22 1 0 3 1
mtcars %>%
select_all(toupper) %>%
head(3)
## MPG CYL DISP HP DRAT WT QSEC VS AM GEAR CARB
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
tibble::rownames_to_column(mtcars, var = "car_name") %>%
head()
## car_name mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
col1 = c(22, 33, 55)
col2 = c("Alice", "Bob", "Charles")
col3 = c(TRUE, FALSE, TRUE)
DF = data.frame(col1, col2, col3) # DF is a data frame
DF # show the data frame
## col1 col2 col3
## 1 22 Alice TRUE
## 2 33 Bob FALSE
## 3 55 Charles TRUE
names(DF)
## [1] "col1" "col2" "col3"
names(DF)[names(DF) == 'col1'] <- 'Age' # From / To
names(DF)[names(DF) == 'col2'] <- 'Name' # From / To
names(DF)[names(DF) == 'col3'] <- 'Smoker' # From / To
DF
## Age Name Smoker
## 1 22 Alice TRUE
## 2 33 Bob FALSE
## 3 55 Charles TRUE
my_data <- iris # take a copy of the iris dataset
names(my_data) # look at the names
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
names(my_data)[1] <- "sepal_length" # change the name of first column
names(my_data)[2] <- "sepal_width"
names(my_data) # look at the names again
## [1] "sepal_length" "sepal_width" "Petal.Length" "Petal.Width" "Species"
data(mtcars)
head(mtcars, 2) # show first two rows
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
mtcars <- dplyr::rename(mtcars, spam_mpg = mpg) # to / from
mtcars <- dplyr::rename(mtcars, cylinders = cyl) # to / from
head(mtcars, 2) # show first two rows
## spam_mpg cylinders disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
library(tidyverse)
# refresh the dataset
mtcars <- datasets::mtcars
# rename variable using select. New name = old name.
mtcars %>% select(1, Cylinders = cyl, 3:11) %>% head(3)
## mpg Cylinders disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
head(mtcars, 3)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
mtcars %>%
relocate(gear) %>% # move gear column to the left
head(3)
## gear mpg cyl disp hp drat wt qsec vs am carb
## Mazda RX4 4 21.0 6 160 110 3.90 2.620 16.46 0 1 4
## Mazda RX4 Wag 4 21.0 6 160 110 3.90 2.875 17.02 0 1 4
## Datsun 710 4 22.8 4 108 93 3.85 2.320 18.61 1 1 1
mtcars %>%
relocate(cyl, .before=wt) %>% # move cyl before wt
head(3)
## mpg disp hp drat cyl wt qsec vs am gear carb
## Mazda RX4 21.0 160 110 3.90 6 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 160 110 3.90 6 2.875 17.02 0 1 4 4
## Datsun 710 22.8 108 93 3.85 4 2.320 18.61 1 1 4 1
mtcars %>%
relocate(mpg:cyl, .after=carb) %>% # move mpg, cyl after carb
head(3)
## disp hp drat wt qsec vs am gear carb mpg cyl
## Mazda RX4 160 110 3.90 2.620 16.46 0 1 4 4 21.0 6
## Mazda RX4 Wag 160 110 3.90 2.875 17.02 0 1 4 4 21.0 6
## Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 22.8 4
DF1 <- readr::read_csv("Task,< 1 a week,1-4 a week,1-3 a day,>4 a day
Basic exploratory data analysis,11,32,46,12
Data cleaning,19,42,31,7
Machine learning/statistics,34,29,27,10
Creating visualizations,23,41,29,7
Presenting analysis,27,47,20,6
Extract/transform/load,43,32,20,5")
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Task
## dbl (4): < 1 a week, 1-4 a week, 1-3 a day, >4 a day
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
DF1
## # A tibble: 6 × 5
## Task `< 1 a week` `1-4 a week` `1-3 a day` `>4 a day`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic exploratory data analy… 11 32 46 12
## 2 Data cleaning 19 42 31 7
## 3 Machine learning/statistics 34 29 27 10
## 4 Creating visualizations 23 41 29 7
## 5 Presenting analysis 27 47 20 6
## 6 Extract/transform/load 43 32 20 5
names(DF1) <- make.names(names(DF1))
names(DF1)
## [1] "Task" "X..1.a.week" "X1.4.a.week" "X1.3.a.day" "X.4.a.day"
library(tidyverse)
(x <- data.frame(X_1 = c(NA, "Title", 1:3),
X_2 = c(NA, "Title2", 4:6)))
## X_1 X_2
## 1 <NA> <NA>
## 2 Title Title2
## 3 1 4
## 4 2 5
## 5 3 6
x %>%
janitor::row_to_names(row_number = 2)
## Title Title2
## 3 1 4
## 4 2 5
## 5 3 6
# Source: http://sfirke.github.io/janitor/reference/row_to_names.html
Note to self. Am a bit untrustworth of “within” maybe due to unfamiliarity. Make sure to test it.
First, a non-dplyr way.
df1 = structure(list(a = c(11.77, 10.9, 10.32, 10.96, 9.906, 10.7, 11.43, 11.41, 10.48512, 11.19),
b = c(2, 3, 2, 0, 0, 0, 1, 2, 4, 0),
est = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
d = c("x","x","x","x","x","x","x","x","x","x"),
e = c(1,1,1,1,2,2,2,2,3,3)), .Names = c("a", "b", "est","d","e"),
row.names = c(NA, -10L), class = "data.frame")
df1
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
df2 = df1 # take a clean copy of the original data frame
df2
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
# if b is zero set est to a value derived from a
df2 <- within(df2, est[b==0] <- (a[b==0]-5)/2.533)
df2
## a b est d e
## 1 11.77000 2 0.000000 x 1
## 2 10.90000 3 0.000000 x 1
## 3 10.32000 2 0.000000 x 1
## 4 10.96000 0 2.352941 x 1
## 5 9.90600 0 1.936834 x 2
## 6 10.70000 0 2.250296 x 2
## 7 11.43000 1 0.000000 x 2
## 8 11.41000 2 0.000000 x 2
## 9 10.48512 4 0.000000 x 3
## 10 11.19000 0 2.443743 x 3
df4 = df1 # take a clean copy of the original data frame
df4
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
# if b is zero and e is 2, set d to a particular value
df4 <- within(df4, d[b==0 & e==2] <- "whoa")
df4
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 whoa 2
## 6 10.70000 0 0 whoa 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
df5 = df1 # take a clean copy of the original data frame
df5
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
# if b is zero and e is 2, or b is 0 and e is 3 set d to "whoa"
df5 <- within(df5, d[(b==0 & e==2) | (b==0 & e==3)] <- "whoa")
df5
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 whoa 2
## 6 10.70000 0 0 whoa 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 whoa 3
A dplyr approach.
df6 = df1 # take a clean copy of the original data frame
df6
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
# if b is zero set est to a value derived from a
df6 <- df6 %>%
mutate(est = ifelse(b == 0, (a - 5)/2.533, 0))
df6
## a b est d e
## 1 11.77000 2 0.000000 x 1
## 2 10.90000 3 0.000000 x 1
## 3 10.32000 2 0.000000 x 1
## 4 10.96000 0 2.352941 x 1
## 5 9.90600 0 1.936834 x 2
## 6 10.70000 0 2.250296 x 2
## 7 11.43000 1 0.000000 x 2
## 8 11.41000 2 0.000000 x 2
## 9 10.48512 4 0.000000 x 3
## 10 11.19000 0 2.443743 x 3
df7 = df1 # take a clean copy of the original data frame
df7
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
df7 <- df7 %>%
mutate(d = ifelse((b == 0 & e == 2), "whoa", d))
df7
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 whoa 2
## 6 10.70000 0 0 whoa 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
df8 = df1 # take a clean copy of the original data frame
df8
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 x 2
## 6 10.70000 0 0 x 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 x 3
# if b is zero and e is 2, or b is 0 and e is 3 set d to "whoa"
df8 <- df8 %>%
mutate(d = ifelse((b==0 & e==2) | (b==0 & e==3), "whoa", d))
df8
## a b est d e
## 1 11.77000 2 0 x 1
## 2 10.90000 3 0 x 1
## 3 10.32000 2 0 x 1
## 4 10.96000 0 0 x 1
## 5 9.90600 0 0 whoa 2
## 6 10.70000 0 0 whoa 2
## 7 11.43000 1 0 x 2
## 8 11.41000 2 0 x 2
## 9 10.48512 4 0 x 3
## 10 11.19000 0 0 whoa 3
I’m not sure about this. Need to test it for a variety of strings.
library(dplyr, magrittr)
mtcars %>%
select(contains("s")) %>%
head()
## disp qsec vs
## Mazda RX4 160 16.46 0
## Mazda RX4 Wag 160 17.02 0
## Datsun 710 108 18.61 1
## Hornet 4 Drive 258 19.44 1
## Hornet Sportabout 360 17.02 0
## Valiant 225 20.22 1
starwars %>%
dplyr::add_count(species) %>%
filter(n == 2)
## # A tibble: 10 × 15
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Chewbac… 228 112 brown unknown blue 200 male mascu…
## 2 Darth M… 175 80 none red yellow 54 male mascu…
## 3 Bib For… 180 NA none pale pink NA male mascu…
## 4 Ayla Se… 178 55 none blue hazel 48 fema… femin…
## 5 Eeth Ko… 171 NA black brown brown NA male mascu…
## 6 Luminar… 170 56.2 black yellow blue 58 fema… femin…
## 7 Barriss… 166 50 black yellow blue 40 fema… femin…
## 8 Lama Su 229 88 none grey black NA male mascu…
## 9 Taun We 213 NA none grey black NA fema… femin…
## 10 Tarfful 234 136 brown brown blue NA male mascu…
## # … with 6 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>, n <int>
(DF1 <- data.frame(
stringsAsFactors = FALSE,
T1 = c(1L, 2L, 3L, 4L, 5L),
ID = c("A", "B", "C", "D", "E"),
Type = c("L", "Y", "B", "U", "Z")
))
## T1 ID Type
## 1 1 A L
## 2 2 B Y
## 3 3 C B
## 4 4 D U
## 5 5 E Z
(DF2 <- data.frame(
stringsAsFactors = FALSE,
T1 = c(1L, 2L, 3L, 4L, 5L),
ID = c("A", "B", "F", "G", "H"),
Type = c("L", "Y", "K", "I", "T")
))
## T1 ID Type
## 1 1 A L
## 2 2 B Y
## 3 3 F K
## 4 4 G I
## 5 5 H T
unique(rbind(DF1, DF2))
## T1 ID Type
## 1 1 A L
## 2 2 B Y
## 3 3 C B
## 4 4 D U
## 5 5 E Z
## 8 3 F K
## 9 4 G I
## 10 5 H T
# # REFERENCES / CREDIT / SOURCE / INSPIRATION
# https://stackoverflow.com/questions/49116972/how-to-keep-previous-row-while-removing-duplicate-from-r-dataframe
If the whole row is duplicated, this will deduplicate it.
# Create a data frame with duplicate rows
(DF2 <- data.frame(
stringsAsFactors = FALSE,
T1 = c(1L, 2L, 2L, 4L, 5L),
ID = c("A", "B", "B", "G", "H"),
Type = c("L", "Y", "Y", "I", "T")
))
## T1 ID Type
## 1 1 A L
## 2 2 B Y
## 3 2 B Y
## 4 4 G I
## 5 5 H T
# Create a new data frame with duplicate rows removed.
(DF3 <- unique(DF2))
## T1 ID Type
## 1 1 A L
## 2 2 B Y
## 4 4 G I
## 5 5 H T
# In case you need it, create a data frame of the duplicated rows.
DF4_duplicated_rows <- DF2 %>%
mutate(duplicate = n()>1) %>%
filter(duplicate)
(DF4_duplicated_rows <- DF2[which(duplicated(DF2)),])
## T1 ID Type
## 3 2 B Y
# Show all duplicates using fromLast
# credit: https://twitter.com/joshua_ulrich/status/1323396889513963520
set.seed(21)
(x <- sample(3,7, replace = TRUE))
## [1] 3 1 3 1 2 3 1
duplicated(x)
## [1] FALSE FALSE TRUE TRUE FALSE TRUE TRUE
duplicated(x, fromLast = TRUE)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
What if only part of a row, a couple of columns, are duplicated?
# Create a data frame with two rows that are partial duplicates.
(DF2 <- data.frame(
stringsAsFactors = FALSE,
T1 = c(1L, 2L, 2L, 4L, 5L),
ID = c("A", "B", "B", "G", "H"),
Type = c("L", "Y", "K", "I", "T")
))
## T1 ID Type
## 1 1 A L
## 2 2 B Y
## 3 2 B K
## 4 4 G I
## 5 5 H T
# Filter out the duplicates into a separate data frame
(DF4_duplicated_rows <- DF2 %>%
group_by(T1, ID) %>%
mutate(duplicate = n()>1) %>%
filter(duplicate))
## # A tibble: 2 × 4
## # Groups: T1, ID [1]
## T1 ID Type duplicate
## <int> <chr> <chr> <lgl>
## 1 2 B Y TRUE
## 2 2 B K TRUE
# Just mark the rows with partial duplicates, both of them.
(DF5_duplicate_rows_marked <- DF2 %>%
group_by(T1, ID) %>%
mutate(duplicate = n()>1))
## # A tibble: 5 × 4
## # Groups: T1, ID [4]
## T1 ID Type duplicate
## <int> <chr> <chr> <lgl>
## 1 1 A L FALSE
## 2 2 B Y TRUE
## 3 2 B K TRUE
## 4 4 G I FALSE
## 5 5 H T FALSE
DF1 <- data.frame(REFID = 1:10, startDate = c("1979-01-21","1979-03-31","1979-04-12","1979-06-01","1980-01-01", "1980-02-19","1980-02-27","1980-10-02","1980-10-31","1980-11-21"),endDate = c("1979-01-23", "1979-03-01", "NA", "1979-06-04", "1980-01-03", "NA", "1980-02-28", "1980-10-04", "1980-11-01", "1980-11-23"))
tibble::glimpse(DF1)
## Rows: 10
## Columns: 3
## $ REFID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
## $ startDate <chr> "1979-01-21", "1979-03-31", "1979-04-12", "1979-06-01", "198…
## $ endDate <chr> "1979-01-23", "1979-03-01", "NA", "1979-06-04", "1980-01-03"…
# set the columns as dates
DF1[,2:3] <- lapply(DF1[,2:3], as.Date)
tibble::glimpse(DF1)
## Rows: 10
## Columns: 3
## $ REFID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
## $ startDate <date> 1979-01-21, 1979-03-31, 1979-04-12, 1979-06-01, 1980-01-01, …
## $ endDate <date> 1979-01-23, 1979-03-01, NA, 1979-06-04, 1980-01-03, NA, 198…
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
# Set the missing date to a calculated value
DF2 <- DF1 %>%
mutate(endDate = ifelse(is.na(endDate), startDate + days(2), endDate)) %>%
mutate(endDate = as.Date(.[,"endDate"], origin = "1970-01-01"))
DF2
## REFID startDate endDate
## 1 1 1979-01-21 1979-01-23
## 2 2 1979-03-31 1979-03-01
## 3 3 1979-04-12 1979-04-14
## 4 4 1979-06-01 1979-06-04
## 5 5 1980-01-01 1980-01-03
## 6 6 1980-02-19 1980-02-21
## 7 7 1980-02-27 1980-02-28
## 8 8 1980-10-02 1980-10-04
## 9 9 1980-10-31 1980-11-01
## 10 10 1980-11-21 1980-11-23
# Set the missing date to a particular value
DF3 <- DF1 %>%
mutate(endDate = ifelse(is.na(endDate), as.Date("2049-12-25"), endDate)) %>%
mutate(endDate = as.Date(.[,"endDate"], origin = "1970-01-01"))
DF3
## REFID startDate endDate
## 1 1 1979-01-21 1979-01-23
## 2 2 1979-03-31 1979-03-01
## 3 3 1979-04-12 2049-12-25
## 4 4 1979-06-01 1979-06-04
## 5 5 1980-01-01 1980-01-03
## 6 6 1980-02-19 2049-12-25
## 7 7 1980-02-27 1980-02-28
## 8 8 1980-10-02 1980-10-04
## 9 9 1980-10-31 1980-11-01
## 10 10 1980-11-21 1980-11-23
Unix time stamps using the uniform distribution.
A unix time stamp is the number of seconds between a particular date and the Unix Epoch of January 1st 1970.
library(lubridate)
lubridate::as_datetime( runif(10, 1546290000, 1577739600))
## [1] "2019-12-25 18:43:41 UTC" "2019-11-06 01:19:27 UTC"
## [3] "2019-08-31 20:37:49 UTC" "2019-12-07 06:59:47 UTC"
## [5] "2019-01-22 01:17:54 UTC" "2019-08-14 01:13:40 UTC"
## [7] "2019-03-05 13:05:15 UTC" "2019-01-14 14:07:55 UTC"
## [9] "2019-07-11 02:08:09 UTC" "2019-04-13 14:09:41 UTC"
lubridate::as_datetime( runif(10, 1547290000, 1617618847))
## [1] "2020-02-22 16:01:13 UTC" "2020-06-11 09:03:08 UTC"
## [3] "2019-01-21 17:28:44 UTC" "2020-05-21 05:44:11 UTC"
## [5] "2020-10-04 18:42:30 UTC" "2021-02-02 13:17:07 UTC"
## [7] "2019-09-06 21:29:58 UTC" "2020-10-15 18:00:22 UTC"
## [9] "2020-04-19 04:49:13 UTC" "2020-10-07 02:07:21 UTC"
See https://github.com/dgrtwo/widyr
library(devtools)
## Loading required package: usethis
# install_github("dgrtwo/widyr")
library(widyr)
library(gapminder)
data(gapminder)
gapminder %>%
pairwise_dist(country, year, lifeExp)
## # A tibble: 20,022 × 3
## item1 item2 distance
## <fct> <fct> <dbl>
## 1 Albania Afghanistan 107.
## 2 Algeria Afghanistan 76.8
## 3 Angola Afghanistan 4.65
## 4 Argentina Afghanistan 110.
## 5 Australia Afghanistan 129.
## 6 Austria Afghanistan 124.
## 7 Bahrain Afghanistan 98.1
## 8 Bangladesh Afghanistan 45.3
## 9 Belgium Afghanistan 125.
## 10 Benin Afghanistan 39.3
## # … with 20,012 more rows
gapminder %>%
pairwise_dist(country, year, lifeExp, sort = TRUE)
## # A tibble: 20,022 × 3
## item1 item2 distance
## <fct> <fct> <dbl>
## 1 Sierra Leone Iceland 138.
## 2 Iceland Sierra Leone 138.
## 3 Sweden Sierra Leone 137.
## 4 Sierra Leone Sweden 137.
## 5 Sierra Leone Norway 135.
## 6 Norway Sierra Leone 135.
## 7 Iceland Afghanistan 135.
## 8 Afghanistan Iceland 135.
## 9 Sierra Leone Netherlands 135.
## 10 Netherlands Sierra Leone 135.
## # … with 20,012 more rows
gapminder %>%
pairwise_dist(country, year, lifeExp, upper = FALSE) %>%
arrange(distance)
## # A tibble: 10,011 × 3
## item1 item2 distance
## <fct> <fct> <dbl>
## 1 Belgium Germany 1.08
## 2 New Zealand United Kingdom 1.51
## 3 Netherlands Norway 1.56
## 4 Israel Italy 1.66
## 5 Austria Finland 1.94
## 6 Belgium United Kingdom 1.95
## 7 Iceland Sweden 2.01
## 8 Comoros Mauritania 2.01
## 9 Belgium United States 2.09
## 10 Germany Ireland 2.10
## # … with 10,001 more rows
gapminder %>%
pairwise_cor(country, year, lifeExp, upper = FALSE, sort = TRUE)
## # A tibble: 10,011 × 3
## item1 item2 correlation
## <fct> <fct> <dbl>
## 1 Indonesia Mauritania 1.00
## 2 Morocco Senegal 1.00
## 3 Saudi Arabia West Bank and Gaza 1.00
## 4 Brazil France 0.999
## 5 Bahrain Reunion 0.999
## 6 Malaysia Sao Tome and Principe 0.999
## 7 Peru Syria 0.999
## 8 Bolivia Gambia 0.999
## 9 Indonesia Morocco 0.999
## 10 Libya Senegal 0.999
## # … with 10,001 more rows
closest <- gapminder %>%
pairwise_dist(country, year, lifeExp) %>%
arrange(distance)
closest
## # A tibble: 20,022 × 3
## item1 item2 distance
## <fct> <fct> <dbl>
## 1 Germany Belgium 1.08
## 2 Belgium Germany 1.08
## 3 United Kingdom New Zealand 1.51
## 4 New Zealand United Kingdom 1.51
## 5 Norway Netherlands 1.56
## 6 Netherlands Norway 1.56
## 7 Italy Israel 1.66
## 8 Israel Italy 1.66
## 9 Finland Austria 1.94
## 10 Austria Finland 1.94
## # … with 20,012 more rows
closest %>%
filter(item1 == "United States")
## # A tibble: 141 × 3
## item1 item2 distance
## <fct> <fct> <dbl>
## 1 United States Belgium 2.09
## 2 United States Germany 2.48
## 3 United States United Kingdom 2.51
## 4 United States Ireland 2.99
## 5 United States New Zealand 3.69
## 6 United States Finland 3.76
## 7 United States Austria 4.18
## 8 United States Greece 4.30
## 9 United States France 4.63
## 10 United States Denmark 5.26
## # … with 131 more rows
library(dplyr)
library(readr)
df1 <- read_csv("id, sex, name, age
ID1, M, Alan, 40
ID2, M, Bert, 50
ID3, M, Chris, 60
ID4, F, Ava, 30
ID5, F, Barbara, 40
ID6, F, Cleo, 50")
## Rows: 6 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): id, sex, name
## dbl (1): age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2 <- df1 %>%
group_by(sex) %>%
mutate(mean_age_by_sex = mean(age)) %>%
filter(age > mean_age_by_sex)
df2
## # A tibble: 2 × 5
## # Groups: sex [2]
## id sex name age mean_age_by_sex
## <chr> <chr> <chr> <dbl> <dbl>
## 1 ID3 M Chris 60 50
## 2 ID6 F Cleo 50 40
date: "`r Sys.Date()`"
`)
(MX1 <- matrix (
data = c (10, 20, 30, 40),
nrow = 2,
ncol = 2
))
## [,1] [,2]
## [1,] 10 30
## [2,] 20 40
# or
(MX1 <- matrix (c(10, 20, 30, 40), 2, 2))
## [,1] [,2]
## [1,] 10 30
## [2,] 20 40
# and filling by row looks like:
(MX1 <- matrix( c(10, 20, 30, 40), 2, 2, byrow = TRUE ))
## [,1] [,2]
## [1,] 10 20
## [2,] 30 40
sapply(df0, function(x) sum(is.na(x)))
(Source: https://twitter.com/RLangTip/status/1016713670469017601)
`%divisible_by%` <- function(x,y) x %% y == 0
6 %divisible_by% 3
## [1] TRUE
6 %divisible_by% 4
## [1] FALSE
tidyr::expand(mtcars, carb, cyl) #
## # A tibble: 18 × 2
## carb cyl
## <dbl> <dbl>
## 1 1 4
## 2 1 6
## 3 1 8
## 4 2 4
## 5 2 6
## 6 2 8
## 7 3 4
## 8 3 6
## 9 3 8
## 10 4 4
## 11 4 6
## 12 4 8
## 13 6 4
## 14 6 6
## 15 6 8
## 16 8 4
## 17 8 6
## 18 8 8
plot(tidyr::expand(mtcars, carb, cyl))
sapply(mtcars, function(x) length(unique(x)))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 25 3 27 22 22 29 30 2 2 3 6
# or do it the purrr way
mtcars %>% map(function(df) length(unique(df)))
## $mpg
## [1] 25
##
## $cyl
## [1] 3
##
## $disp
## [1] 27
##
## $hp
## [1] 22
##
## $drat
## [1] 22
##
## $wt
## [1] 29
##
## $qsec
## [1] 30
##
## $vs
## [1] 2
##
## $am
## [1] 2
##
## $gear
## [1] 3
##
## $carb
## [1] 6
# or use data.table
x <- sapply(mtcars, data.table::uniqueN)
x ; class(x) # vector
## mpg cyl disp hp drat wt qsec vs am gear carb
## 25 3 27 22 22 29 30 2 2 3 6
## [1] "integer"
y <- map(mtcars, data.table::uniqueN)
y; class(y) # list
## $mpg
## [1] 25
##
## $cyl
## [1] 3
##
## $disp
## [1] 27
##
## $hp
## [1] 22
##
## $drat
## [1] 22
##
## $wt
## [1] 29
##
## $qsec
## [1] 30
##
## $vs
## [1] 2
##
## $am
## [1] 2
##
## $gear
## [1] 3
##
## $carb
## [1] 6
## [1] "list"
library(datasets)
suppressPackageStartupMessages(library(tidyverse))
mtcars %>%
group_by(mtcars$cyl) %>%
summarise(incidence = n()) %>%
count(incidence)
## # A tibble: 3 × 2
## incidence n
## <int> <int>
## 1 7 1
## 2 11 1
## 3 14 1
year <-
c(2000,
2000,
2000,
2000,
2000,
2000,
2001,
2001,
2001,
2002,
2002,
2002)
id <- c(1, 1, 1, 2, 3, 3, 1, 1, 4, 5, 4, 4)
DF0 <- data.frame(year, id)
DF0
## year id
## 1 2000 1
## 2 2000 1
## 3 2000 1
## 4 2000 2
## 5 2000 3
## 6 2000 3
## 7 2001 1
## 8 2001 1
## 9 2001 4
## 10 2002 5
## 11 2002 4
## 12 2002 4
tibble::glimpse(DF0)
## Rows: 12
## Columns: 2
## $ year <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002,…
## $ id <dbl> 1, 1, 1, 2, 3, 3, 1, 1, 4, 5, 4, 4
library(tidyverse)
DF1 <- DF0 %>%
group_by(year) %>%
mutate(number_of_unique_ids = n_distinct(id)) %>%
select(year, number_of_unique_ids) %>%
unique()
DF1
## # A tibble: 3 × 2
## # Groups: year [3]
## year number_of_unique_ids
## <dbl> <int>
## 1 2000 3
## 2 2001 2
## 3 2002 2
Read in three files, each of which has variables a, b, c with three rows of numeric data
library(tidyverse)
list_of_files <- list.files("data/",
pattern = "purrr-bind-example",
full.names = TRUE)
one_file_df <- map_df(list_of_files, read_csv)
tibble::glimpse(one_file_df)
## Rows: 0
## Columns: 0
# with file names included in the output data
list_of_files <- list.files("data/",
pattern = "purrr-bind-example",
full.names = TRUE) %>%
set_names(basename(.))
one_file_df <- map_df(list_of_files, read_csv, .id = "file")
one_file_df
## # A tibble: 0 × 0
# REFERENCES / CREDIT / SOURCE / INSPIRATION
# https://gist.github.com/ColinFay/d74d331825868b181860212cd1577b69
dim(mtcars)
## [1] 32 11
# remove those with 8 cylinders and 3 gears
mtcars2 <-mtcars[!(mtcars$cyl == 8 & mtcars$gear == 3),]
dim(mtcars2)
## [1] 20 11
# remove those with 8 cylinders and 3 gears
mtcars2 <- subset(mtcars, !(mtcars$cyl == 8 & mtcars$gear == 3))
dim(mtcars2)
## [1] 20 11
dim(mtcars2)
## [1] 20 11
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
(data.table::setorder(mtcars, -mpg))
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
(data.table::setorder(mtcars, gear, mpg))
## mpg cyl disp hp drat wt qsec vs am gear carb
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
Create some sample data frames.
set.seed(1L)
DT <- data.table::data.table(V1 = rep(c(1L, 2L), 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3))
set.seed(1L)
DF <- dplyr::tibble(V1 = rep(c(1L, 2L), 5)[-10],
V2 = 1:9,
V3 = rep(c(0.5, 1.0, 1.5), 3),
V4 = rep(LETTERS[1:3], 3),
V5 = c("E","E E","E F","E F","eF","Ef","EE FF","FF","GG"))
DF
## # A tibble: 9 × 5
## V1 V2 V3 V4 V5
## <int> <int> <dbl> <chr> <chr>
## 1 1 1 0.5 A E
## 2 2 2 1 B E E
## 3 1 3 1.5 C E F
## 4 2 4 0.5 A E F
## 5 1 5 1 B eF
## 6 2 6 1.5 C Ef
## 7 1 7 0.5 A EE FF
## 8 2 8 1 B FF
## 9 1 9 1.5 C GG
DT[order(V3)] # sorting see also setorder
## V1 V2 V3 V4
## 1: 1 1 0.5 A
## 2: 2 4 0.5 A
## 3: 1 7 0.5 A
## 4: 2 2 1.0 B
## 5: 1 5 1.0 B
## 6: 2 8 1.0 B
## 7: 1 3 1.5 C
## 8: 2 6 1.5 C
## 9: 1 9 1.5 C
dplyr::arrange(DF, V3) # sorting
## # A tibble: 9 × 5
## V1 V2 V3 V4 V5
## <int> <int> <dbl> <chr> <chr>
## 1 1 1 0.5 A E
## 2 2 4 0.5 A E F
## 3 1 7 0.5 A EE FF
## 4 2 2 1 B E E
## 5 1 5 1 B eF
## 6 2 8 1 B FF
## 7 1 3 1.5 C E F
## 8 2 6 1.5 C Ef
## 9 1 9 1.5 C GG
DT[order(-V3)] # sorting
## V1 V2 V3 V4
## 1: 1 3 1.5 C
## 2: 2 6 1.5 C
## 3: 1 9 1.5 C
## 4: 2 2 1.0 B
## 5: 1 5 1.0 B
## 6: 2 8 1.0 B
## 7: 1 1 0.5 A
## 8: 2 4 0.5 A
## 9: 1 7 0.5 A
arrange(DF, desc(V3)) # sorting
## # A tibble: 9 × 5
## V1 V2 V3 V4 V5
## <int> <int> <dbl> <chr> <chr>
## 1 1 3 1.5 C E F
## 2 2 6 1.5 C Ef
## 3 1 9 1.5 C GG
## 4 2 2 1 B E E
## 5 1 5 1 B eF
## 6 2 8 1 B FF
## 7 1 1 0.5 A E
## 8 2 4 0.5 A E F
## 9 1 7 0.5 A EE FF
DT[order(V1, -V2)] # sorting
## V1 V2 V3 V4
## 1: 1 9 1.5 C
## 2: 1 7 0.5 A
## 3: 1 5 1.0 B
## 4: 1 3 1.5 C
## 5: 1 1 0.5 A
## 6: 2 8 1.0 B
## 7: 2 6 1.5 C
## 8: 2 4 0.5 A
## 9: 2 2 1.0 B
arrange(DF, V1, desc(V2)) # sorting
## # A tibble: 9 × 5
## V1 V2 V3 V4 V5
## <int> <int> <dbl> <chr> <chr>
## 1 1 9 1.5 C GG
## 2 1 7 0.5 A EE FF
## 3 1 5 1 B eF
## 4 1 3 1.5 C E F
## 5 1 1 0.5 A E
## 6 2 8 1 B FF
## 7 2 6 1.5 C Ef
## 8 2 4 0.5 A E F
## 9 2 2 1 B E E
x <- c("100a10", "100a5", "2b", "2a") ; stringr::str_sort(x) # sorting
## [1] "100a10" "100a5" "2a" "2b"
x <- c("100a10", "100a5", "2b", "2a") ; stringr::str_sort(x, numeric = TRUE) # sorting
## [1] "2a" "2b" "100a5" "100a10"
# Function to reverse a string or vector
strReverse <- function(x) {
sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
}
(string1 = "abcdefg")
## [1] "abcdefg"
(string2 = strReverse(string1))
## [1] "gfedcba"
# a function to reverse a string
str_rev <- function(x) {
sapply( x, function(xx) {
intToUtf8( rev( utf8ToInt( xx ) ) )
} )
}
(string1 = "abcdefg")
## [1] "abcdefg"
(string2 = str_rev(string1))
## abcdefg
## "gfedcba"
(string1 = "abcdefg")
## [1] "abcdefg"
(string2 = stringi::stri_reverse(string1))
## [1] "gfedcba"
# CREATE TIMESTAMP / DATESTAMP IN ONE LINE ----
timestamp <- gsub("\\s", "-", gsub("[[:punct:]]", "-", lubridate::parse_date_time(Sys.time(), "%Y%m% d%H%M%S", truncated = 3)))
timestamp
## [1] "2021-11-18-20-47-02"
# CREATE TIMESTAMP / DATESTAMP IN MORE THAN ONE LINE ----
x <- Sys.time()
timestamp2 <- lubridate::parse_date_time(x, "%Y%m% d%H%M%S", truncated = 3)
timestamp2 <- gsub("[[:punct:]]", "-",timestamp2)
timestamp2 <- gsub("\\s", "-",timestamp2)
tic <- print(Sys.time())
## [1] "2021-11-18 20:47:02 GMT"
mean_1 = rnorm(1e+07, mean = 0, sd = 0.8)
toc <- print(Sys.time())
## [1] "2021-11-18 20:47:03 GMT"
print(toc-tic)
## Time difference of 0.9266114 secs
aggregate row values by group
DF0 <- read.table(header=T, text="
id event
1 X
1 Y
1 Z
2 W
2 A
3 A
3 C
3 Z
")
library(dplyr)
DF0 %>%
group_by(id) %>%
summarise(events = paste(event,collapse='+'))
## # A tibble: 3 × 2
## id events
## <int> <chr>
## 1 1 X+Y+Z
## 2 2 W+A
## 3 3 A+C+Z
(df01 <- data.frame(
a = c("Jim","Jane","Joe"),
b = c(1, NA, 3),
c = c(NA, 2, 4)))
## a b c
## 1 Jim 1 NA
## 2 Jane NA 2
## 3 Joe 3 4
df01[is.na(df01)] <- 0 # set NA values to be equal to 0 zero
df01
## a b c
## 1 Jim 1 0
## 2 Jane 0 2
## 3 Joe 3 4
DF
## # A tibble: 9 × 5
## V1 V2 V3 V4 V5
## <int> <int> <dbl> <chr> <chr>
## 1 1 1 0.5 A E
## 2 2 2 1 B E E
## 3 1 3 1.5 C E F
## 4 2 4 0.5 A E F
## 5 1 5 1 B eF
## 6 2 6 1.5 C Ef
## 7 1 7 0.5 A EE FF
## 8 2 8 1 B FF
## 9 1 9 1.5 C GG
DF$V5 <- gsub(" ","",as.character(DF$V5)) # remove spaces
DF
## # A tibble: 9 × 5
## V1 V2 V3 V4 V5
## <int> <int> <dbl> <chr> <chr>
## 1 1 1 0.5 A E
## 2 2 2 1 B EE
## 3 1 3 1.5 C EF
## 4 2 4 0.5 A EF
## 5 1 5 1 B eF
## 6 2 6 1.5 C Ef
## 7 1 7 0.5 A EEFF
## 8 2 8 1 B FF
## 9 1 9 1.5 C GG
(DF1 <- data.frame(stringsAsFactors=FALSE,
V1 = c(10L, 20L, 30L, 40L, 50L, 60L, 70L, 80L, 90L),
V2 = c("X", NA, NA, NA, "Y", NA, NA, "Z", NA)
))
## V1 V2
## 1 10 X
## 2 20 <NA>
## 3 30 <NA>
## 4 40 <NA>
## 5 50 Y
## 6 60 <NA>
## 7 70 <NA>
## 8 80 Z
## 9 90 <NA>
(DF2 <- transform(DF1, V2 = zoo::na.locf(V2)))
## V1 V2
## 1 10 X
## 2 20 X
## 3 30 X
## 4 40 X
## 5 50 Y
## 6 60 Y
## 7 70 Y
## 8 80 Z
## 9 90 Z
Can also use tidyr::fill
DF1 %>% fill(V2)
## V1 V2
## 1 10 X
## 2 20 X
## 3 30 X
## 4 40 X
## 5 50 Y
## 6 60 Y
## 7 70 Y
## 8 80 Z
## 9 90 Z
VEC1 <- c("a1","b2","c3","d4","e5","f6","g7") # create vector
VEC2 <- c(1,2,3,4,5,6,7,19,0) # create vector
VEC3 <- c("a", "b", "c", "d", "a", "e", "c")
VEC1[2] # second element
## [1] "b2"
VEC1[-2] # all but the second element
## [1] "a1" "c3" "d4" "e5" "f6" "g7"
VEC1[1:3] # First three elements
## [1] "a1" "b2" "c3"
VEC1[-(1:3)] # Last n elements
## [1] "d4" "e5" "f6" "g7"
VEC1[c(1,3,5)] # specific elements
## [1] "a1" "c3" "e5"
VEC1[VEC1 > "c4"] # all elements greater than something
## [1] "d4" "e5" "f6" "g7"
VEC2[VEC2 > 3] # all elements greater than something
## [1] 4 5 6 7 19
VEC2[VEC2 > 3 & VEC2 <5 ] # all elements between 3 and 5
## [1] 4
VEC1[VEC1 %in% c("a1","d4")] # elements in the given set
## [1] "a1" "d4"
which.max(VEC2) # the index position of the greatest element
## [1] 8
which.min(VEC2) # the index position of the greatest element
## [1] 9
rev(VEC2) # reverse
## [1] 0 19 7 6 5 4 3 2 1
sort(VEC2) # sort the vector
## [1] 0 1 2 3 4 5 6 7 19
rev(sort(VEC2)) # reverse sort the vector
## [1] 19 7 6 5 4 3 2 1 0
VEC3
## [1] "a" "b" "c" "d" "a" "e" "c"
duplicated(VEC3) # find duplicates
## [1] FALSE FALSE FALSE FALSE TRUE FALSE TRUE
which(duplicated(VEC3)) # show positions of duplicated items
## [1] 5 7
index <- which(duplicated(VEC3)) # create an index of positions
index # show the index
## [1] 5 7
VEC3[index] # show the duplicates
## [1] "a" "c"
VEC3[-index] # show data without duplicates
## [1] "a" "b" "c" "d" "e"
VEC4 <- VEC3[-index] # remove the duplicates
VEC4 # show the data frame again
## [1] "a" "b" "c" "d" "e"
print(class(VEC2)) # what class is vector
## [1] "numeric"
print(class(VEC4)) # what class is vector
## [1] "character"
print(length(VEC1)) # length of vector
## [1] 7
Set all elements of this vector that are less than 6 to be equal to zero.
x <- c(3, 5, 1, 10, 12, 6)
x[x < 6] <- 0
x
## [1] 0 0 0 10 12 6