pacman::p_load(
dplyr, # data cleaning
tidyverse # data management and visualization
)
# Iterate over an integer sequence and print every element.
for (i in seq_len(3)) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
# Iterate directly over a character vector that is built inline.
for (i in c("Reto", "Ben", "Lea")) {
  print(i)
}
## [1] "Reto"
## [1] "Ben"
## [1] "Lea"
# Store the integer sequence in a variable first ...
x <- seq_len(3)
# ... and then loop over the stored vector.
for (i in x) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
# character vector
participants <- c("Reto", "Ben", "Lea")
# Use the vector participants for the loop. The loop variable is `name`,
# so `name` (not a leftover `i` from an earlier loop) is what gets printed.
for (name in participants) {
  print(name)
}
## [1] "Reto"
## [1] "Ben"
## [1] "Lea"
# Pre-allocate a numeric result vector with one slot per iteration,
# then fill it with the squares of 1..10 while logging each step.
output <- numeric(10)
for (i in seq_along(output)) {
  sq <- i^2
  print(paste0("i = ", i, ": sq =", sq))
  output[[i]] <- sq
}
## [1] "i = 1: sq =1"
## [1] "i = 2: sq =4"
## [1] "i = 3: sq =9"
## [1] "i = 4: sq =16"
## [1] "i = 5: sq =25"
## [1] "i = 6: sq =36"
## [1] "i = 7: sq =49"
## [1] "i = 8: sq =64"
## [1] "i = 9: sq =81"
## [1] "i = 10: sq =100"
# Show the full built-in data set.
mtcars
# Keep only the four columns that are analysed below and show the subset.
cars <- mtcars[c("mpg", "wt", "disp", "hp")]
cars
# Print the position of each column followed by its standard deviation.
for (i in seq_along(cars)) {
  print(i)
  print(sd(cars[[i]]))
}
## [1] 1
## [1] 6.026948
## [1] 2
## [1] 0.9784574
## [1] 3
## [1] 123.9387
## [1] 4
## [1] 68.56287
# Collect the standard deviations in a named vector instead of printing them.
# The vector is created up front, named after the columns of `cars`.
v <- setNames(rep(NA, 4), names(cars))
# Variant 1: list-style extraction and assignment with [[ ]].
for (i in seq_len(4)) {
  v[[i]] <- sd(cars[[i]])
}
v
## mpg wt disp hp
## 6.0269481 0.9784574 123.9386938 68.5628685
# Variant 2: matrix-style column extraction with [, i] and vector-style [i].
for (i in seq_len(4)) {
  v[i] <- sd(cars[, i])
}
v
## mpg wt disp hp
## 6.0269481 0.9784574 123.9386938 68.5628685
# Summary table: one row per variable of `cars` (mpg, wt, disp, hp),
# one column per statistic.
v <- matrix(
  NA,
  nrow = 4,
  ncol = 3,
  dimnames = list(names(cars), c("mean", "sd", "N"))
)
# Fill one row per column of `cars`; N counts the non-missing values.
for (i in seq_len(4)) {
  v[i, c("mean", "sd", "N")] <- c(
    mean(cars[, i]),
    sd(cars[, i]),
    sum(!is.na(cars[, i]))
  )
}
v
## mean sd N
## mpg 20.09062 6.0269481 32
## wt 3.21725 0.9784574 32
## disp 230.72188 123.9386938 32
## hp 146.68750 68.5628685 32
# Because a data frame always has named columns, we can index it by name just
# as easily as by position. This is especially helpful when we want to analyze
# a few selected variables from a larger data frame (for example, mtcars).
# Selected columns from a larger data frame, addressed by name.
analysis_vars <- c("mpg", "wt", "disp", "hp")
# Loop over the column names; [[ ]] extracts each selected column as a vector.
for (i in analysis_vars) {
  print(i)
  print(sd(mtcars[[i]])) # calculate sd for selected columns only
}
## [1] "mpg"
## [1] 6.026948
## [1] "wt"
## [1] 0.9784574
## [1] "disp"
## [1] 123.9387
## [1] "hp"
## [1] 68.56287
# Same loop using matrix-style column indexing by name.
for (i in analysis_vars) {
  print(i)
  print(sd(mtcars[, i]))
}
## [1] "mpg"
## [1] 6.026948
## [1] "wt"
## [1] 0.9784574
## [1] "disp"
## [1] 123.9387
## [1] "hp"
## [1] 68.56287
# Summary table indexed by variable name: one row per selected variable,
# one column per statistic.
v <- matrix(NA, nrow = length(analysis_vars), ncol = 3)
rownames(v) <- analysis_vars
colnames(v) <- c("mean", "sd", "N")
for (i in analysis_vars) {
  v[i, "mean"] <- mean(mtcars[, i])
  v[i, "sd"] <- sd(mtcars[, i])           # standard deviation, not the mean
  v[i, "N"] <- sum(!is.na(mtcars[, i]))   # number of non-missing values
}
v
## mean sd N
## mpg 20.09062 6.0269481 32
## wt 3.21725 0.9784574 32
## disp 230.72188 123.9386938 32
## hp 146.68750 68.5628685 32
air.data <- airquality # data frame
head(air.data)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Manual calculation: one mean per column, ignoring missing values.
# Each statement must reference its own column index, and `means` must list
# each of the six results exactly once.
mean.1 <- mean(air.data[, 1], na.rm = TRUE)
mean.2 <- mean(air.data[, 2], na.rm = TRUE)
mean.3 <- mean(air.data[, 3], na.rm = TRUE)
mean.4 <- mean(air.data[, 4], na.rm = TRUE)
mean.5 <- mean(air.data[, 5], na.rm = TRUE)
mean.6 <- mean(air.data[, 6], na.rm = TRUE)
means <- c(mean.1, mean.2, mean.3, mean.4, mean.5, mean.6)
means
## [1] 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922
# For loop - numeric indices
# Standardize every column of air.data in place (center and scale).
# seq_along() is used instead of 1:ncol() so that a data frame with zero
# columns yields an empty loop; TRUE is spelled out because T can be reassigned.
for (i in seq_along(air.data)) {
  air.data[, i] <- scale(air.data[, i], scale = TRUE, center = TRUE)
}
head(air.data)
## Ozone Solar.R Wind Temp Month Day
## 1 -0.03423409 0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388 0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977 1.41095624 0.4378323 -1.6779609 -1.407294 -1.331592
## 5 NA NA 1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817 NA 1.4029185 -1.2553634 -1.407294 -1.105973
# For loop - numeric indices using [[]] - The "double square brackets" in R can be used to reference "data frame columns"
# Work on a fresh copy of the raw data.
air.data2 <- airquality
# The loop bound is taken from air.data2 itself, the object being modified,
# so the loop does not depend on another data frame having the same width.
for (i in seq_along(air.data2)) {
  air.data2[[i]] <- scale(air.data2[[i]], scale = TRUE, center = TRUE)
}
head(air.data2)
## Ozone Solar.R Wind Temp Month Day
## 1 -0.03423409 0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388 0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977 1.41095624 0.4378323 -1.6779609 -1.407294 -1.331592
## 5 NA NA 1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817 NA 1.4029185 -1.2553634 -1.407294 -1.105973
# For loop - character indices
# Standardize every column of air.data, addressing the columns by name.
# NOTE: when the script is run top to bottom, air.data has already been
# standardized by the numeric-index loop, so this pass leaves the values
# effectively unchanged (the printed output is the same as before).
for (var in names(air.data)) {
  air.data[, var] <- scale(air.data[, var], scale = TRUE, center = TRUE)
}
head(air.data)
## Ozone Solar.R Wind Temp Month Day
## 1 -0.03423409 0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388 0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977 1.41095624 0.4378323 -1.6779609 -1.407294 -1.331592
## 5 NA NA 1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817 NA 1.4029185 -1.2553634 -1.407294 -1.105973
# Simulate 7 respondents x 5 items with values 1-5 and some missing answers.
set.seed(20210205)
q <- as.data.frame(matrix(sample(c(1:5, NA), 35, replace = TRUE), ncol = 5))
q
## V1 V2 V3 V4 V5
## 1 4 2 3 1 3
## 2 5 3 1 NA 5
## 3 NA 3 5 3 NA
## 4 5 5 3 2 5
## 5 3 5 2 NA NA
## 6 5 5 1 NA 1
## 7 NA 1 3 5 1
# Row-wise scale score: mean of the non-missing items of each respondent.
# The result vector is pre-allocated as double and named after the rows of q.
Vscale <- rep(NA_real_, nrow(q))
names(Vscale) <- row.names(q)
# seq_len() gives an empty loop for a data frame with zero rows,
# whereas 1:nrow(q) would iterate over c(1, 0).
for (i in seq_len(nrow(q))) {
  v <- as.matrix(q[i, ]) # one row as a 1 x ncol matrix so mean() sees all items
  Vscale[i] <- mean(v, na.rm = TRUE)
}
Vscale
## 1 2 3 4 5 6 7
## 2.600000 3.500000 3.666667 4.000000 3.333333 3.000000 2.500000
# Read the semicolon-delimited example data from the web.
tb <- readr::read_csv2("http://rpository.com/ds4psy/data/tb.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 100 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (5): id, age, height, shoesize, IQ
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the size and the first rows of the imported table.
dim(tb)
## [1] 100 5
head(tb)
## # A tibble: 6 × 5
## id age height shoesize IQ
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 21 173 38 89
## 2 2 26 193 43 93
## 3 3 24 171 41 92
## 4 4 32 191 43 97
## 5 5 26 156 36 110
## 6 6 28 172 34 117
# Drop the first column (id) so that only the measured variables remain.
tb_2 <- dplyr::select(tb, -1)
tb_2
## # A tibble: 100 × 4
## age height shoesize IQ
## <dbl> <dbl> <dbl> <dbl>
## 1 21 173 38 89
## 2 26 193 43 93
## 3 24 171 41 92
## 4 32 191 43 97
## 5 26 156 36 110
## 6 28 172 34 117
## 7 20 166 35 107
## 8 31 172 34 110
## 9 18 192 32 88
## 10 22 176 39 111
## # … with 90 more rows
# Column means of tb_2, computed with several equivalent loop set-ups.
# Variant 1: number of columns hard-coded.
output <- vector("double", 4) # 4 column
for (i in 1:4) {
  mn <- mean(tb_2[[i]])
  output[[i]] <- mn
}
output
## [1] 26.29 177.78 39.05 104.85
# Variant 2: pre-allocation length derived from the same hard-coded range.
output <- vector("double", length(1:4)) # 4 column
for (i in 1:4) {
  mn <- mean(tb_2[[i]])
  output[[i]] <- mn
}
output
## [1] 26.29 177.78 39.05 104.85
# Variant 3: both the pre-allocation and the loop bound come from ncol(tb_2),
# so they cannot disagree if the number of columns changes.
output <- vector("double", ncol(tb_2))
for (i in seq_len(ncol(tb_2))) {
  mn <- mean(tb_2[[i]])
  output[[i]] <- mn
}
output
## [1] 26.29 177.78 39.05 104.85
# Variant 4: seq_along() yields the integer positions of the columns.
output_2 <- vector("double", length(tb_2))
for (i in seq_along(tb_2)) { # loop through COLUMNS of a df/table, return integer
  mn <- mean(tb_2[[i]])
  output_2[[i]] <- mn
}
output_2
## [1] 26.29 177.78 39.05 104.85
# Variant 5: same result via the column count.
output_2 <- vector("double", ncol(tb_2))
for (i in seq_len(ncol(tb_2))) {
  mn <- mean(tb_2[[i]])
  output_2[[i]] <- mn
}
output_2
## [1] 26.29 177.78 39.05 104.85
# Same column means taken from the full table tb: its columns 2 to 5 are the
# measured variables, so position i in tb maps to slot i - 1 of the result.
output_4 <- numeric(4)
for (i in 2:5) {
  mn <- mean(tb[[i]])
  output_4[[i - 1]] <- mn
}
output_4
## [1] 26.29 177.78 39.05 104.85
# Check that both approaches agree.
all.equal(output, output_4)
## [1] TRUE
# Five-number summary plus mean, one variable at a time.
summary(tb[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 22.00 24.50 26.29 30.25 46.00
summary(tb[["height"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 150.0 171.8 177.0 177.8 185.0 206.0
summary(tb[["shoesize"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 36.00 39.00 39.05 42.00 47.00
# Store one summary and inspect the storage type of the stored object.
s <- summary(tb[["IQ"]])
typeof(s)
## [1] "double"
head(tb) # tibble
## # A tibble: 6 × 5
## id age height shoesize IQ
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 21 173 38 89
## 2 2 26 193 43 93
## 3 3 24 171 41 92
## 4 4 32 191 43 97
## 5 5 26 156 36 110
## 6 6 28 172 34 117
# Collect the summaries of columns 2 to 5 of tb in a list with one slot per
# variable; a list is used because each summary is itself a vector.
summarise <- vector("list", 4) # 4 columns
for (i in 2:5) {
  sm <- summary(tb[[i]])
  summarise[[i - 1]] <- sm # column i of tb goes into slot i - 1
}
summarise
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 22.00 24.50 26.29 30.25 46.00
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 150.0 171.8 177.0 177.8 185.0 206.0
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 36.00 39.00 39.05 42.00 47.00
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85.0 97.0 104.0 104.8 113.0 145.0
# Build the matrix column by column, then label the rows with the persons.
x <- cbind(
  Age = c(28, 35, 13, 13),
  Size = c(1.62, 1.53, 1.83, 1.71),
  Weight = c(65, 59, 72, 83)
)
rownames(x) <- c("Veronica", "Karl", "Miriam", "Peter")
x
## Age Size Weight
## Veronica 28 1.62 65
## Karl 35 1.53 59
## Miriam 13 1.83 72
## Peter 13 1.71 83
# Nested loop: outer loop over persons (rows), inner loop over attributes
# (columns); each cell is addressed by its row and column name.
for (rname in rownames(x)) {
  for (cname in colnames(x)) {
    print(paste("The", cname, "of", rname, "is", x[rname, cname]))
  }
}
## [1] "The Age of Veronica is 28"
## [1] "The Size of Veronica is 1.62"
## [1] "The Weight of Veronica is 65"
## [1] "The Age of Karl is 35"
## [1] "The Size of Karl is 1.53"
## [1] "The Weight of Karl is 59"
## [1] "The Age of Miriam is 13"
## [1] "The Size of Miriam is 1.83"
## [1] "The Weight of Miriam is 72"
## [1] "The Age of Peter is 13"
## [1] "The Size of Peter is 1.71"
## [1] "The Weight of Peter is 83"
# Single loop over the attributes: report the average of each column.
for (var in colnames(x)) {
  m <- mean(x[, var])
  print(paste("Average", var, "is", m))
}
## [1] "Average Age is 22.25"
## [1] "Average Size is 1.6725"
## [1] "Average Weight is 69.75"
# Individual values: Individual values can be collected in a named
# vector by combining the functions c() and setNames().
# setNames(object, nm) attaches the names nm to object and returns the named
# object; here the integers 1..5 are labelled with the letters a..e.
x <- seq_len(5)
x_names <- setNames(x, letters[seq_len(5)])
x_names
## a b c d e
## 1 2 3 4 5
# For loop: collect one named value per column of air.data.
# Each iteration appends the column mean (missing values removed), labelled
# with the column name via setNames(), to the growing vector mu.
mu <- NULL
for (var in names(air.data)) {
  mu <- c(mu, setNames(mean(air.data[, var], na.rm = TRUE), var))
}
mu
## Ozone Solar.R Wind Temp Month
## -1.754632e-17 -1.763889e-17 -2.275248e-17 4.988747e-18 -2.815949e-17
## Day
## -1.585911e-17
# Vectors: Vectors of the same length can be collected in a data.frame.
# Collect one standardized vector per column of air.data by binding each new
# column onto the result; starting from NULL lets the first cbind() create it.
scaled_data <- NULL
for (var in names(air.data)) {
  scaled_data <- cbind(
    scaled_data,
    scale(air.data[, var], scale = TRUE, center = TRUE)
  )
}
# convert to data.frame and name
scaled_data <- as.data.frame(scaled_data) %>% setNames(names(air.data))
head(scaled_data)
## Ozone Solar.R Wind Temp Month Day
## 1 -0.03423409 0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388 0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977 1.41095624 0.4378323 -1.6779609 -1.407294 -1.331592
## 5 NA NA 1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817 NA 1.4029185 -1.2553634 -1.407294 -1.105973
# Create an empty 5 x 5 matrix (5 rows and 5 columns) and fill it with a
# nested loop: each cell receives the product of its row and column index.
mat <- matrix(nrow = 5, ncol = 5)
for (i in seq_len(nrow(mat))) { # for each row
  for (j in seq_len(ncol(mat))) { # for each column
    mat[i, j] <- i * j # assign values based on position: product of two indexes
  }
}
# Draw a histogram of the shoesize column; the call is made for its plot only.
hist(tb_2$shoesize)
# hist() also returns an object describing the histogram; keep it in h.
h<-hist(tb$IQ)
# Printing h lists its components (breaks, counts, density, mids, ...).
# Note that xname holds the argument expression as it was written in the call.
h
## $breaks
## [1] 80 90 100 110 120 130 140 150
##
## $counts
## [1] 12 29 29 20 8 1 1
##
## $density
## [1] 0.012 0.029 0.029 0.020 0.008 0.001 0.001
##
## $mids
## [1] 85 95 105 115 125 135 145
##
## $xname
## [1] "tb$IQ"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Show the table whose columns are plotted below.
tb_2
## # A tibble: 100 × 4
## age height shoesize IQ
## <dbl> <dbl> <dbl> <dbl>
## 1 21 173 38 89
## 2 26 193 43 93
## 3 24 171 41 92
## 4 32 191 43 97
## 5 26 156 36 110
## 6 28 172 34 117
## 7 20 166 35 107
## 8 31 172 34 110
## 9 18 192 32 88
## 10 22 176 39 111
## # … with 90 more rows
# Fill colour: the fourth entry of the ds4psy colour palette.
my_blue <- ds4psy::pal_ds4psy[[4]]
# Histogram for 1 column
# The title needs a leading space before "values:"; without it the column
# name and the word run together.
hist(tb_2[[1]], col = my_blue,
     main = paste0("Histogram of ", names(tb_2)[1], " values:"),
     xlab = names(tb_2)[1])
# For loop: draw one histogram per column of tb_2 and keep every returned
# histogram object in a list so that it can be re-plotted later.
out <- vector(mode = "list", length = 4) # output of loop
for (i in seq_along(tb_2)) { # loop through COLUMNS of tb_2:
  print(i)
  var_name <- names(tb_2)[i]
  title <- paste0("Histogram of ", var_name, " values:")
  x_lab <- var_name
  out[[i]] <- hist(tb_2[[i]], col = my_blue, main = title, xlab = x_lab)
} # end for.
## [1] 1
## [1] 2
## [1] 3
## [1] 4
# Re-draw the stored histogram of the second column with another colour.
plot(out[[2]], col = "gold")
# Reproducible example table: four columns of 10 standard-normal draws each.
# Note that this replaces the tb object that was read from the CSV file.
set.seed(1)
tb <- tibble::tibble(
  a = rnorm(10), # normal distribution
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
tb
## # A tibble: 10 × 4
## a b c d
## <dbl> <dbl> <dbl> <dbl>
## 1 -0.626 1.51 0.919 1.36
## 2 0.184 0.390 0.782 -0.103
## 3 -0.836 -0.621 0.0746 0.388
## 4 1.60 -2.21 -1.99 -0.0538
## 5 0.330 1.12 0.620 -1.38
## 6 -0.820 -0.0449 -0.0561 -0.415
## 7 0.487 -0.0162 -0.156 -0.394
## 8 0.738 0.944 -1.47 -0.0593
## 9 0.576 0.821 -0.478 1.10
## 10 -0.305 0.594 0.418 0.763
# Linearly rescale a numeric vector so that its smallest non-missing value
# maps to 0 and its largest to 1.
#   x: numeric vector; missing values are ignored when the range is
#      determined and stay missing in the result.
# Returns a numeric vector of the same length as x.
rescale01 <- function(x) {
  bounds <- range(x, na.rm = TRUE)
  lower <- bounds[1]
  upper <- bounds[2]
  (x - lower) / (upper - lower)
}
# Manual approach: rescale each column of a copy of tb with its own statement.
df <- tb
df[["a"]] <- rescale01(df[["a"]])
df[["b"]] <- rescale01(df[["b"]])
df[["c"]] <- rescale01(df[["c"]])
df[["d"]] <- rescale01(df[["d"]])
df
## # A tibble: 10 × 4
## a b c d
## <dbl> <dbl> <dbl> <dbl>
## 1 0.0860 1 1 1
## 2 0.419 0.699 0.953 0.466
## 3 0 0.428 0.710 0.645
## 4 1 0 0 0.484
## 5 0.479 0.896 0.897 0
## 6 0.00624 0.582 0.665 0.352
## 7 0.544 0.590 0.630 0.359
## 8 0.647 0.848 0.178 0.482
## 9 0.581 0.815 0.520 0.905
## 10 0.218 0.754 0.828 0.782
# Loop approach: the same transformation applied to every column position
# of a second copy, replacing each column in place.
df2 <- tb
for (i in seq_along(df2)) {
  df2[[i]] <- rescale01(df2[[i]])
}
df2
## # A tibble: 10 × 4
## a b c d
## <dbl> <dbl> <dbl> <dbl>
## 1 0.0860 1 1 1
## 2 0.419 0.699 0.953 0.466
## 3 0 0.428 0.710 0.645
## 4 1 0 0 0.484
## 5 0.479 0.896 0.897 0
## 6 0.00624 0.582 0.665 0.352
## 7 0.544 0.590 0.630 0.359
## 8 0.647 0.848 0.178 0.482
## 9 0.581 0.815 0.520 0.905
## 10 0.218 0.754 0.828 0.782
# Verify equality:
all.equal(df, df2)
## [1] TRUE