#FUNCTIONS

#DEMO 1 FUNCTIONS FOR DATA EXPLORATION

#' ## Dataset
#' We will explore the datasets `swiss` and `esoph` (both automatically available in R).
#'

Object type

# ------------------------------------------------------------------------------
#' ## What type of object do I have?
class(swiss)

How large is a dataset? Dimensions

#' ## How large is this dataset?
#' Dimension of a `data.frame` or a `matrix`:
dim(swiss)

#' We can also get the rows and columns separately:
nrow(swiss)
ncol(swiss)

#' Be careful:
length(swiss)

#' Why is that?
#' Because a `data.frame` is also a `list`:
is.list(swiss)
is.data.frame(swiss)

#' but:
length(swiss$Fertility)

Structure, head and names - how does the dataset look?

#' ## How does the dataset look like?
#' Names of all the variables in the data:
names(swiss)

#' Show me the first and last few rows:
head(swiss) #show the first rows
tail(swiss) #show the last rows

#' We can adjust how many rows are shown:
head(swiss, n = 10)

# Structure of the dataset:
str(swiss) #show whether it is numeric or not, show the first few observations
#' The function `str` has many arguments to customize the output, 
#' but we will skip that here.

Descriptive statistics

# ------------------------------------------------------------------------------
#' ## Descriptive Statistics
#' ### Summary of a `data.frame` or `matrix`
summary(esoph) #provides 'table', you know how many people are within a group



#' ### Summaries per variable
#' We can also get the summary of a single variable
summary(swiss$Fertility)

#' or you can do all the work yourself:
min(swiss$Fertility) #minimum
max(swiss$Fertility) #maximum
range(swiss$Fertility) #both minimum and maximum
mean(swiss$Fertility) #mean
median(swiss$Fertility) #median
quantile(swiss$Fertility, probs = c(0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99)) #observation at a certain probability

#' Inter quartile range:
IQR(swiss$Fertility) #number at .25 and .75

#' Standard deviation
sd(swiss$Fertility) #standard deviation

#' Variance
var(swiss$Fertility) #variance

#' All the above functions (`min`, `max`, `range`, `mean`, `median`, `quantile`,
#' `sd`, `var`) have an argument `na.rm` which is by default set to `FALSE`.
# this means all summaries are calculated based on no missings. If you have a missing, you will get an error (NA). By setting it to TRUE, you will be able to calculate the summary based on all cases without missings. 
(x <- c(rnorm(5), NA))
min(x)
min(x, na.rm = TRUE)


#' Another helpful function to summarize continuous data is `ave()`.
#' It calculates a summary measure of a continuous variable per group, that are
#' defined by one or more categorical variables:
esoph$av_case_by_age <- ave(esoph$ncases, esoph$agegp) #calculate for summary for continous variables per categorical variables
esoph[12:22, ]

#' `ave()` also works with other functions than the mean:
esoph$med_case_by_age <- ave(esoph$ncases, esoph$agegp, FUN = median)
esoph[28:36, ]

#' And we can split the data by multiple categorical variables:
ave(esoph$ncases, esoph$agegp, esoph$alcgp, FUN = median)

Tables

# ------------------------------------------------------------------------------
#' ## Tables
#' The above summaries were all for continuous variables. 
#' For categorical variables we may be interested to see the different categories
#' and how many observations there are per category:
levels(esoph$agegp) #which levels do we have
table(esoph$agegp) #number of observations per group

#' By default, table only shows observations.
#' The argument `exclude` can be set to `NULL` to also include missing values:
(x <- factor(c(NA, sample(LETTERS[1:3], size = 10, replace = TRUE)),
             levels = LETTERS[1:5]))
table(x) #table excludes your NAs automatically, you don't want that!
table(x, exclude = NULL)

#' We can also get tables for multiple variables.
table(age = esoph$agegp, alc = esoph$alcgp)




#' ### Tables: Margins
#' To add summaries (e.g. the sum) for each column and/or row:
tab <- table(age = esoph$agegp, alc = esoph$alcgp)
addmargins(tab)
addmargins(tab, margin = 1) #summarizes over rows (so technically the columns)
addmargins(tab, margin = 2, FUN = mean) #gives the means over columns (so technically the row mean)

#' It is also possible to use different functions per margin:
addmargins(tab, FUN = c(mean, sum))

#' ### Tables: Proportions
#' To convert the table to proportions:
prop.table(tab) #relative to whole population

#' Here, the sum over all cells is 1:
sum(prop.table(tab)) #relative to both grams a day and age group

#' The argument `margin` allows us to get proportions relative to the row- or 
#' column sum:
prop.table(tab, margin = 1)

#' In the above table, the sum in each row is equal to 1:
addmargins(prop.table(tab, margin = 1)) # sum is equal to one, because it is relative to the population within a age group -> if you change margin to 2, you will get probabilities relative to grams a day



#' ### Tables: More Dimensions
#' It is possible to get tables with > 2 dimensions:
table(esoph[, 1:3]) # same as table(esoph$agegp, esoph$alcgp, esoph$tobgp)

#' In that case a "flat table" can be more clear:
ftable(table(esoph[, 1:3])) #threedimensional tables will be converted to a long twodimensional table
#' With the help of the arguments `row.vars` and `col.vars` we can determine 
#' which variables are given in the rows and which in the columns:
ftable(table(esoph[, 1:3]), row.vars = c(1)) #where do you want your columns aka variable? the row var should be the first var; remaining should go on the columns side
ftable(table(esoph[, 1:3]), row.vars = c(3, 2)) #now you ask r to put the third var first, than the second on the row var

Functions for matrices

# ------------------------------------------------------------------------------
#' ## Functions for matrices
#' ### Sums and Means
#' The function `colMeans()` allows us to calculate the mean for each column 
#' in a `matrix` or `data.frame`:
colMeans(swiss)

#' We can't use `colMeans()` on the `esoph` data because there not all variables 
#' are numeric:
#+ error = TRUE
colMeans(esoph)

#' The functions `colSums()`, `rowMeans()` and `rowSums()` work correspondingly,
#' but are usually less useful to summarize a whole dataset.
colSums(swiss)
rowMeans(swiss)

#' To use other functions (like `min`, `max`, `median`, `sd`, ...) on a whole
#' `matrix` or `data.frame` we need some
#' more programming and the help of  `if (...)`, `for (...)` and/or `apply()`
#' (will be covered later).
#' 



#' ### Variance, Covariance and Correlation
#' The functions `var`, and `cov` return the variance-covariance matrix when 
#' used on a `matrix` or `data.frame`:
var(swiss) #is the variance/covariance matrix 

#' This of course only gives meaningful resuls when the data are continuous:
#+ error = TRUE
cov(esoph) #with factors, you are not able to compute this matrix 

#' When there are missing values in the data:
#' Specify the argument `use = "pairs"` to exclude missing values (which would 
#' otherwise result in a `NA` value for the (co)variance)

#' A (co)variance matrix can be converted to a (pearson) correlation matrix with the help
#' of the function `cov2cor()`:
cov2cor(var(swiss)) #convert cov matrix to correlation

#' The correlation matrix can be obtained directly using `cor()`. The argument 
#' `method` allows the choice of pearson", "kendall" or "spearman" correlation.
cor(swiss, method = 'kendall') #directly measure correlation; you can specify which type of correlation

Duplicates and comparison

# ------------------------------------------------------------------------------
#' ## Duplicates and Comparison
#' To find duplicates in a `data.frame`, `matrix` or a `vector` we can use the 
#' function `duplicated()`:
duplicated(esoph) #are there duplicates in the ROWS, so it checks the rows!

(x <- sample(LETTERS[1:5], 10, replace = TRUE))
duplicated(x) #which are duplicates in this row?

#' Let's set the original variable and the duplication indicator next to each 
#' other to see what is happening:
cbind(x, duplicated(x)) #bind the variable to whether it is duplicated or not; it checks for exactly identical cases! so if one var is still different, it will not come back as a duplicate. 

#' (We will get to know the function `cbind()` later).

#' Using the argument `fromLast = TRUE` checks for duplicates starting from
#' the last value:
cbind(x,
      duplFirst = duplicated(x),
      duplLast = duplicated(x, fromLast = TRUE))
#so, you will check up -> bottom and the other way around

#' Return only the unique values:
unique(x) #second dupli is removed

#' This also works for `data.frame` and `matrix`.
(dat <- data.frame(x = x,
                  y = rbinom(length(x), size = 1, prob = 0.5))
)
unique(dat) #you remove duplicates!

(mat <- as.matrix(dat))
unique(mat) #also works for a matrix

#' With the function `data.frame()` we create a `data.frame` and the function
#' `as.matrix()` allows us to convert an object to a `matrix`.

PRACTICAL 1: EXPLORING AND SUMMARIZING DATA

# Open packages
library(survival)
lung <- survival::lung # overwrites lung dataset thats in your work space; if you 'delete a variable' because you have overwritten it, run this line!

#----------------------------------------
# GETTING TO KNOW THE DATA

#   Data dimensions

# Find out the class and dimension of the lung data.
class(lung)
dim(lung)
nrow(lung)
ncol(lung)

# Now investigate the structure of the data.
# - Which variables do the data contain?
names(lung)
head(lung)

# - What types of variables are these?
str(lung)

# - Are there missing values?
is.na(lung)
summary(lung)

#   Data structure

# All variables are coded as numeric.
# - Do you think some of them should be factors?
#Looking at the output of str(lung) we can see that status, sex and  ph.ecog may only have values 0, 1 and 2. Also inst, ph.karno, pat.karno, and wt.loss could be categorical.

# - Explore how many distinct values there are in those variables that may not actually be continuous.
#To confirm that status, sex and ph.ecog only have very few levels, unique() or  table() can be used:
unique(lung$status)
table(lung$sex, exclude = NULL)
table(lung$ph.ecog, exclude = NULL)

#To prevent possibly very lengthy output for the other variables (if they have many different values) we could first check how many different values there are:
length(unique(lung$inst))
length(unique(lung$ph.karno))
length(unique(lung$pat.karno))
length(unique(lung$wt.loss))
#We decide that they all should remain continuous variables (although for variables with few different values, like ph.karno or pat.karno, it may often not be appropriate to treat them as continuous).

#----------------------------------------
# RECODING

#   Turning continuous values into factors

# - Re-code status, sex and ph.ecog as factors.
# For ph.ecog, just convert to a factor
lung$ph.ecog <- factor(lung$ph.ecog)

# Confirm the class, either with

# - Use meaningful factor labels for status (1 = censored, 2 = dead) and  sex (1 = male, 2 = female).
# For the other two, use levels and labels
lung$sex <- factor(lung$sex, levels = c(1, 2), labels = c('male', 'female'))
lung$status <- factor(lung$status, levels = c(1, 2), labels = c('censored', 'dead'))

# - Check that the resulting variables are indeed of class factor and that they have the correct levels.
class(lung$ph.ecog)
class(lung$sex)
class(lung$ph.ecog)

str(lung)
levels(lung$ph.ecog)
table(lung$ph.ecog)
table(lung$sex)
table(lung$status)

#----------------------------------------
# DESCRIPTIVE STATISTICS

#  Summary

# Get the summary of the lung data
summary(lung)

# - Calculate each of the values of this summary for the variables  ph.karno and ph.ecog "by hand" (i.e., using other functions).
min(lung$ph.karno, na.rm=T)
max(lung$ph.karno, na.rm=T)
quantile(lung$ph.karno, c(.25, .5, .75), na.rm=T) #.5 = mean
median(lung$ph.karno, na.rm=T)
mean(lung$ph.karno, na.rm=T)
table(is.na(lung$ph.karno))

table(lung$ph.ecog, exclude = NULL)

# - Also calculate the inter-quartile range for ph.karno.
IQR(lung$ph.karno, na.rm = T)

#  Variance and Standard Deviation

# - Calculate the variance of meal.cal.
varmc <- var(lung$meal.ca, na.rm = T)

# - Calculate the standard deviation of meal.cal.
sdmc<- sd(lung$meal.ca, na.rm = T)

# - Check that the standard deviation is indeed the square root of the variance.
sqrt(varmc) == sdmc
sqrt(varmc)
sdmc

# - Check that the variance is indeed the square of the standard deviation.
round(varmc,3) == round(sdmc^2,3)
varmc
sdmc^2

#  Tables

# - Get a table of sex and status.
tab<-table(lung$status, lung$sex)
tab

# - Also calculate the probabilities for each combination:
# - relative to the total number of subjects
prop.table(tab)
# - relative to sex
prop.table(tab, margin = 2)

#Make sure choose the correct margin! When sex is in the rows, you need margin = 1. Always check that the table shows the correct numbers by roughly adding up the proportions in your head.

# - Get a table of sex and ph.ecog per status.
tab <- table(lung$sex, lung$ph.ecog, lung$status, exclude = NULL) #with status LAST, you get table PER STATUS
tab

# - Experiment with the effect the order of the variables in table() has.

# The first variable specifies the rows, the second the columns, and the 
# third variables the third dimension:
table(lung$ph.ecog, lung$status, lung$sex, exclude = NULL)

# - Convert this 3-dimensional table into a flat table.
ftable(tab, exclude = NULL)

DEMO 2 FUNCTION FOR DATA MANIPULATION

#+  echo = FALSE
# This is not part of the demo. 
# It just allows the output to be wider (to make the html look nicer)
options(width = 105)


#' ## Dataset
#' We will work with the datasets `swiss` and `esoph` (both automatically available in R).
#'

Transformations

#' ## Transformations
#' ### The logarithm
#' In R, `log()` by default calculates the natural logarithm, i.e., the following
#' two commands are equal:
log(swiss$Fertility) # if log is not specified, its a natural one
log(swiss$Fertility, base = exp(1))

#' For other commonly used bases we can use separate functions, e.g. `log2()` or
#' `log10()`.
#' The argument `base` allows us to change the base of the logarithm to any value:
log(swiss$Fertility, base = 2)
log2(swiss$Fertility)

log(swiss$Fertility, base = 10)
log10(swiss$Fertility)
#log10 = log base 10


#' ### Other transformations

#' Exponential function
exp(swiss$Fertility)

#' Square root
sqrt(swiss$Fertility)
#log of zero is -inf, but sqrt of 0 stays zero -> so sqrt is better at handling 0

#' Absolute value
x <- rnorm(10)
x
abs(x) #remove the minus!

#' Distribution function of the logistic distribution:
#' this function is for instance useful to convert the linear predictor of a 
#' logistic model to the probability scale.
plogis(x) # if you need to convert something from any range to a value between 0 and 1

Splitting & Combining

#' ## Splitting & Combining

#' ### Dividing a (continuous) variable into a `factor`
#' The function `cut()` allows us to convert a numeric variable to a factor.
#' The arguments `breaks` is used to specify the cutoffs for the
#' categories.
(x <- rnorm(20))
cut(x, breaks = c(-1, 0, 1)) #breaks is the interval
#it wil not include -1, but will include -.999999 (etc)
#only cuts at the break, other vars become na. 
#' Note that values outside the smallest and largest "break" are set to `NA`.
#' To prevent that we can include `-Inf` and `Inf`:
cut(x, breaks = c(-Inf, -1, 0, 1, Inf))

#' By default, the lower bound of each interval is excluded, the upper bound is included.
#' We can include the lowest bound by setting `include.lowest = TRUE`.
cut(x, breaks = c(-Inf, -1, 0, 1, Inf), include.lowest = TRUE)

#' Note that this only changes the lower bound of the lowest interval.
#' The argument `right` specifies that the right bound of each interval is included,
#' this chan be changed by setting `right = FALSE`.
#' By default, the resulting factor is unordered. With the argument 
#' `ordered_result = TRUE` we can change this. (More on ordered factors later.)
#' 
#' To set custom labels for the categories, the argument `labels` can be used:
cut(x, breaks = c(-Inf, -1, 0, 1, Inf), labels = c('lowest', 'low', 'high', 'highest'))
#' 
 


#' ### Splitting a `data.frame`, `matrix` or `vector`
#' The function `split()` splits a `data.frame`, `matrix` or `vector` by one 
#' or more categorical variables:
split(swiss, f = swiss$Education > 10)
#' This creates a list with one element per category of `f`.
 
#' When the splitting factor has more categories, the list has more elements:
split(swiss, f = cut(swiss$Education, c(0, 5, 10, 15, 20)))
#' Note that cases with Education > 20 are now excluded (because we set the highest
#' breakpoint in `cut()` to 20).

#' To include the "category" `NA`, we can use the function `addNA()`:
split(swiss, f = addNA(cut(swiss$Education, c(0, 5, 10, 15, 20))))
# outside of the breaks you predefined -> NA, you will miss them in your splitted data set.With addna you add a misssing values categories
# this also includes the nas you would already have!

#' The list elements will always have the same class as the original object,
#' i.e., when we split a vector, we obtain a list of vectors:
split(x, x > 0)

# so split can basicly split all data set, as long as you define how it should be splitted (as a factor, so either define cut off within continous or give a factor)

#' ### Combining `vectors` etc.
#' The function `c()` allows us to combine values into a `vector` or a `list`,
#' the functions `cbind()` and `rbind()` combine objects (usually vectors,
#' matrices or data.frames) by column or row, respectively.
(x <- 1:5)
(y <- 3:7)
c(x, y)
cbind(x, y)
rbind(x, y)

#+ error = TRUE
(X <- matrix(nrow = 3, ncol = 2, data = LETTERS[1:6]))
(Y <- matrix(nrow = 4, ncol = 2, data = LETTERS[10:17]))

rbind(X,Y) # put it underneath, it will be combined at rows
cbind(X,Y) # combine it as columns: will not work! three rows x for rows. 
#' When combining matrices or data.frames, the dimensions must match.

#' When combining vectors, the shorter object is repeated up to the length of
#' the longer vector:
(z <- 1:9)
cbind(x, z) # X JUST GET REPEATED! you get a warning, but when x is a multiple of z, you dont even get a warning

# so binding uneven matrices wont work, but combining uneven lists will work - sometimes without warning!

#' When combining lists, behaviour depends on whether both elements are lists:
(list1 <- list(a = 4, b = c(1, 4, 6)))
c(list1, LETTERS[4:7])
c(list1, list(LETTERS[4:7]))


#' ### Combining strings
#' The function `paste()` (and it's special case `paste0()`) 
#' allows us to combine objects into strings:
paste0("The mean of x is ", mean(x), ".")

#' `paste()` has arguments `sep` and `collapse` that control how the different
#' objects and elements of the objects are combined:
paste("This", "is", "a", "sentence.", sep = " +++ ") #different objects will be seperated by whatever you put in this argument
paste(c("This", "is", "a", "sentence."), collapse = " +++ ") #work the same, but pay attention at where you put the argument!



#' ### Getting a subset of a `data.frame`
#' The function `subset()` helps us to get a subset of a `data.frame`.
#' Its arguments `subset` and `select` are used to specify which cases and which
#' variables should be selected:
subset(swiss, 
       subset = Education > 15,
       select = c(Fertility, Education, Infant.Mortality))
#' Note that here we can use the variable names without quotes.
#' 


#' ### Merging data
#' The function `merge()` allows us to merge two datasets.
# Create two datasets
dat1 <- swiss
dat1$id <- rownames(swiss)

dat2 <- data.frame(id = c(paste0('newid', 1:5), rownames(swiss)[1:30]),
                   x = rnorm(35))
# random number so there is something to merge

head(dat1)
head(dat2)

mdat <- merge(dat1, dat2)
head(mdat)
dim(mdat)
#you lose row which are not in both datasets

#' The arguments `all`, `all.x` and `all.y` allow us to specify what happens with
#' cases that are only found in one of the two datasets:
mdat_all <- merge(dat1, dat2, all = TRUE)
mdat_x <- merge(dat1, dat2, all.x = TRUE)
mdat_y <- merge(dat1, dat2, all.y = TRUE)

dim(mdat_all)
head(mdat_all)

dim(mdat_x)
dim(mdat_y)

#' By default, `merge()` will take all identical column names to merge by.
#' Arguments `by.x` and `by.y` allow us to specify the names of variables
#' in each of the datasets to use for merging. This is also possible when 
#' variable names differ:
dat2$z <- sample(1:10, size = nrow(dat2), replace = TRUE) # we add a new variable to the data
dat2$Examination <- dat1$Examination[match(dat2$id, dat1$id)]

mdat3 <- merge(dat1, dat2, by.x = c('id', 'Education'), by.y = c('id', 'z'),
               all = TRUE)

head(mdat3) 
#' * We now have two rows per `id` (for most `id`s), because the values in the
#'   merging variable `Education` (and `z`) differed between `dat1` and `dat2`.
#' * The variable `Examination`, which existed in both datasets, got the suffix
#'   `.x` and `.y` in is now duplicated. The suffix can be changed using the
#'   argument `suffixes`.

#' The function `match()` returns the positions of the (first) matches of its
#' first argument in its second argument.
(a <- c('G', 'A', 'D', 'B', 'Z'))
(b <- LETTERS[1:8])
match(a, b)

Repetition and Secquences

#' ## Repetition and Sequences
#' The function `rep()` replicates elements of a `vector` or a `list`.
rep(c('A', 'B'), 4)
rep(c('A', 'B'), each = 4)
rep(c('A', 'B'), c(2, 4))

rep(list(a = 4, s = "This is a string.", b = c('A', 'B', 'C')), 
    c(1, 3, 1))

#' The function `seq()` generates a sequence:
seq(from = 2, to = 5, by = 1)
seq(from = 2, to = 5, by = 0.5)
seq(from = 2, to = 5, length = 8)

a
seq_along(a) # just counts the elements 

#' The function `expand.grid()` creates a `data.frame` with all combinations of
#' the supplied variables:
expand.grid(x = c(1, 2, 3),
            a = c('a', 'b'))
#gives all possible combinations of x and a

Transforming objects

#' ## Transforming objects
#' The function `t()` transposes a `matrix` or a `data.frame`:
(M <- matrix(nrow = 3, ncol = 2, data = 1:6))
t(M)
#' A `data.frame` is first converted to a `matrix`, then transposed.
#' A `vector` is seen as a column vector, i.e., transposing it will result in a 
#' `matrix` with one row:
(x <- c(1, 2, 3))
t(x)
t(t(x))


#' `unlist()` flattens lists:
(mylist <- list(a = c(2, 5, 1),
                b = list(name = 'Otto', age = 54, height = 182),
                c = matrix(nrow = 3, ncol = 2, data = 1:6)))

unlist(mylist)

(otherlist <- list(a = list(LETTERS[1:5]),
                  b = list(names = c('Otto', 'Max'), 
                           ages = c(54, 45),
                           height = 182),
                  c = 33))

unlist(otherlist)
unlist(otherlist, recursive = FALSE)

unname(otherlist) #removes originaldf. in front of each variable name! 
unlist(unname(otherlist), recursive = FALSE)


#' The functions `as.numeric()`, `as.matrix()` and `as.data.frame()` can be used
#' to convert objects to numeric vectors, matrices and data frames, respectively.
M
as.numeric(M) #vector
as.data.frame(M) #dataframe -> automatically makes names: v1, v2
as.matrix(head(swiss))

Sorting

#' ## Sorting 
#' The function `sort()` allows us to sort a vector
a <- c(5, 3, 9, 44, 1, 4)
sort(a) #place it in order (also)

b <- factor(c("A", "Q", "D", "M"))
sort(b, decreasing = TRUE)

#' The function `order()` returns the order of the elements in a vector:
order(a)
#' i.e.: The smallest element of `a` is on the 5th position, the 2nd smallest 
#' element on the 2nd position, ...

#' To `rank()` the elements of `a`:
rank(a)
#' i.e.: the 1st element of `a` is the 4th smallest, the 2nd element is the
#' 2nd smallest, ...

#' With `rev()` we can reverse the order:
rev(a)

PRACTICAL 2 FUNCTION WRITING

# - Write the syntax to simulate a single role of a die using the function  sample() (no function yet).
rolls = 1 #how many times will you roll the dice?
sample(1:6, size = rolls, replace = TRUE)
# if replace = F, you cannot roll multiple dices

# - Write your own function that simulates this one role of a die.
die <- function() {
  sample(1:6, size = 1)
}

die()

#Take another look at the description of the function sample(). How would you need to change the syntax from Task 1 from above to simulate rolling two dice?
die2 <- function(){
  sample(1:6, size = 2, replace = TRUE)
}

die2()

# - Write another function that simulates rolling multiple dice.
# - Write the function so that the number of dice is flexible
die3 <- function(numbdice){
  sample(1:6, size = numbdice, replace = TRUE)
}

die3(3)

# - What happens when you use the function dice() without specifying the argument n? Do you know or can you guess why that happens?
die3()
# you will get 6 number, because you get the default argument for size in sample(), which is the n from x
# or sample the default for size is the number of items inferred from the first argument, so that sample(x) generates a random permutation of the elements of x (or  1:x).

#What can we learn from this? Read the help files! Think about / try what will happen if "wrong" input is given to your function!

die4 <- function(numbdice = 1){
  sample(1:6, size = numbdice, replace = TRUE)
}

die4(3)
die4()

Control-flow Constructs

IF ELSE

#' ## `ifelse()` vs `if() ... else ...`
#' Let's see what happens when we use the `ifelse()` example from the slides
#' with `if() ... else ...`

x <- rnorm(10)
ifelse(x < 0, "<0", ">0")

if (x < 0) {
  "<0"
} else {
  ">0"
}

#this will not work - will work if you will write a for loop

#' The problem here is that `if()` expects an expression that either returns
#' `TRUE` or `FALSE`, but we gave it a vector:
x < 0

#' In this case, only the first element of this vector is used.
#'  
#' On the other hand, the `if () ... else ...` example from the slides works in
#' both cases:
if (length(x) > 5) {
  mean(x)
} else {
  x
}

ifelse(length(x) > 5, mean(x), x)
#both will work

#' But for other cases it may not do what we expect it to do:
x <- list(a = 5, b = rnorm(7), c = rnorm(4))
x

ifelse(length(x) > 5, mean(x), x) # it returns the first element of x, because the yes/no decision also only had one element
#' Here, the "test" `length(x) > 5` results in `FALSE`, but because `ifelse()`
#' expects a vector of "tests" (and only receives one "test"), it will only 
#' return the first element of `x`.
#' 

#' But `if() ... else ...` also does not work per element:
if (length(x) > 5) {
  mean(x)
} else {
  x
}

#if we want the thingy to work for the all elements of the list, we need a for loop!

FOR LOOP

#' ## `for()`-loop
#' We could solve the above issue using a `for()`-loop. This allows us to look
#' at each element of `x` separately.

#' Here, the function `seq_along()` comes in handy:
seq_along(x)
#' It produces a sequence with the same length as `x` has elements.


#' Because we are using a loop, we need to add the `print()` function to see
#' the printed output.
for (i in seq_along(x)) {
  
  if (length(x[[i]]) > 5) {
    print(mean(x[[i]]))
  } else {
    print(x[[i]])
  }

}

#now, the mean will be printed only if there are at least 5 observations
 

#' We could also "collect" the output in a new object:
output <- list()

for (i in seq_along(x)) {
  
  output[[i]] <- if (length(x[[i]]) > 5) {
    mean(x[[i]])
  } else {
    x[[i]]
  }
  
}

output

#now, output will not be printed unless you request output

WHILE

#' ## An example with `while()`
#' `while()` requires a `condition` that at some point is `FALSE` for the loop
#' to stop.
#' 
#' The following would run forever (stop it by pressing "Esc" when in the Console):
#+ eval = FALSE
i <- 0
while (i < 5) {
  print(i)
}

#hold the findings, and go on to the next one - so you update your numbers -> you iterate back 

#' This one here could also take very long. Instead of printing the output
#' we save it in a vector that we can plot afterwards.
x <- 0
xvec <- c()

while (x < 5) {
  x <- x + rnorm(1, 0, 1)
  xvec <- c(xvec, x)
}

length(xvec)
tail(xvec)

#+ fig.width = 8, fig.height = 4
plot(xvec, type = 'l')

#' We could extend the syntax from above by adding a counter
x <- i <- 0
xvec <- c()

while (x < 5 & i < 25) {
  x <- x + rnorm(1, 0, 1)
  xvec <- c(xvec, x)
  i <- i + 1
}

xvec
length(xvec)
i

#' Now we will get a maximum of 25 iterations. The loop stops as soon as `x` is
#' larger than or equal to 5, or if 25 iterations are reached.

PRACTICAL 3: CONTROL FLOWS AN FUNCTIONS

# create data
set.seed(2020)
nrj <- data.frame(sex = factor(sample(c('male', 'female'), size = 100, replace = TRUE)),
                  kcal = round(rnorm(100, mean = 2200, sd = 250)),
                  weight = runif(100, min = 45, max = 150),
                  height = rnorm(100, mean = 170, sd = 10),
                  age = rnorm(100, mean = 50, sd = 10),
                  sports = factor(sample(c('never', 'sometimes', 'regularly'),
                                         size = 100, replace = TRUE), ordered = TRUE))


head(nrj)

# Calculating the BMR

# We want to calculate the Basal Metabolic Rate (BMR) for the individuals in the nrj data.

#The formula differs for men and women:
# men: (13.75?weight)+(5?height)???(6.76?age)+66(13.75?weight)+(5?height)???(6.76?age)+66
# women: (9.56?weight)+(1.85?height)???(4.68?age)+655

# - Which function would you choose to calculate the BMR? ifelse() or  if() ... else ...?
#  are possible, but ifelse() is more straightforward in this setting.

# - Calculate the BMR using ifelse() and add it to the dataset as a new variable BMR1.
nrj$BMR1 <- ifelse((nrj$sex == 'male'),
       yes = ((13.75*nrj$weight)+(5*nrj$height)???(6.76*nrj$age)+66),
       no = ((9.56*nrj$weight)+(1.85*nrj$height)???(4.68*nrj$age)+655))

head(nrj)

# How is the condition used in if() different from the test used in  ifelse()?
# The argument test in ifelse() expects a vector of "tests" (a vector of TRUE and  FALSE) while the condition argument in if() expects a single "test" (a single  TRUE or FALSE).

#How can we check row by row if a subject is male or female?
#use (i in 1:nrow(nrj) in a for loop to run along the rows in nrj. DO NOT USE SEQ_ALONG!

for (i in 1:nrow(nrj)){
nrj$BMR2[i] <- if(nrj$sex[i] == 'male'){
  ((13.75*nrj$weight[i])+(5*nrj$height[i])???(6.76*nrj$age[i])+66)}
else {
  ((9.56*nrj$weight[i])+(1.85*nrj$height[i])???(4.68*nrj$age[i])+655)}
}

head(nrj)

# general form:
for (i in <"columns of nrj">) {
  
  if ("<subject is male">) {
    
    <"formula for males">
    
  } else {
    
    <"formula for females">
    
  }
}

#example:
nrj$BMR2 <- NA   # "empty" version of BMR2

# loop over all rows
for (i in 1:nrow(nrj)) {
  
  # test if the subject is male
  nrj$BMR2[i] <- if (nrj$sex[i] == 'male') {
    
    # formula for males
    13.75 * nrj$weight[i] + 5 * nrj$height[i] - 6.76 * nrj$age[i] + 66
    
  } else {
    
    # formula for females
    9.56 * nrj$weight[i] + 1.85 * nrj$height[i] - 4.68 * nrj$age[i] + 655
    
  }
}

#Check that BMR1 and BMR2 are the same.
round(nrj$BMR2, 1) == round(nrj$BMR1, 1)

all.equal(nrj$BMR1, nrj$BMR2)
identical(nrj$BMR1, nrj$BMR2)
table(nrj$BMR1 == nrj$BMR2)
summary(nrj$BMR1 - nrj$BMR2)

# Data Summary
# We would now like to get descriptive statistics of the nrj data. For continuous variables, we want the mean and standard deviation, for categorical data we want to report the proportion of subjects in each category.

# -  the mean and standard deviation of the variable kcal.
mean(nrj$kcal)
sd(nrj$kcal)

# - Get the proportion of observations in each category for the variable  sports.
prop.table(table(nrj$sports))
#just proptable(var) wont work, because it is an ordinal variable -> first make it a table!

#U paste() to create some nice-looking output for these summaries:
#- "kcal: ()" for kcal
paste('kcal: ', mean(nrj$kcal), ' (', round(sd(nrj$kcal),2), ')',  sep = '')
paste0("kcal: ", round(mean(nrj$kcal), 1), " (", round(sd(nrj$kcal), 1), ")")

#- "sports: % <category 1>, % , ."
paste('sports: ',levels(nrj$sport)[1],' ',(prop.table(table(nrj$sports))[1]*100), '%, ',
      levels(nrj$sport)[2],' ',(prop.table(table(nrj$sports))[2]*100), '%, ',
      levels(nrj$sport)[3],' ',(prop.table(table(nrj$sports))[3]*100), '%.', sep = '')

# another quicker option!
tab <- prop.table(table(nrj$sports))
props <- paste0(round(tab * 100, 1), "% ", names(tab), collapse = ", ")
paste('sports:', props)

#- Write a loop that creates the summary strings we created in the previous Solution for each variable in the nrj data.
#- Use print() to print each of the summaries.
for (i in 1:ncol(nrj)){
  if (is.factor(nrj[,i]) == FALSE){
    print(paste0((names(nrj)[i]),": ", 
           round(mean(nrj[,i]), 1), " (", round(sd(nrj[,i]), 1), ")"))
    } else {
      tab <- prop.table(table(nrj$sports))
      props <- paste0(round(tab * 100, 1), "% ", names(tab), collapse = ", ")
      print(paste(names(nrj)[i],': ', props))
    }  
  }


# DONT FORGET TO CHECK ALL [,i] and ADD PRINT PLEASE!!!!!

# Writing your own function

# Write two functions:
#- one that creates the summary string for a continuous variable,
summary_continuous <- function(x){
  paste0(round(mean(x), 1), " (", round(sd(x), 1), ")")}

summary_continuous(nrj$kcal)

#- one that creates the output for a categorical variable.
summary_categorical <- function(x){
  tab <- prop.table(table(x))
  props <- paste0(round(tab * 100, 1), "% ", names(tab), collapse = ", ")
  paste(props)}  

summary_categorical(nrj$sex)

# make sure you dont put print in the function, as it will print the output twice!

#Assume that the input will be a vector, such as nrj$kcal or nrj$sports.Note that when we work with these vectors, they do not contain the variable name any more, so we cannot use the variable name in the two functions.

#Write another function that has a data.frame as input and prints the summary string for each variable, using the two functions from the previous solution.

summary_all <- function(dat){
 for (i in 1:ncol(dat)){
  if (is.factor(dat[,i]) == FALSE){
    print(paste0((names(dat)[i]),": ", 
           round(mean(dat[,i]), 1), " (", round(sd(dat[,i]), 1), ")"))
    } else {
      tab <- prop.table(table(dat[,i]))
      props <- paste0(round(tab * 100, 1), "% ", names(tab), collapse = ", ")
      print(paste(names(dat)[i],': ', props))
    }  
  }  
}

summary_all(nrj)

# General form
summary_df <- function(dat) {
  
  # loop over all columns
  for (i in 1:ncol(dat)) {
    
    # check if the column is a factor
    summary_string <- if (is.factor(dat[, i])) {
      
      # syntax for a categorical variable
      summary_categorical(dat[, i])
      
    } else {
      
      # syntax for a continuous variable
      summary_continuous(dat[, i])
    }
    
    # print the result of the summary
    print(summary_string)
  }
  
}

summary_df(nrj)
summary_all(nrj)

# Modify the function so that the output strings look like the output in the previous exercise (with "variable name: .").

summary_df <- function(dat) {
  
  # loop over all columns
  for (i in 1:ncol(dat)) {
    
    # check if the column is a factor
    summary_string <- if (is.factor(dat[, i])) {
      
      # syntax for a categorical variable
      summary_categorical(dat[, i])
      
    } else {
      
      # syntax for a continuous variable
      summary_continuous(dat[, i])
    }
    
    # print the result of the summary together with the variable name
    print(paste0(names(dat)[i], ": ", summary_string))
  }
}

summary_df(dat = nrj)

# my own function already did this!

A Custom Subset Function

Selection based on an interval

Aim of this practical is to write a function that selects a subset of a dataset based on whether a particular variable is inside an interval.

Note: When writing functions, there are usually many ways that lead to the same solution. The solutions given here are suggestions, but there are many other ways the functions could be written.

#Add a check to fun4_correct that first checks if variable is of type  numeric. When a non-numeric variable is selected, the function should not try to produce a subset but instead print a message.

fun4 <- function(dat, variable, interval, incl_boundaries=TRUE, outside = FALSE) {
  if (is.numeric(dat[,variable]) == TRUE){
    filter <- 
      if (incl_boundaries != outside){
      dat[, variable] >= interval[1] & dat[, variable] <= interval[2]}
    else {dat[, variable] > (interval[1]) & dat[, variable] < (interval[2])}
    if (outside){
    filter <- !filter}
    subset(dat, subset = filter)}
  else {print('variable should be numeric')
    break}
}

fun4(df1, variable = 'var4', interval = c(-1, 1), incl_boundaries = T, outside = T)

#Solution: 
fun5 <- function(dat, variable, interval, incl_boundaries = TRUE, outside = FALSE) {
  
  if (!is.numeric(dat[, variable])) {
    print('The variable you selected is not numeric!')
    
  } else { 
  
  filter <- if (incl_boundaries != outside) {
    dat[, variable] >= interval[1] & dat[, variable] <= interval[2]
  } else {
    dat[, variable] > interval[1] & dat[, variable] < interval[2]
  }
  
  if (outside) {
    filter <- !filter
  }
  
  subset(dat, filter)
  }
}

fun5(df1, variable = 'var4', interval = c(-1, 1), incl_boundaries = T, outside = T)

#hetzelfde, startten echter met foutmelding! is sneller! Geeft ook niet de loopfoutmelding. 

#---

# When we use stop() we do not need the else part of the if() statement because the function will be stopped immediately and the rest of the syntax will not be evaluated.

# Typically we would not just want to print a message but either create a  warning() or an error message (using stop()) which would immediately stop the execution of the function:

fun5b <- function(dat, variable, interval, incl_boundaries = TRUE, outside = FALSE) {
  
  if (!is.numeric(dat[, variable])) {
    stop('The variable you selected is not numeric!', call. = FALSE)
  }
  
  filter <- if (incl_boundaries != outside) {
    dat[, variable] >= interval[1] & dat[, variable] <= interval[2]
  } else {
    dat[, variable] > interval[1] & dat[, variable] < interval[2]
  }
  
  if (outside) {
    filter <- !filter
  }
  
  subset(dat, filter)
}

fun5b(df1, variable = 'var4', interval = c(-1, 1), incl_boundaries = T, outside = T)

#actually gives an error

DEMO APPLY

#' ## Load packages 
#' If you are using the package for the first time, you will have to first install it
# install.packages("survival") 
library(survival)

#' ## Get data
#' Load data sets from package
pbc <- survival::pbc
pbcseq <- survival::pbcseq

#' ## Wide format data

#' ### apply

#' Obtain the mean of columns `time` and `age` in the `pbc` data set
names(pbc) #c(2,5)/c("time", "age") = variables, 2 = colums, mean = function you want to use
apply(pbc[, c(2,5)], 2, mean)
apply(pbc[, c("time", "age")], 2, mean)

#' Obtain the mean of rows in the `pbc` data set \
#' Before running the code, think if it is meaningful 
apply(pbc[, ], 1, mean)
#unpossible: we have factors in it, also some missings

#' Obtain the standardized values of columns `time`, `age` and `bili` in the `pbc` data set
head(apply(pbc[, c("time", "age", "bili")], 2, function(x) (x-mean(x))/sd(x)))
# with function, you state that you gonna ask for 'quite something' -> std values for every person in df

#' Other examples \
#' Create a matrix
X <-  sample(0:200, 100)
Mat <- matrix(X, 50, 50) 

#' Obtain the mean value of each row for matrix `Mat`
apply(Mat, 1, mean)

#' Obtain the mean value of each column for matrix `Mat`
apply(Mat, 2, mean)

#' Calculate the sum of each column for matrix `Mat`
apply(Mat, 2, sum)
apply(Mat, 2, function(x) sum(x))

#' Calculate the sum of each row for matrix `Mat`
apply(Mat, 1, sum)
apply(Mat, 1, function(x) sum(x))

#' There is no one way of doing things in R!
colSums(Mat)
rowSums(Mat)


#' ### lapply

#' Obtain the summary of the `pbc` data set
lapply(pbc, summary)

#' Ontain the number of missing values per `pbc` variable
lapply(pbc, function(x) sum(is.na(x))) 

#' Other examples \
#' Obtain the quadratic term of the vector `1:3` \
#' Present the results as a list
lapply(1:3, function(x) x^2)

#' Create a list that consist of `Mat` and `Mat^2` \
#' Obtain the mean of each element \
#' Present the results as a list
X <- list(Mat, Mat^2)
lapply(X, mean)

#'Create a list
A <- matrix(1:9, 3,3)
B <- matrix(4:15, 4,3)
C <- matrix(8:10, 3,2)

#' Select elements in a list
MyList <- list(A,B,C) 

#' Select the first row of each element \
#' Present the results as a list
lapply(MyList,"[", 1, )

#' Select the second column of each element \
#' Present the results as a list
lapply(MyList,"[", , 2)


#' ### sapply

#' Obtain the number of missing values per `pbc` variable
sapply(pbc, function(x) sum(is.na(x))) 

#' Other examples \
#' Obtain the quadratic term of the vector `1:3` \
#' Present the results as a vector
sapply(1:3, function(x) x^2)

#' Create a list that consist of `Mat` and `Mat^2` \
#' Obtain the mean of each element \
#' Present the results as a vector
X <- list(Mat, Mat^2)
sapply(X, mean)

#' Select the second column and first row of each element \
#' Present the results as a vector
sapply(MyList,"[", 2, 1)


#' ### tapply

#' Obtain the mean `age` and `time` per `sex`
tapply(pbc$age, pbc$sex, mean)
tapply(pbc$time, pbc$sex, mean)

#' Obtain the mean `age` and `time` (both elements of the variables devided by two) per `sex`
tapply(pbc$age, pbc$sex, function(x) mean(x/2))
tapply(pbc$time, pbc$sex, function(x) mean(x/2))

#' Obtain the mean `age` and `time` per `sex` and `status`
tapply(pbc$age, list(pbc$status, pbc$sex), mean)
tapply(pbc$time, list(pbc$status, pbc$sex), mean)


#' ### mapply

#' Create a list: \
#' 
#' * 1st element: repeats 1 four times \
#' * 2nd element: repeats 2 three times  \  \
#' * 3rd element: repeats 3 two times  \ \
#' * 4th element: repeats 4 one time
mapply(rep, 1:4, 4:1)
#### alternative run: list(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1))

#' Create a list: \
#' 
#' * 1st element: repeats 4 one times \
#' * 2nd element: repeats 4 two times \  \
#' * 3rd element: repeats 4 three times \  \
#' * 4th element: repeats 4 four time
mapply(rep, times = 1:4, x = 4)
#### alternative run: list(rep(4, times = 1), rep(4, times = 2), rep(4, times = 3), rep(4, times = 4))

#' Create a list: \
#'
#' * 1st element: repeats 1 four times \ \
#' * 2nd element: repeats 2 four times \ \
#' * 3rd element: repeats 3 four times \  \
#' * 4th element: repeats 4 four time
mapply(rep,1:4, 4, SIMPLIFY = FALSE)
### alternative run: list(rep(1, 4), rep(2, 4), rep(3, 4), rep(4, 4))

#' Note: if the length is the same we can obtain a simplified output
mapply(rep,1:4, 4, SIMPLIFY = TRUE)
### alternative run: matrix(c(rep(1, 4), rep(2, 4), rep(3, 4), rep(4, 4)), 4, 4)

#' Other examples
mapply(function(x,y) seq_len(x) + y,
       c(a = 1, b = 2, c = 3),  
       c(A = 10, B = 0, C = -10))
#### alternative run: list(c(1) + 10, c(1, 2) + 0, c(1, 2, 3) - 10)

X <- list(Mat, Mat^2)
mapply(mean, X)

#' Note!
mapply(mean, MyList)
sapply(MyList, mean)

mapply(function(x,y) {x^y}, x = c(2, 3), y = c(4))
#### alternative run: list(2^4, 3^4)


#' ## Long format data

#' Let's assume that only the long format data set `pbcseq` is available \
#' We want to obtain the mean `serum bilirubin` of the last follow-up measurement per `status` group \
#' Each patient is counted once! \
head(pbcseq)

#' Sort data
pbcseq <- pbcseq[order(pbcseq$id, pbcseq$day), ] # day tells a bit more about last follow up

#' Select the last follow-up measurement of each patient
pbcseq.idNEW2 <- pbcseq[tapply(rownames(pbcseq), pbcseq$id, tail,  1), ]

#' Step by step
tapply(rownames(pbcseq), pbcseq$id, tail,  1)

### alternative run: pbcseq.idNEW2 <- pbcseq[!duplicated(pbcseq[c("id")], fromLast = TRUE), ]

#' Obtain the mean `serum bilirubin` per `status` group
tapply(pbcseq.idNEW2$bili, pbcseq.idNEW2$status, mean)


#' Let's again assume that only the long format data set `pbcseq` is available \
#' We want to obtain the mean `serum bilirubin` of the last stage of `edema` per `status` group \
#' Each patient and `edema` stage is counted once! \
head(pbcseq)

#' Sort data
pbcseq <- pbcseq[order(pbcseq$id, pbcseq$edema), ] # instead of last follow up, use edema status 

#' Select the last stage of `edema` of each patient
pbcseq.idNEW3 <- pbcseq[tapply(rownames(pbcseq), pbcseq$id, tail,  1), ]

#' Step by step
tapply(rownames(pbcseq), pbcseq$id, tail,  1)

### alternative run: pbcseq.idNEW3 <- pbcseq[!duplicated(pbcseq[c("id")], fromLast = TRUE), ]

#' Obtain the mean `serum bilirubin` per `status` group
tapply(pbcseq.idNEW3$bili, pbcseq.idNEW3$status, mean)

PRACTICAL APPLY FAMILY

heart <- survival::heart
retinopathy <- survival::retinopathy
str(heart)
str(retinopathy)

Obtain the mean of the columns start, stop, event, age, year, surgery of the heart data set.

Obtain the mean of the columns age, futime, risk of the retinopathy data set. ### The Apply family

apply(heart[,c(1:6)], 2, mean)
apply(retinopathy[,c('age', 'futime', 'risk')], 2, mean)

Create the matrix dataset1 <- cbind(A = 1:30, B = sample(1:100, 30)) and find the row medians of dataset1.

dataset1 <- cbind(A = 1:30, B = sample(1:100, 30)) 

apply(dataset1, 1, median)
#voert functie uit op dataset, maar je moet wel speciferen of het op rij of kolom moet

lapply

Create the following function DerivativeFunction <- function(x) {log10(x) + 10}. Apply the DerivativeFunction to dataset1 <- cbind(A = 1:30, B = sample(1:100, 30)). The output should be a list.

DerivativeFunction <- function(x) {log10(x) + 10}
dataset1 <- cbind(A = 1:30, B = sample(1:100, 30))

lapply(dataset1, DerivativeFunction)
# voert functie uit op alle lists

Create a list that consist of the variables age and year from the heart data set and the variable risk from the retinopathy data set. Give the name list1 to this list.

Obtain the median of each element of the list. The output should be a list.

ls <- list(heart$age, heart$year, retinopathy$risk)
lapply(ls, median)

sapply

Create the following function Function2 <- function(x) {exp(x) + 0.1}. Apply the Function2 to dataset2 <- cbind(A = c(1:10), B = rnorm(10, 0, 1)). The output should be simplified.

Function2 <- function(x) {exp(x) + 0.1}
dataset2 <- cbind(A = c(1:10), B = rnorm(10, 0, 1))
# voert functie uit voor alle observaties

sapply(dataset2, Function2)

Create a list that consist of the variable transplant from the heart data set and the variable status from the retinopathy data set. Give the name list2 to this list.

Obtain the percentages of 0 of each element of the list in a simplified output (as a vector)

Obtain the percentages of 1 of each element of the list in a simplified output (as a vector).

list2 <- list(heart$transplant, retinopathy$status)
sapply(list2, function(x) table(x)/length(x))
# met x specificeer je eigenlijk i in list!

# je kunt aangeven wel deel je van de tabel wil hebben
sapply(list2, function(x) table(x)[1]/length(x)) 
sapply(list2, function(x) table(x)[2]/length(x))
# denk met subsetten niet aan de versimpelde output, maar aan de originele op een list

Do you remember the practical Control_Flow_and_Functions: Writing your own function (Task 1 and 2)? Now try to create again the same function (called summary_df) but avoid the use of a for loop. Apply the function to the retinopathy dat set.

summary_continuous <- function(x) {
  paste0(round(mean(x), 1), " (", round(sd(x), 1), ")")
}

summary_categorical <- function(x) {
  tab <- prop.table(table(x))
  paste0(round(tab * 100, 1), "% ", names(tab), collapse = ", ")
}


summary_df <- function(dat) {
  vec_categorical <- sapply(dat, is.factor)
  print(sapply(dat[,vec_categorical], summary_categorical))
  vec_continuous <- sapply(dat, is.numeric)
  print(sapply(dat[,vec_continuous], summary_continuous))
}

summary_df(dat = retinopathy)

tapply

Obtain the median year per transplant group using the heart data set.

Obtain the median futime per status group using the retinopathy data set.

tapply(heart$year, heart$transplant, median)
tapply(retinopathy$futime, retinopathy$status, median)

Apply the function Fun1 <- function(x) {mean(x)/(length(x) - 2)} to year per transplant and surgery group using the heart data set. Obtain the mean futime per status, type and trt group using the retinopathy data set.

Fun1 <- function(x) {mean(x)/(length(x) - 2)}

tapply(heart$year, list(heart$transplant, heart$surgery), Fun1)
# met meerdere opdelingen: gebruik LIST

tapply(retinopathy$futime, list(retinopathy$status, retinopathy$type, retinopathy$trt), mean)
#row -> kolom -> list

DEMO EXTRA PROGRAMMING

#' ## Load packages 
#' If you are using the package for the first time, you will have to first install it
# install.packages("survival") 
library(survival)

#' ## Get data
pbc <- survival::pbc

#' ## Smart programming
#' * Keep good notes! \
#' 
#' Code 1:
x<-10
y<-10/2
z<-x+y

#' Code 2
# store a value
x <- 10
# take half of the stored value
y <- x/2
# get the sum of both values
z <- x + y

#' Which code (1 or 2) would you prefer? Prob 2, because you have to change less! \
#' 
#' * Always check if you can make your code faster
marks = sample(0:100, 200, replace = TRUE)
system.time({ifelse(marks >= 40, "pass", "fail")})

system.time({
  for (j in 1:length(marks)){
    if(marks[j] >= 40) {
      print("pass") 
    } else {
      print("fail")
    }
  }
})

#' Other example \
#' Create a matrix \
A <- matrix(rnorm(1e06), 1000, 1000)
A[1:10 , 1:10]

#' Calculate the sum per row
head(apply(A, 1, sum))
#?rnorm

#' Calculate the sum per row, repeat this procedure 100 times
res1 <- replicate(100, z <- apply(A, 1, sum))
res2 <- replicate(100, z <- rowSums(A)) # Use specialized functions

#' Compute the cumulative sum
x <- rnorm(1000000, 10, 10)

cSum <- 0
z <- numeric() # empty vector
for (k in 1:length(x)){
  cSum <- cSum + x[k]
  z[k] <- cSum
}

#' Explore how it works
i <- 1
cSum <- 0
cSum <- cSum + x[i]
x[1]

i <- 2
cSum <- cSum + x[i]
# steeds de volgende erbij optellen

res <- cumsum(x)
tail(res)

#' Explore the timing of the code
cSum <- 0
z <- numeric() # empty vector
system.time({
  cSum <- 0
  for (i in 1:length(x)){
    cSum <- cSum + x[k]
    z[k] <- cSum
  }
})

system.time({
  cumsum(x)
}) # better

# some code might be quicker!!

#' Other example \
#' Create a dichotomous variable for `age` (assume as cut-off point the value 42)
for (i in 1:dim(pbc)[1]) {  #dim(pbc)[i] is aantal rows in pbc eerste var
  pbc$ageCat[i] <- as.numeric(pbc$age[i] > 42)
}


#' Explore what is happening \
#' This will help you later create more complicated loops
i <- 1
pbc$ageCat[i] <- as.numeric(pbc$age[i] > 42)

head(pbc)

pbc[i, ]


#' Do the same thing without a loop
pbc$ageCat <- as.numeric(pbc$age > 42)
# works the same but then quicker

#' More than one ways exist to code something in R! \
#' 
#' ## Functions and loops
#' Calculate the mean weight for `males` and `females` in 100 datasets \
#' Since we do not have 100 data sets, we will create them! \
#' Let's do that first manually...
datlist <- list()

i <- 1
set.seed(2015+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = TRUE)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))

datlist[[i]] <- data.frame(patient, weight, sex) #[[]] = list within list

i <- 2
set.seed(2015+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = T)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))

datlist[[i]] <- data.frame(patient, weight, sex)
# ..... 


#' It is too much work!
#' Now use a loop
for (i in 1:100) {
  set.seed(2015+i)
  patient <- c(1:20)
  weight <- rnorm(20, 70, 10)
  sex <- sample(1:2, 20, replace = T)
  sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
  
  datlist[[i]] <- data.frame(patient, weight, sex)
}


#' Now that we have 100 data sets we need to calculate the mean `age` per `gender` in each of them \
#' Let's do that manually...
means <- matrix(NA, length(datlist), 2) #voor iedere list een mean for age per gender

i <- 1
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)

i <- 2
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)


#' Now use a loop
for (i in 1:length(datlist)) {
  dat <- datlist[[i]]
  means[i, ] <- tapply(dat$weight, dat$sex, mean)
}

means

#' Select the data sets, where more than 39% of the patients are `females`
newList <- list()
k <- 1

for (i in 1:length(datlist)) {
  dat <- datlist[[i]]
  if (sum(dat$sex == "female")/20 > 0.39) {
    newList[[k]] <- dat
    k <- k + 1
  }
}

# je kunt het als functie gebruiken omdat het blijft itereren, dus het schrijft dat steeds over

length(newList)

#' Select the data sets, where more than 49% of the patients are `males`
newList <- list()
k <- 1

for (i in 1:length(datlist)) {
  dat <- datlist[[i]]
  if (sum(dat$sex == "male")/20 > 0.49) {
    newList[[k]] <- dat
    k <- k + 1
  }
}


#' Now make a function that takes as input the data sets in a list format, the name of the `gender` variable and the name of the `male` category \
#' This function returns only the data sets, where more than 49% of the patients are `males` \
#' The output should be a list
subset_data <- function(dataset = x, sex_var = "sex", male_cat = "male"){
  newList <- list()
  k <- 1
  for (i in 1:length(dataset)) {
    dat <- dataset[[i]]
    if (sum(dat[[sex_var]] == male_cat)/20 > 0.49) {
      newList[[k]] <- dat
      k <- k + 1
    }
  }
  newList
}

res <- subset_data(dataset = datlist, sex_var = "sex", male_cat = "male")

length(res)


#' Make a function that takes as input a data set, the name of a continuous variable and the name of a categorical variable
#' This function calculates the mean and standard deviation of the continuous variable for each group in the categorical variable
des <- function(data = x, cont = "age", cat = "group"){
  tapply(data[[cont]], data[[cat]], mean)
}
#list in list because data has these lists within it - sibsetting as if it were columns works the same!

#' Apply the function to the `pbc` data set
#' Use `age` and `gender` as continuous and categorical variables
des(data = pbc, cont = "age", cat = "sex")

#' Now change the `des` function so that the user would specify the function applied to the data
des <- function(data = x, cont = "age", cat = "group", fun = mean){
  tapply(data[[cont]], data[[cat]], fun)
}

des(data = pbc, cont = "age", cat = "sex", fun = function(x) sum(x))

Extra programming examples

heart <- survival::heart
retinopathy <- survival::retinopathy
str(heart)
str(retinopathy)

Combinations of Functions and Loops

Common R Objects

h <- function(k){
       if (k <= 20){
        3 * k
      } else {
      2 * k
    }
}

scalar <- 3
vector <- rnorm(100)
matrix <- matrix(vector, 10, 10)
data.frame <- data.frame(matrix)
list <- list(vector)

h(scalar)
h(vector)
h(matrix)
h(data.frame)
h(list) # werkt niet

# als k is boven 20, dan k x 20, anders k x 30 -> voor alle obs

h <- function(k){
  for(i in 1:length(k)){
    k[i]<- if (k[i] <= 20){
      3 * k[i]
    } else {
      2 * k[i] 
    }
  }
  k
}

h(scalar)
h(vector)
h(matrix)
h(data.frame)
h(list) # werkt niet

# werkt hetzelfde als hierboven: er wordt langs alles geitereerd

Define a function (with the name mysummary) that takes one parameter (let’s call it x) and investigates if it is a data frame, a factor, or a numeric variable and returns the message “This is a data.frame” if x is a data frame, “This is a factor” is x is a factor and “This is numeric” if x is numeric.

mysummary <- function(x){
  if (class(x)=='data.frame'){
    print('This is a data frame')
  }
  if (class(x)=='factor'){
    print('This is a factor')
  }
  if (class(x)=='numeric'){
    print('This is a numeric')
  }
}

#OR shorter

mysummary <- function(x){
  if(is.data.frame(x)){
    print('This is a data.frame')
  }else if(is.factor(x)){
    print('This is a factor')
  }else if(is.numeric(x)){
    print('This is numeric')
  }
}




mysummary(1)
mysummary(x)
mysummary(matrix)
mysummary(factor(rep(c('A', 'B'), 50)))

Data exploration

Extend the previous function mysummary to take again one parameter (let’s call it x) but to return also some descriptive statistics. We therefore modify the function in such a way that a frequency table is given for factors and the mean and standard deviation are given for numeric data.

mysummary <- function(x){
  if(is.data.frame(x)){
    print('A data.frame')
    print('Not yet implemented')
  }else if(is.factor(x)){
    print('This is afactor')
    print(table(x))
  }else if(is.numeric(x)){
    print('This variabele is numeric')
    print('mean')
    print(mean(x))
    print('sd')
    print(sd(x))
  }
}

mysummary(1:10)

The previous function mysummary still does not do anything useful when we apply it on the whole data.frame. Extend the function to take again one parameter (let’s call it x) but to return also some descriptive statistics for the data frame.

mysummary <- function(x){
  if(is.data.frame(x)){
    print('A data.frame')
    for(i in 1:ncol(x)){
      print(names(x)[i])
      mysummary(x[,i])
    }     
  }else if(is.factor(x)){
    print('A factor')
    print(table(x))
  }else if(is.numeric(x)){
    print('This variabele is numeric')
    print('mean')
    print(mean(x))
    print('sd')
    print(sd(x))
  }
}

df1_test <- data.frame(v1 = 1:10, v2 = sample(0:1, 10, replace = TRUE))
df2_test <- data.frame(v1 = 1:10, v2 = sample(c("no", "yes"), 10, replace = TRUE))

mysummary(df1_test)
mysummary(df2_test)

Data Manipulation

Create a function with the name std_num that takes as input a data frame (let’s call it DF), standardizes all the numerical variables and returns the new data frame. Apply this function to the heart data set.

std_num <- function(DF){
  for (j in 1:ncol(DF)){
    if (is.numeric(DF[,j])){
      DF[colnames(DF)[j]] <- scale(DF[,j])
    }
  }
  DF
}

#OR longer
heart$event <- as.factor(heart$event)
heart$surgery <- as.factor(heart$surgery)
heart$transplant <- as.factor(heart$transplant)
heart$id <- as.factor(heart$id)

std_num <- function(DF){
  for (j in 1:dim(DF)[2]){
    if (is.numeric(DF[,j])){
      DF[colnames(DF)[j]] <- (DF[colnames(DF)[j]] - mean(DF[[colnames(DF)[j]]]))/sd(DF[[colnames(DF)[j]]])
    }
  }
  DF
}
std_num(heart)

#DF[colnames(DF)[j]] specifies each row for the column you selected

std_num(pbc)

Data visualization

Create a for loop that goes through the columns of the retinopathy data set and plots each column. If the column is a numerical variable then use a density plot, if the column is a categorical variable then use a barchart. Create a plot with multiple panel

library(survival)
library(lattice)
library(ggplot2)
library(emojifont)
library(gtrendsR)

for (i in 1:dim(retinopathy)[2]){
  if (is.numeric(retinopathy[,i])){
    plot(density(retinopathy[,i]))
  } else if (is.factor(retinopathy[,i])){
    barchart(retinopathy[,i])}
}

# OR
for (j in 1:dim(retinopathy)[2]){
  if (is.numeric(retinopathy[,j])){
    plot(density(retinopathy[,j]), col = rgb(0,0,1,0.5))
    polygon(density(retinopathy[,j]), col = rgb(0,0,1,0.5), border = "blue")
  } else if (is.factor(retinopathy[,j])){
    plot(retinopathy[,j])
  }
}

Create a function that applies the previous code in any data set. The name of the function will be plot_summary and it will take as input a data frame (let’s call it dt) and the dimension of the panel that includes the plots. In particular, dim_row represents the rows and dim_col the columns. The function will return a plot for each column. If the column is a numerical variable then use a density plot, if the column is categorical variable then use a barchart. Apply this function to the retinopathy data set.

plot_summary <- function(dt, dim_row, dim_col){
  par(mfrow = c(dim_row, dim_col))
  for (j in 1:dim(dt)[2]){
  if (is.numeric(dt[,j])){
    plot(density(dt[,j]), col = rgb(0,0,1,0.5))
    polygon(density(dt[,j]), col = rgb(0,0,1,0.5), border = "blue")
  } else if (is.factor(dt[,j])){
    plot(dt[,j])
  }
}
}

retinopathy$trt <- as.factor(retinopathy$trt)
retinopathy$status <- as.factor(retinopathy$status)

plot_summary(retinopathy, 3, 3)

Extend the previous function plot_summary to include also plot titles indicating the variable. Apply this function to the retinopathy data set.

plot_summary <- function(dt, dim_row, dim_col){
  par(mfrow = c(dim_row, dim_col))
  for (j in 1:dim(dt)[2]){
  if (is.numeric(dt[,j])){
    plot(density(dt[,j]), col = rgb(0,0,1,0.5), main = paste0(colnames(dt)[j]))
    polygon(density(dt[,j]), col = rgb(0,0,1,0.5), border = "blue")
  } else if (is.factor(dt[,j])){
    plot(dt[,j], main = paste0(colnames(dt)[j]))
  }
}
}

retinopathy$trt <- as.factor(retinopathy$trt)
retinopathy$status <- as.factor(retinopathy$status)

plot_summary(retinopathy, 3, 3)

Subsetting

For each data set (heart and retinopathy), select only the observations that report an event.

ds1<-subset(heart, heart$event == 1)
ds2<-subset(retinopathy, retinopathy$status == 1)

# OR
heart[heart$event == 1, ] # on the row as at is not the column bot the row!!
retinopathy[retinopathy$status == 1, ]

Create a function with the name dt_events that takes as input a list with data sets (let’s call it dt_list) and a vector indicating the name of the event variable for each data set (let’s call it event_name). The function will return a list consisting of each data set but including only the observations/rows that had an event. Create a list that consists of the data sets heart and retinopathy and apply the function.

dt_events <- function(dt_list, event_name){
  new_dt <- list()
  for (i in 1:length(dt_list)){
    dt <- dt_list[[i]]
    new_dt[[k]] <- dt[dt[[event_name[i]]]==1, ]
  }
}

ls <- list(heart, retinopathy)

dt_events(ls, event_name=c('event', 'status'))

The apply family

For each data set (heart and retinopathy), obtain the mean age per event status.

tapply(heart$age, heart$event, mean)
tapply(retinopathy$age, retinopathy$status, mean)

Create a function with the name summary_list that takes as input a list with data sets (let’s call it dt_list), a vector with the names of the continuous variables (let’s call it dt_cont) and a vector with the names of the categorical variables (let’s call it dt_cat) and returns the mean of the continuous variable per group (categorical variable) for each data set. Apply the function to the heart and retinopathy data sets. In particular, use the continuous variables year (from the heart data set) and risk (from the retinopathy data set) and the categorical variables surgery (from the heart data set) and type (from the retinopathy data set).

summarylist <- function(dt_list, dt_cont, dt_cat){
  for (i in 1:length(dt_list)){
    dt <- dt_list[[i]]
    print(tapply(dt[[dt_cont[i]]], dt[[dt_cat[i]]], mean))}
}

summarylist(dt_list = list(heart, retinopathy), dt_cont = c("year", "risk"), dt_cat = c("surgery", "type"))

Extend the previous function to include the mean as input argument (let’s call it calc). Therefore, the user can decide which function to use.

summarylist <- function(dt_list, dt_cont, dt_cat, calc){
  for (i in 1:length(dt_list)){
    dt <- dt_list[[i]]
    print(tapply(dt[[dt_cont[i]]], dt[[dt_cat[i]]], calc))}
}

summarylist(dt_list = list(heart, retinopathy), dt_cont = c("year", "risk"), dt_cat = c("surgery", "type"), sd)

BST02: Part C Functions and Programming

Isabel Schuurmans

28-2-2020