https://epirhandbook.com/en/index.html
https://www.learnbyexample.org/r-introduction/
https://rkabacoff.github.io/datavis/index.html
https://erbiostat.wixsite.com/vdeda1
Consider a dataset with 5 subjects, for each we have information about the gender and the height. Let us define two vectors. The first is a factor represents the gender: \(sex=(M,M,M,F,F)\).
sex <- c("M", "M", "M", "F", "F")
sex
## [1] "M" "M" "M" "F" "F"
The second is a numeric vector represents the heights: \(height=(190, 180, 192, 165, 170)\).
height <- c(190, 180, 192, 165, 170)
height
## [1] 190 180 192 165 170
We can combine the two vectors into one data structure.
cbind(sex,height)
## sex height
## [1,] "M" "190"
## [2,] "M" "180"
## [3,] "M" "192"
## [4,] "F" "165"
## [5,] "F" "170"
The first subject is a male,
sex[1]
## [1] "M"
and his height is 190.
height[1]
## [1] 190
The last subject is a female
sex[5]
## [1] "F"
and her height is equal to 170.
height[5]
## [1] 170
We can print the heights of the males with
height[sex == "M"]
## [1] 190 180 192
Equivalently, the heights of the females
height[sex == "F"]
## [1] 165 170
Note that the vector sex == “M” is a factor with lvels equal to TRUE and FALSE
sex == "M"
## [1] TRUE TRUE TRUE FALSE FALSE
names()poker_vector <- c(140, -50, 20, -120, 240)
poker_vector
## [1] 140 -50 20 -120 240
roulette_vector <- c(-24, -50, 100, -350, 10)
roulette_vector
## [1] -24 -50 100 -350 10
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
poker_vector
## Monday Tuesday Wednesday Thursday Friday
## 140 -50 20 -120 240
roulette_vector
## Monday Tuesday Wednesday Thursday Friday
## -24 -50 100 -350 10
sum()# Assign to total_daily how much you won/lost on each day
total_daily <- poker_vector+roulette_vector
total_daily
## Monday Tuesday Wednesday Thursday Friday
## 116 -100 120 -470 250
# Total winnings with poker
total_poker <- sum(poker_vector)
# Total winnings with roulette
total_roulette <- sum(roulette_vector)
# Total winnings overall
total_week <- total_roulette + total_poker
# Use logical comparison for vectors
selection_vector <- poker_vector>0
selection_vector
## Monday Tuesday Wednesday Thursday Friday
## TRUE FALSE TRUE FALSE TRUE
poker_winning_days <- poker_vector[selection_vector]
poker_winning_days
## Monday Wednesday Friday
## 140 20 240
# Ordering level in factor variable
temperature_vector <- c("High", "Low", "High","Low", "Medium")
factor_temperature_vector <- factor(temperature_vector, order = TRUE, levels = c("Low", "Medium", "High"))
factor_temperature_vector
## [1] High Low High Low Medium
## Levels: Low < Medium < High
# Specify the levels of factor_survey_vector
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
levels(factor_survey_vector)
## [1] "F" "M"
summary()# Summary of a factor vector
summary(factor_survey_vector)
## F M
## 2 3
# Summary of a continuous vector
poker_vector
## Monday Tuesday Wednesday Thursday Friday
## 140 -50 20 -120 240
summary(poker_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -120 -50 20 46 140 240
matrix()# Create a matrix
matrix(1:9, nrow = 3, byrow= TRUE) # byrow indicates that the matrix is filled by row
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
matrix()# Create matrix from objects
new_hope <- c(460.998, 314.4) # Box office Star Wars (in millions!)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)
# Create box_office
box_office <- c(new_hope, empire_strikes, return_jedi)
box_office
## [1] 460.998 314.400 290.475 247.900 309.306 165.800
# Construct star_wars_matrix
star_wars_matrix <- matrix(box_office, nrow = 3, byrow = TRUE)
star_wars_matrix
## [,1] [,2]
## [1,] 460.998 314.4
## [2,] 290.475 247.9
## [3,] 309.306 165.8
colnames() and rownames()# Vectors region and titles, used for naming
region <- c("US", "non-US")
titles <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")
# Name the columns with region
rownames(star_wars_matrix) <- titles
# Name the rows with titles
colnames(star_wars_matrix) <- region
# Shoter way to name matrix
star_wars_matrix <- matrix(box_office, nrow = 3, byrow = TRUE, dimnames = list(titles, region))
# Print out star_wars_matrix
star_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
rbind() and
cbind()# Adding column for the matrix by cbind()
worldwide_vector <- rowSums(star_wars_matrix)
all_wars_matrix <- cbind(star_wars_matrix, worldwide_vector)
all_wars_matrix
## US non-US worldwide_vector
## A New Hope 460.998 314.4 775.398
## The Empire Strikes Back 290.475 247.9 538.375
## Return of the Jedi 309.306 165.8 475.106
# Adding a row to matrix by rbind()
star_wars_matrix2 <- star_wars_matrix - 20
all_wars_matrix <- rbind(star_wars_matrix, star_wars_matrix2)
all_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
## A New Hope 440.998 294.4
## The Empire Strikes Back 270.475 227.9
## Return of the Jedi 289.306 145.8
# Select the non-US revenue for all movies
non_us_all <- all_wars_matrix[,2]
non_us_all
## A New Hope The Empire Strikes Back Return of the Jedi
## 314.4 247.9 165.8
## A New Hope The Empire Strikes Back Return of the Jedi
## 294.4 227.9 145.8
# Select the non-US revenue for first two movies
non_us_some <- all_wars_matrix[c(1:2),2]
non_us_some
## A New Hope The Empire Strikes Back
## 314.4 247.9
rowSums() and colSums()# Calculate sum of rows of a matrix
worldwide_vector <- rowSums(star_wars_matrix)
worldwide_vector
## A New Hope The Empire Strikes Back Return of the Jedi
## 775.398 538.375 475.106
# Calculate sum of cols of a matrix
worldwide_vector <- colSums(star_wars_matrix)
worldwide_vector
## US non-US
## 1060.779 728.100
# Matrix can divide to other matrix
all_wars_matrix/all_wars_matrix
## US non-US
## A New Hope 1 1
## The Empire Strikes Back 1 1
## Return of the Jedi 1 1
## A New Hope 1 1
## The Empire Strikes Back 1 1
## Return of the Jedi 1 1
z<-data.frame(sex, height)
z
## # A tibble: 5 × 2
## sex height
## <chr> <dbl>
## 1 M 190
## 2 M 180
## 3 M 192
## 4 F 165
## 5 F 170
apply() functionapply(), which operates on arrays. For simplicity, the
tutorial limits itself to 2D arrays, which are also known as
matrices.
The R base manual tells you that it’s called as follows:
apply(X, MARGIN, FUN, ...) where:
X is an array or a matrix if the dimension of the
array is 2;
MARGIN is a variable defining how the function is
applied: when MARGIN=1, it applies over rows, whereas with
MARGIN=2, it works over columns. Note that when you use the
construct MARGIN=c(1,2), it applies to both rows and
columns; and
FUN, which is the function that you want to apply to
the data. It can be any R function, including a User Defined Function
(UDF)
apply(star_wars_matrix,2,FUN = sum) # sum of each column
## US non-US
## 1060.779 728.100
lapply() functionYou want to apply a given function to every element of a list and
obtain a list as a result. When you execute ?lapply, you
see that the syntax looks like the apply() function.
The difference is that:
It can be used for other objects like dataframes, lists or vectors; and
The output returned is a list (which explains the “l” in the function name), which has the same number of elements as the object passed to it.
lapply(mtcars,FUN = min) # Get the min value in each column
## $mpg
## [1] 10.4
##
## $cyl
## [1] 4
##
## $disp
## [1] 71.1
##
## $hp
## [1] 52
##
## $drat
## [1] 2.76
##
## $wt
## [1] 1.513
##
## $qsec
## [1] 14.5
##
## $vs
## [1] 0
##
## $am
## [1] 0
##
## $gear
## [1] 3
##
## $carb
## [1] 1
# The vector pioneers has already been created for you
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
# Split names from birth year
split_math <- strsplit(pioneers, split = ":")
# Convert to lowercase strings: split_low
split_low <- lapply(split_math, tolower)
# Take a look at the structure of split_low
split_low
## [[1]]
## [1] "gauss" "1777"
##
## [[2]]
## [1] "bayes" "1702"
##
## [[3]]
## [1] "pascal" "1623"
##
## [[4]]
## [1] "pearson" "1857"
str(split_low)
## List of 4
## $ : chr [1:2] "gauss" "1777"
## $ : chr [1:2] "bayes" "1702"
## $ : chr [1:2] "pascal" "1623"
## $ : chr [1:2] "pearson" "1857"
sapply() functionsapply() works as lapply, but it tries to simplify the
output to the most elementary data structure that is possible. In
effect, as can be seen in the base manual, sapply() is a
‘wrapper’ function for lapply.
sapply(mtcars, FUN=min) # Get the min value in each column
## mpg cyl disp hp drat wt qsec vs am gear carb
## 10.400 4.000 71.100 52.000 2.760 1.513 14.500 0.000 0.000 3.000 1.000
tapply() functiontapply(mtcars$mpg, mtcars$cyl,mean) # this function get the mean value of mgp in each cyl group
## 4 6 8
## 26.66364 19.74286 15.10000
while()# Initialize the speed variable
speed <- 64
# Code the while loop
while (speed > 30) {
print("Slow down")
speed <- speed - 7
}
## [1] "Slow down"
## [1] "Slow down"
## [1] "Slow down"
## [1] "Slow down"
## [1] "Slow down"
# Print out the speed variable
speed
## [1] 29
# Extend/adapt the while loop
speed <- 88
while (speed > 30) {
print(paste("Your speed is",speed))
if (speed > 48 ) {
print("Slow donw big time")
speed <- speed - 11
} else {
print("Slow down!")
speed <- speed - 6
}
}
## [1] "Your speed is 88"
## [1] "Slow donw big time"
## [1] "Your speed is 77"
## [1] "Slow donw big time"
## [1] "Your speed is 66"
## [1] "Slow donw big time"
## [1] "Your speed is 55"
## [1] "Slow donw big time"
## [1] "Your speed is 44"
## [1] "Slow down!"
## [1] "Your speed is 38"
## [1] "Slow down!"
## [1] "Your speed is 32"
## [1] "Slow down!"
# Break the loop
# Initialize the speed variable
speed <- 88
while (speed > 30) {
print(paste("Your speed is", speed))
# Break the while loop when speed exceeds 80
if (speed > 80 ) {
break
}
if (speed > 48) {
print("Slow down big time!")
speed <- speed - 11
} else {
print("Slow down!")
speed <- speed - 6
}
}
## [1] "Your speed is 88"
for()for() loopSuppose that we would like to draw a sample of 10 observations from N(0,1) 1000 times. To do this we can use a “for loop” in the following way. First, we define a vector that will be used to store the sample means (the R object mx ).
mx<-c(1:1000)
for(i in 1:1000)
{
x<-rnorm(10,0,1) # a random sample from N(0,1) with sample size = 10
mx[i]<-mean(x)
}
hist(mx,nclass=20)
for() loop for bootstrap# A sample of 10 observations
x <- c(11.201, 10.035, 11.118, 9.055, 9.434, 9.663, 10.403, 11.662, 9.285,8.84)
mean(x)
## [1] 10.0696
# Nonparametric bootstrap
n <- length(x)
B <- 1000
mx <- c(1:B)
set.seed(091296)
for(i in 1:B){
boot.i<- sample(x,n,replace=T)
mx[i]<- mean(boot.i)
}
var(mx)
## [1] 0.09047223
hist(mx,probability = TRUE)
# Parametric bootstrap
B <- 1000
MLx <- mean(x)
Varx <- var(x)
mx <- c(1:B)
for(i in 1:B){
boot.i <- rnorm(n,MLx,sqrt(Varx))
mx[i] <- mean(boot.i)
}
var(mx)
## [1] 0.09959499
hist(mx)
for() function with break and
next# The nyc list is already specified
nyc <- list(pop = 8405837,
boroughs = c("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island"),
capital = FALSE)
# Loop version 1
for (i in nyc){
print(i)
}
## [1] 8405837
## [1] "Manhattan" "Bronx" "Brooklyn" "Queens"
## [5] "Staten Island"
## [1] FALSE
# Loop version 2
for (i in 1:length(nyc)){
print(nyc[[i]])
}
## [1] 8405837
## [1] "Manhattan" "Bronx" "Brooklyn" "Queens"
## [5] "Staten Island"
## [1] FALSE
# Mix it up with control flow
# The linkedin vector has already been defined for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
# Code the for loop with conditionals
for (li in linkedin) {
if (li >10 ) {
print("You're popular!")
} else {
print("Be more visible!")
}
print(li)
}
## [1] "You're popular!"
## [1] 16
## [1] "Be more visible!"
## [1] 9
## [1] "You're popular!"
## [1] 13
## [1] "Be more visible!"
## [1] 5
## [1] "Be more visible!"
## [1] 2
## [1] "You're popular!"
## [1] 17
## [1] "You're popular!"
## [1] 14
# Use break
# Adapt/extend the for loop
for (li in linkedin) {
if (li > 10) {
print("You're popular!")
} else {
print("Be more visible!")
}
# Add if statement with break
if (li > 16){
print("This is ridiculous, I'm outta here!")
break }
# Add if statement with next
if (li <5){
print("This is too embarrassing!")
next # the current evaluation stops (value is not printed) but the loop continues with the next iteration.
}
print(li)
}
## [1] "You're popular!"
## [1] 16
## [1] "Be more visible!"
## [1] 9
## [1] "You're popular!"
## [1] 13
## [1] "Be more visible!"
## [1] 5
## [1] "Be more visible!"
## [1] "This is too embarrassing!"
## [1] "You're popular!"
## [1] "This is ridiculous, I'm outta here!"
# Counting how many r before u
# Pre-defined variables
rquote <- "r's internals are irrefutably intriguing"
chars <- strsplit(rquote, split = "")[[1]]
chars
## [1] "r" "'" "s" " " "i" "n" "t" "e" "r" "n" "a" "l" "s" " " "a" "r" "e" " " "i"
## [20] "r" "r" "e" "f" "u" "t" "a" "b" "l" "y" " " "i" "n" "t" "r" "i" "g" "u" "i"
## [39] "n" "g"
# Initialize rcount
rcount <- 0
# Finish the for loop
for (char in chars) {
if(char == "r"){
rcount <- rcount + 1
}
if (char == "u"){
break
}
}
# Print out rcount
rcount
## [1] 5
function()fun20<-function(x) {
mean.x<-mean(x)
med.x<-median(x)
hist(x)
return(c(mean.x,med.x))
}
fun20(chickwts$weight)
## [1] 261.3099 258.0000
function()# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)
# The interpret() can be used inside interpret_all()
interpret <- function(num_views) {
if (num_views > 15) {
print("You're popular!")
return(num_views)
} else {
print("Try to be more visible!")
return(0)
}
}
# Define the interpret_all() function
# views: vector with data to interpret
# return_sum: return total number of views on popular days?
interpret_all <- function(views, return_sum = TRUE) {
count <- 0
for (v in views) {
count <- count + interpret(v)
}
if (return_sum == TRUE) {
return(count)
} else {
return(NULL)
}
}
# Call the interpret_all() function on both linkedin and facebook
interpret_all(linkedin)
## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] 33
interpret_all(facebook)
## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] 33
sum.stat <- function(x){
mean <- mean(x)
median <- median(x)
variance <- var(x)
IQR <- quantile(x)
range <- range(x)
return(list(
mean = mean,
median = median,
variance = variance,
quartiles = IQR,
range = range)
)
}
sum.stat(linelist$bmi)
## $mean
## [1] 46.89023
##
## $median
## [1] 32.12396
##
## $variance
## [1] 3067.862
##
## $quartiles
## 0% 25% 50% 75% 100%
## -1200.00000 24.56033 32.12396 50.00676 1250.00000
##
## $range
## [1] -1200 1250
identical()identical(linkedin,facebook)
## [1] FALSE
str_split()# The vector pioneers has already been created for you
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
# Split names from birth year
split_math <- str_split(pioneers, pattern = ":", simplify = T)
split_math
## [,1] [,2]
## [1,] "GAUSS" "1777"
## [2,] "BAYES" "1702"
## [3,] "PASCAL" "1623"
## [4,] "PEARSON" "1857"
split_math[,1]
## [1] "GAUSS" "BAYES" "PASCAL" "PEARSON"
split_math[,2]
## [1] "1777" "1702" "1623" "1857"
str_sub()# Start at the end of the characters: use "-"
pioneers_number <- str_sub(pioneers, start = -4, end = -1)
pioneers_number
## [1] "1777" "1702" "1623" "1857"
# Start at the beginning of the characters until a specific character
pioneers_number <- str_sub(pioneers,
start = 1,
end = regexpr(":", pioneers, fixed = TRUE) - 1 )
pioneers_number
## [1] "GAUSS" "BAYES" "PASCAL" "PEARSON"
seq(): Generate sequences, by specifying the from,
to, and by arguments.
rep(): Replicate elements of vectors and
lists.
sort(): Sort a vector in ascending order. Works on
numerics, but also on character strings and logicals.
rev(): Reverse the elements in a data structures for
which reversal is defined.
str(): Display the structure of any R
object.
append(): Merge vectors or lists.
is.*(): Check for the class of an R object.
as.*(): Convert an R object from one class to
another.
unlist(): Flatten (possibly embedded) lists to
produce a vector.
sub()
and gsub() metacharacter“.*”: A usual suspect! It can be read as”any character that is matched zero or more times”.
\s: Match a space. The “s” is normally a character, escaping it (\) makes it a metacharacter.
[0-9]+: Match the numbers 0 to 9, at least once (+).
([0-9]+): The parentheses are used to make parts of the matching string available to define the replacement. The \1 in the replacement argument of sub() gets set to the string that is captured by the regular expression [0-9]+.
awards <- c("Won 1 Oscar.",
"Won 1 Oscar. Another 9 wins & 24 nominations.",
"1 win and 2 nominations.",
"2 wins & 3 nominations.",
"Nominated for 2 Golden Globes. 1 more win & 2 nominations.",
"4 wins & 1 nomination.")
sub(".*\\s([0-9]+)\\snomination.*$", "\\1", awards)
## [1] "Won 1 Oscar." "24" "2" "3" "2"
## [6] "1"
# Definition of character strings representing dates
str1 <- "May 23, '96"
str2 <- "2012-03-15"
str3 <- "30/January/2006"
# Convert the strings to dates: date1, date2, date3
date1 <- as.Date(str1, format = "%b %d, '%y")
date2 <- as.Date(str2, format = "%Y-%m-%d")
date3 <- as.Date(str3, format = "%d/%B/%Y")
# Convert dates to formatted strings
format(date1, "%A")
## [1] "Thursday"
format(date2, "%d")
## [1] "15"
format(date3, "%b %Y")
## [1] "Jan 2006"
# Definition of character strings representing times
str1 <- "May 23, '96 hours:23 minutes:01 seconds:45"
str2 <- "2012-3-12 14:23:08"
# Convert the strings to POSIXct objects: time1, time2
time1 <- as.POSIXct(str1, format = "%B %d, '%y hours:%H minutes:%M seconds:%S")
time2 <- as.POSIXct(str2, format = "%Y-%m-%d %T")
# Convert times to formatted strings
format(time1, format = "%M")
## [1] "01"
format(time2, format = "%I:%M %p")
## [1] "02:23 PM"
par() functionpar(mfrow=c(1,2)) #put 2 figures in the same page
plot(airquality$Wind,airquality$Ozone)
hist(airquality$Ozone)
plot(airquality$Wind,airquality$Ozone)
# Frequecy histogram
hist(airquality$Ozone)
abline(v = mean(airquality$Ozone,na.rm = TRUE), col = 2, lwd = 2, lty = 2)
abline(v = median(airquality$Ozone,na.rm = TRUE), col = 1, lwd = 2, lty = 2)
# Probability histogram with density line
hist(airquality$Ozone, probability = TRUE)
lines(density(airquality$Ozone, na.rm=TRUE))
boxplot(split(chickwts$weight,chickwts$feed))
x <- rnorm(1000, 0, 1)
qqnorm(x)
abline(0, 1)
t.test()# Two sample test
t.obj.2 <- t.test(sleep$extra~sleep$group,var.equal=TRUE)
t.obj.2
##
## Two Sample t-test
##
## data: sleep$extra by sleep$group
## t = -1.8608, df = 18, p-value = 0.07919
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -3.363874 0.203874
## sample estimates:
## mean in group 1 mean in group 2
## 0.75 2.33
summary(t.obj.2)
## Length Class Mode
## statistic 1 -none- numeric
## parameter 1 -none- numeric
## p.value 1 -none- numeric
## conf.int 2 -none- numeric
## estimate 2 -none- numeric
## null.value 1 -none- numeric
## stderr 1 -none- numeric
## alternative 1 -none- character
## method 1 -none- character
## data.name 1 -none- character
# One sample test
t.obj.1 <- t.test(linelist$bmi,mu=25)
t.obj.1
##
## One Sample t-test
##
## data: linelist$bmi
## t = 30.326, df = 5887, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 25
## 95 percent confidence interval:
## 45.47518 48.30528
## sample estimates:
## mean of x
## 46.89023
summary(t.obj.1)
## Length Class Mode
## statistic 1 -none- numeric
## parameter 1 -none- numeric
## p.value 1 -none- numeric
## conf.int 2 -none- numeric
## estimate 1 -none- numeric
## null.value 1 -none- numeric
## stderr 1 -none- numeric
## alternative 1 -none- character
## method 1 -none- character
## data.name 1 -none- character
shapiro.test()shapiro.test(linelist$ht_cm[1:5000])
##
## Shapiro-Wilk normality test
##
## data: linelist$ht_cm[1:5000]
## W = 0.99016, p-value < 2.2e-16
chisq.test()chisq.test(linelist$gender,linelist$fever)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: linelist$gender and linelist$fever
## X-squared = 0.0089611, df = 1, p-value = 0.9246
aov()Fit.aov<-aov(chickwts$weight~chickwts$feed)
summary(Fit.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## chickwts$feed 5 231129 46226 15.37 5.94e-10 ***
## Residuals 65 195556 3009
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm() functionlm() to fit the linear regression
model# We use cars dataset to fit simple linear regression between distance and speed
fit.lm <- lm(cars$dist ~ cars$speed)
fit.lm
##
## Call:
## lm(formula = cars$dist ~ cars$speed)
##
## Coefficients:
## (Intercept) cars$speed
## -17.579 3.932
summary() to see the parameter estimates,
standard errors, t-test (and p-value)summary(fit.lm)
##
## Call:
## lm(formula = cars$dist ~ cars$speed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## cars$speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
fit.lm$fit contains fitted valuesfit.lm$fitted.values
## 1 2 3 4 5 6 7 8
## -1.849460 -1.849460 9.947766 9.947766 13.880175 17.812584 21.744993 21.744993
## 9 10 11 12 13 14 15 16
## 21.744993 25.677401 25.677401 29.609810 29.609810 29.609810 29.609810 33.542219
## 17 18 19 20 21 22 23 24
## 33.542219 33.542219 33.542219 37.474628 37.474628 37.474628 37.474628 41.407036
## 25 26 27 28 29 30 31 32
## 41.407036 41.407036 45.339445 45.339445 49.271854 49.271854 49.271854 53.204263
## 33 34 35 36 37 38 39 40
## 53.204263 53.204263 53.204263 57.136672 57.136672 57.136672 61.069080 61.069080
## 41 42 43 44 45 46 47 48
## 61.069080 61.069080 61.069080 68.933898 72.866307 76.798715 76.798715 76.798715
## 49 50
## 76.798715 80.731124
plot(cars$speed, cars$dist)
lines(cars$speed, fit.lm$fitted.values)
title("data and fitted model")
par(mfrow = c(1, 2))
hist(fit.lm$residuals)
qqnorm(fit.lm$residuals)
fit.lmplot(fit.lm)
cor.test()cor.test(linelist$generation,linelist$age, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: linelist$generation and linelist$age
## t = -1.6908, df = 5800, p-value = 0.09093
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.047900342 0.003538572
## sample estimates:
## cor
## -0.02219557
table()table(linelist$gender,linelist$outcome, useNA = "always")
##
## Death Recover <NA>
## f 1227 953 627
## m 1228 950 625
## <NA> 127 80 71
density()x <- rnorm(5,0,2)
density(x)
##
## Call:
## density.default(x = x)
##
## Data: x (5 obs.); Bandwidth 'bw' = 1.914
##
## x y
## Min. :-9.5033 Min. :0.000473
## 1st Qu.:-4.8584 1st Qu.:0.010583
## Median :-0.2135 Median :0.053670
## Mean :-0.2135 Mean :0.053737
## 3rd Qu.: 4.4313 3rd Qu.:0.091137
## Max. : 9.0762 Max. :0.117555
sort(x)
## [1] -3.7601268 -1.3809010 0.9843555 2.6441446 3.3330697
quantile()quantile(faithful$eruptions)
## 0% 25% 50% 75% 100%
## 1.60000 2.16275 4.00000 4.45425 5.10000
stem(faithful$eruptions)
##
## The decimal point is 1 digit(s) to the left of the |
##
## 16 | 070355555588
## 18 | 000022233333335577777777888822335777888
## 20 | 00002223378800035778
## 22 | 0002335578023578
## 24 | 00228
## 26 | 23
## 28 | 080
## 30 | 7
## 32 | 2337
## 34 | 250077
## 36 | 0000823577
## 38 | 2333335582225577
## 40 | 0000003357788888002233555577778
## 42 | 03335555778800233333555577778
## 44 | 02222335557780000000023333357778888
## 46 | 0000233357700000023578
## 48 | 00000022335800333
## 50 | 0370
The stem-and-leaf plot of the Old faithful geyser data reveals a clear bimodel for the duration ofeuroption.
hist(faithful$eruptions, nclass = 20)
normal <- rnorm(n = 1000, mean = 10, sd = sqrt(5))
hist(normal, nclass = 20)
unif <- runif(1000, min = 10, max = 100)
hist(unif)
geom_jitter()ggplot(singer, aes(voice.part,height)) +
geom_jitter(position = position_jitter(width = .05))
geom_point()# Normal scatter plot
ggplot(singer, aes(voice.part,height)) +
geom_point() +
stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)
# With v-line
child %>%
drop_na(HAZ, weight) %>%
mutate(new_haz_cat = case_when(
HAZ < -2 & HAZ >= -3 ~'moderate stunting',
HAZ >= -2 & HAZ <6 ~ 'normal',
HAZ < -3 & HAZ>= -6 ~'severe stunting',
HAZ < -6 & HAZ > 6 ~ "outliers")) %>%
filter(new_haz_cat != "outlier") %>%
ggplot() +
geom_point(aes(x = HAZ, y = bmi, color=new_haz_cat), alpha = 0.5) +
geom_vline(xintercept = -3, color = "dark grey") +
geom_vline(xintercept = -2, color = "dark grey") +
theme_classic()
## Warning: Removed 6 rows containing missing values (`geom_point()`).
geom_histogram()# Basic historgram
linelist %>%
drop_na(age,gender) %>%
mutate(gender = factor(gender, levels = c("f","m"), labels = c("Female", "Male"))) %>% ggplot() +
geom_histogram(aes(x = age), colour = "white") +
labs(y= "number of patients", x="Age") +
guides(fill=guide_legend(title="Gender")) +
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Stacked histogram
linelist %>%
drop_na(age,gender) %>%
mutate(gender = factor(gender, levels = c("f","m"), labels = c("Female", "Male"))) %>% ggplot() +
geom_histogram(aes(x = age,fill = gender), colour = "white") +
labs(y= "number of patients", x="Age") +
guides(fill=guide_legend(title="Gender")) +
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
geom_density()ggplot(singer, aes(height)) +
geom_density() +
geom_histogram(aes(y=..density..)) +
facet_wrap(~voice.part, scales = "free")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
geom_bar# Y-axis is count
linelist %>%
drop_na(age_cat) %>%
ggplot() +
geom_bar(aes(x=age_cat),
alpha=1,
colour = "white",
width = 1, position="dodge") +
labs(y= "Number of patients", x="age group") +
theme_classic()
# Y-axis is proportion
linelist %>%
drop_na(age_cat, gender) %>%
mutate(gender = factor(gender, levels = c("f","m"), labels = c("Female", "Male"))) %>%
ggplot(aes(fill = gender)) +
geom_bar(aes(x = age_cat, y = after_stat(count/sum(count)*100)),
alpha=1,
colour = "white",
width = 1, position="dodge") +
labs(y= "proportion of patients", x="age group") +
guides(fill=guide_legend(title="Gender")) +
theme_classic() +
facet_wrap(~gender)
geom_col()linelist %>%
drop_na(age_cat, gender) %>%
mutate(gender = factor(gender, levels = c("f","m"), labels = c("Female", "Male"))) %>%
group_by(gender, age_cat) %>%
summarise(count = n()) %>%
ggplot(aes(fill = gender)) +
geom_col(aes(x = age_cat, y = count),
alpha=1,
colour = "white") +
labs(y= "proportion of patients", x="age group") +
guides(fill=guide_legend(title="Gender")) +
theme_classic() +
facet_wrap(~gender)
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
data1 <- titanic %>%
drop_na() %>%
group_by(Survived, Sex, PClass) %>%
count() %>%
group_by(Sex, Survived) %>%
mutate(percent = round(n/sum(n)*100,2),
pos = cumsum(percent)- 0.5*percent) %>% #pos variable used for label in pie chart, old way
ungroup()
data1
## # A tibble: 12 × 6
## Survived Sex PClass n percent pos
## <int> <fct> <fct> <int> <dbl> <dbl>
## 1 0 female 1st 5 7.04 3.52
## 2 0 female 2nd 10 14.1 14.1
## 3 0 female 3rd 56 78.9 60.6
## 4 0 male 1st 82 22.0 11.0
## 5 0 male 2nd 106 28.5 36.3
## 6 0 male 3rd 184 49.5 75.3
## 7 1 female 1st 96 44.2 22.1
## 8 1 female 2nd 75 34.6 61.5
## 9 1 female 3rd 46 21.2 89.4
## 10 1 male 1st 43 44.8 22.4
## 11 1 male 2nd 21 21.9 55.7
## 12 1 male 3rd 32 33.3 83.3
# More updated way
data1 %>%
mutate(label = paste0(percent,"%")) %>%
ggplot(aes(x = "", y =percent, fill = PClass)) +
geom_col() +
geom_label(aes(label = label),size =5 ,
position = position_stack(vjust =0.5),show.legend = FALSE) +
coord_polar("y", start = 0)+
labs(x = "Survived", y ="")+
theme(panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
facet_grid(Survived~Sex)
# Old way
data1 %>%
mutate(label = paste0(percent,"%")) %>%
ggplot(aes(x=1,y = percent, fill = PClass)) +
geom_col() +
geom_label(aes(x = 1 ,y = 100-pos, label = label))+
coord_polar("y", start = 0)+
labs(x = "Survived", y ="")+
theme(panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
facet_grid(Survived~Sex)
geom_box()ggplot(singer, aes(x = voice.part,height)) +
geom_boxplot()
geom_violin()ggplot(singer, aes(voice.part,height)) +
geom_violin()
geom_density_ridges()ggplot(singer, aes(x=height,y=voice.part,fill = voice.part)) +
geom_density_ridges(alpha = 0.5) +
theme_ridges() +
theme(legend.position = "none")
## Picking joint bandwidth of 1.09
head(linelist)
## # A tibble: 6 × 30
## case_id generation date_infe…¹ date_onset date_hos…² date_out…³ outcome gender
## <chr> <dbl> <date> <date> <date> <date> <chr> <chr>
## 1 5fe599 4 2014-05-08 2014-05-13 2014-05-15 NA <NA> m
## 2 8689b7 4 NA 2014-05-13 2014-05-14 2014-05-18 Recover f
## 3 11f8ea 2 NA 2014-05-16 2014-05-18 2014-05-30 Recover m
## 4 b8812a 3 2014-05-04 2014-05-18 2014-05-20 NA <NA> f
## 5 893f25 3 2014-05-18 2014-05-21 2014-05-22 2014-05-29 Recover m
## 6 be99c8 3 2014-05-03 2014-05-22 2014-05-23 2014-05-24 Recover f
## # … with 22 more variables: age <dbl>, age_unit <chr>, age_years <dbl>,
## # age_cat <fct>, age_cat5 <fct>, hospital <chr>, lon <dbl>, lat <dbl>,
## # infector <chr>, source <chr>, wt_kg <dbl>, ht_cm <dbl>, ct_blood <dbl>,
## # fever <chr>, chills <chr>, cough <chr>, aches <chr>, vomit <chr>,
## # temp <dbl>, time_admission <chr>, bmi <dbl>, days_onset_hosp <dbl>, and
## # abbreviated variable names ¹date_infection, ²date_hospitalisation,
## # ³date_outcome
skimr::skim(linelist)
| Name | linelist |
| Number of rows | 5888 |
| Number of columns | 30 |
| _______________________ | |
| Column type frequency: | |
| character | 13 |
| Date | 4 |
| factor | 2 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| case_id | 0 | 1.00 | 6 | 6 | 0 | 5888 | 0 |
| outcome | 1323 | 0.78 | 5 | 7 | 0 | 2 | 0 |
| gender | 278 | 0.95 | 1 | 1 | 0 | 2 | 0 |
| age_unit | 0 | 1.00 | 5 | 6 | 0 | 2 | 0 |
| hospital | 0 | 1.00 | 5 | 36 | 0 | 6 | 0 |
| infector | 2088 | 0.65 | 6 | 6 | 0 | 2697 | 0 |
| source | 2088 | 0.65 | 5 | 7 | 0 | 2 | 0 |
| fever | 249 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| chills | 249 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| cough | 249 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| aches | 249 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| vomit | 249 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| time_admission | 765 | 0.87 | 5 | 5 | 0 | 1072 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date_infection | 2087 | 0.65 | 2014-03-19 | 2015-04-27 | 2014-10-11 | 359 |
| date_onset | 256 | 0.96 | 2014-04-07 | 2015-04-30 | 2014-10-23 | 367 |
| date_hospitalisation | 0 | 1.00 | 2014-04-17 | 2015-04-30 | 2014-10-23 | 363 |
| date_outcome | 936 | 0.84 | 2014-04-19 | 2015-06-04 | 2014-11-01 | 371 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| age_cat | 86 | 0.99 | FALSE | 8 | 0-4: 1095, 5-9: 1095, 20-: 1073, 10-: 941 |
| age_cat5 | 86 | 0.99 | FALSE | 17 | 0-4: 1095, 5-9: 1095, 10-: 941, 15-: 743 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| generation | 0 | 1.00 | 16.56 | 5.79 | 0.00 | 13.00 | 16.00 | 20.00 | 37.00 | ▁▆▇▂▁ |
| age | 86 | 0.99 | 16.07 | 12.62 | 0.00 | 6.00 | 13.00 | 23.00 | 84.00 | ▇▅▁▁▁ |
| age_years | 86 | 0.99 | 16.02 | 12.64 | 0.00 | 6.00 | 13.00 | 23.00 | 84.00 | ▇▅▁▁▁ |
| lon | 0 | 1.00 | -13.23 | 0.02 | -13.27 | -13.25 | -13.23 | -13.22 | -13.21 | ▅▃▃▆▇ |
| lat | 0 | 1.00 | 8.47 | 0.01 | 8.45 | 8.46 | 8.47 | 8.48 | 8.49 | ▅▇▇▇▆ |
| wt_kg | 0 | 1.00 | 52.64 | 18.58 | -11.00 | 41.00 | 54.00 | 66.00 | 111.00 | ▁▃▇▅▁ |
| ht_cm | 0 | 1.00 | 124.96 | 49.52 | 4.00 | 91.00 | 129.00 | 159.00 | 295.00 | ▂▅▇▂▁ |
| ct_blood | 0 | 1.00 | 21.21 | 1.69 | 16.00 | 20.00 | 22.00 | 22.00 | 26.00 | ▁▃▇▃▁ |
| temp | 149 | 0.97 | 38.56 | 0.98 | 35.20 | 38.20 | 38.80 | 39.20 | 40.80 | ▁▂▂▇▁ |
| bmi | 0 | 1.00 | 46.89 | 55.39 | -1200.00 | 24.56 | 32.12 | 50.01 | 1250.00 | ▁▁▇▁▁ |
| days_onset_hosp | 256 | 0.96 | 2.06 | 2.26 | 0.00 | 1.00 | 1.00 | 3.00 | 22.00 | ▇▁▁▁▁ |
duplicated() and distinct()# Basic R
sum(duplicated(linelist)) #how many r duplicated lines
## [1] 0
linelist[duplicated(linelist),] # see which row is duplicated
## # A tibble: 0 × 30
## # … with 30 variables: case_id <chr>, generation <dbl>, date_infection <date>,
## # date_onset <date>, date_hospitalisation <date>, date_outcome <date>,
## # outcome <chr>, gender <chr>, age <dbl>, age_unit <chr>, age_years <dbl>,
## # age_cat <fct>, age_cat5 <fct>, hospital <chr>, lon <dbl>, lat <dbl>,
## # infector <chr>, source <chr>, wt_kg <dbl>, ht_cm <dbl>, ct_blood <dbl>,
## # fever <chr>, chills <chr>, cough <chr>, aches <chr>, vomit <chr>,
## # temp <dbl>, time_admission <chr>, bmi <dbl>, days_onset_hosp <dbl>
linelist_c <- unique(linelist)
# Tidyverse
linelist_c <- linelist %>%
distinct()
select() to keep, remove or change the reorder of the
columnslinelist_c <- linelist %>%
select(case_id, generation, date_infection)
names(linelist_c)
## [1] "case_id" "generation" "date_infection"
# Remove column
linelist_c <- linelist %>%
select(-case_id)
names(linelist_c)
## [1] "generation" "date_infection" "date_onset"
## [4] "date_hospitalisation" "date_outcome" "outcome"
## [7] "gender" "age" "age_unit"
## [10] "age_years" "age_cat" "age_cat5"
## [13] "hospital" "lon" "lat"
## [16] "infector" "source" "wt_kg"
## [19] "ht_cm" "ct_blood" "fever"
## [22] "chills" "cough" "aches"
## [25] "vomit" "temp" "time_admission"
## [28] "bmi" "days_onset_hosp"
# Change order
linelist_c <- linelist %>%
select(case_id, date_infection, generation)
mutate()linelist_c <- linelist %>%
mutate(bmi = wt_kg/((ht_cm/100)^2))
head(linelist_c$bmi)
## [1] 117.18750 71.81844 16.06525 22.49657 71.41440 41.61712
mutate() and across()str(linelist$outcome)
## chr [1:5888] NA "Recover" "Recover" NA "Recover" "Recover" "Recover" ...
linelist_c <- linelist %>%
mutate(across(.cols = c(outcome, gender),
.fns = as.factor))
str(linelist_c$outcome)
## Factor w/ 2 levels "Death","Recover": NA 2 2 NA 2 2 2 1 2 1 ...
mutate() and case_when()linelist_c <- linelist %>%
mutate(bmi = as.numeric(bmi),
bmi_group = case_when(
bmi <= 18.5 ~ "<=18.5",
bmi >"30" ~ ">30",
TRUE ~ "Other"))
table(linelist_c$bmi_group)
##
## <=18.5 >30 Other
## 409 2796 2683
pivot_longer()DT::datatable(data_wide, option = list(pageLength=10))
data_long <- data_wide %>%
pivot_longer(cols = !person, names_to = "point" )
DT::datatable(data_long, option = list(pageLength=10))
pivot_wider()DT::datatable(data_long, option = list(pageLength=10))
data_wide <- data_long %>%
pivot_wider(names_from = point, values_from = value)
DT::datatable(data_wide, option = list(pageLength=10))
tidyr.summary.wide <- linelist %>%
drop_na(bmi,temp) %>%
summarise(across(.col = c(bmi,temp),
.fns = list("mean" = mean,
"var" = var,
"sd" = sd,
"min" = min,
"max" = max,
"iqr" = IQR,
"1stquantile" = ~quantile(., probs = 0.25,na.rm = TRUE),
"3rdquantile" = ~quantile(.,probs = 0.75,na.rm = TRUE))))
DT::datatable(tidyr.summary.wide,
options = list(pageLength = 25,
scrollX = TRUE,
scrollCollapse = TRUE))
tidyr.summary.long <- tidyr.summary.wide %>%
pivot_longer(cols = everything(),
names_to = "parameters",
values_to = "values")
DT::datatable(tidyr.summary.long, options = list(pageLength = 25))
# Normal table
linelist %>%
select(bmi, temp) %>%
tbl_summary(type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ c(
"{mean}({sd})",
"{median} ({p25}-{p75})",
"{min}-{max}"),
missing = "always")
| Characteristic | N = 5,888 |
|---|---|
| bmi | |
| Mean(SD) | 47(55) |
| Median (25%-75%) | 32 (25-50) |
| Minimum-Maximum | -1,200-1,250 |
| Unknown | 0 |
| temp | |
| Mean(SD) | 38.56(0.98) |
| Median (25%-75%) | 38.80 (38.20-39.20) |
| Minimum-Maximum | 35.20-40.80 |
| Unknown | 149 |
# By group, and modify header with save to docx
my_table_gt <- PIPO_R %>%
select(-ID) %>%
mutate(Gender_Child = factor(Gender_Child,labels = c("Boy", "Girl"))) %>%
tbl_summary(by = Gender_Child,
label = list(FEV1pre ~ "FEV1",
FVCpre ~ "FVC",
WEIGHT ~ "Wgt (kg)",
LENGTH ~ "Length (cm)",
AGE ~ "Age (yrs)"),
# type = all_continuous()~"continuous2",
statistic = all_continuous()~c("{median} [{IQR}]"),
missing_text = "(Missing)") %>%
add_overall() %>%
modify_header(all_stat_cols() ~ "**{level}**\nN = {N}")
my_table_gt
| Characteristic | Overall N = 4961 | Boy N = 4961 | Girl N = 4961 |
|---|---|---|---|
| Wgt (kg) | 30.0 [6.0] | 30.0 [5.5] | 29.5 [6.5] |
| Length (cm) | 136 [8] | 137 [7] | 136 [10] |
| FEV1 | 1.89 [0.36] | 1.94 [0.38] | 1.85 [0.34] |
| FVC | 2.18 [0.43] | 2.28 [0.42] | 2.11 [0.41] |
| Age (yrs) | 9.12 [0.72] | 9.13 [0.69] | 9.12 [0.74] |
| 1 Median [IQR] | |||
my_table_gt_flex <- as_flex_table(my_table_gt)
my_table_gt_flex <- my_table_gt_flex %>%
align(i = 1, align = "center", part = "header") %>%
border_remove() %>%
hline(part="body") %>%
hline_top() %>%
bg(part = "header",bg = "#E5E7E9") %>%
fontsize(i=1, part = "header", size = 12)
my_table_gt_flex
Characteristic | Overall | Boy | Girl |
|---|---|---|---|
Wgt (kg) | 30.0 [6.0] | 30.0 [5.5] | 29.5 [6.5] |
Length (cm) | 136 [8] | 137 [7] | 136 [10] |
FEV1 | 1.89 [0.36] | 1.94 [0.38] | 1.85 [0.34] |
FVC | 2.18 [0.43] | 2.28 [0.42] | 2.11 [0.41] |
Age (yrs) | 9.12 [0.72] | 9.13 [0.69] | 9.12 [0.74] |
1Median [IQR] | |||
save_as_docx(my_table_gt_flex,path ="my_table_gt.docx")
linelist %>%
get_summary_stats(
bmi, temp,
type = "common")
## # A tibble: 2 × 10
## variable n min max median iqr mean sd se ci
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bmi 5888 -1200 1250 32.1 25.4 46.9 55.4 0.722 1.42
## 2 temp 5739 35.2 40.8 38.8 1 38.6 0.977 0.013 0.025
#Overall data
table1(~age+hospital+bmi+temp, data = linelist)
| Overall (N=5888) |
|
|---|---|
| age | |
| Mean (SD) | 16.1 (12.6) |
| Median [Min, Max] | 13.0 [0, 84.0] |
| Missing | 86 (1.5%) |
| hospital | |
| Central Hospital | 454 (7.7%) |
| Military Hospital | 896 (15.2%) |
| Missing | 1469 (24.9%) |
| Other | 885 (15.0%) |
| Port Hospital | 1762 (29.9%) |
| St. Mark's Maternity Hospital (SMMH) | 422 (7.2%) |
| bmi | |
| Mean (SD) | 46.9 (55.4) |
| Median [Min, Max] | 32.1 [-1200, 1250] |
| temp | |
| Mean (SD) | 38.6 (0.977) |
| Median [Min, Max] | 38.8 [35.2, 40.8] |
| Missing | 149 (2.5%) |
# Stratify
table1(~age+hospital+bmi+temp|gender,
data = linelist[!is.na(linelist$gender),])
| f (N=2807) |
m (N=2803) |
Overall (N=5610) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 12.7 (9.58) | 19.6 (14.3) | 16.1 (12.6) |
| Median [Min, Max] | 11.0 [0, 52.0] | 17.0 [0, 84.0] | 13.0 [0, 84.0] |
| hospital | |||
| Central Hospital | 202 (7.2%) | 230 (8.2%) | 432 (7.7%) |
| Military Hospital | 421 (15.0%) | 435 (15.5%) | 856 (15.3%) |
| Missing | 700 (24.9%) | 696 (24.8%) | 1396 (24.9%) |
| Other | 418 (14.9%) | 423 (15.1%) | 841 (15.0%) |
| Port Hospital | 857 (30.5%) | 823 (29.4%) | 1680 (29.9%) |
| St. Mark's Maternity Hospital (SMMH) | 209 (7.4%) | 196 (7.0%) | 405 (7.2%) |
| bmi | |||
| Mean (SD) | 55.5 (72.9) | 38.4 (28.1) | 46.9 (55.9) |
| Median [Min, Max] | 36.8 [-1200, 1250] | 29.0 [10.9, 269] | 32.1 [-1200, 1250] |
| temp | |||
| Mean (SD) | 38.6 (0.979) | 38.6 (0.975) | 38.6 (0.977) |
| Median [Min, Max] | 38.8 [35.2, 40.8] | 38.8 [35.8, 40.6] | 38.8 [35.2, 40.8] |
| Missing | 72 (2.6%) | 76 (2.7%) | 148 (2.6%) |
# If you want to add Q1 and Q3
render.new <- function(x){
mean <- mean(x, na.rm=TRUE)
sd <- sd(x, na.rm=TRUE)
median <- median(x, na.rm = TRUE)
Q1 <- quantile(x,probs = 0.25, na.rm=TRUE)
Q3 <- quantile(x,probs = 0.75, na.rm=TRUE)
out <- c("",
"Median [Q1, Q3]" = paste0(sprintf("%.2f", median), " [",
sprintf("%.2f", Q1), ", ",
sprintf("%.2f", Q3), "]"),
"Mean (SD)" = paste0(sprintf("%.2f", mean), " (",
sprintf("%.2f", sd), ")"))
out
}
table1(~age+hospital+bmi+temp|gender,
data = linelist[!is.na(linelist$gender),],
render.continuous = render.new)
| f (N=2807) |
m (N=2803) |
Overall (N=5610) |
|
|---|---|---|---|
| age | |||
| Median [Q1, Q3] | 11.00 [5.00, 19.00] | 17.00 [8.00, 28.00] | 13.00 [6.00, 23.00] |
| Mean (SD) | 12.66 (9.58) | 19.57 (14.26) | 16.11 (12.62) |
| hospital | |||
| Central Hospital | 202 (7.2%) | 230 (8.2%) | 432 (7.7%) |
| Military Hospital | 421 (15.0%) | 435 (15.5%) | 856 (15.3%) |
| Missing | 700 (24.9%) | 696 (24.8%) | 1396 (24.9%) |
| Other | 418 (14.9%) | 423 (15.1%) | 841 (15.0%) |
| Port Hospital | 857 (30.5%) | 823 (29.4%) | 1680 (29.9%) |
| St. Mark's Maternity Hospital (SMMH) | 209 (7.4%) | 196 (7.0%) | 405 (7.2%) |
| bmi | |||
| Median [Q1, Q3] | 36.85 [27.13, 63.04] | 29.04 [22.65, 40.32] | 32.14 [24.56, 50.00] |
| Mean (SD) | 55.50 (72.86) | 38.36 (28.06) | 46.93 (55.88) |
| temp | |||
| Median [Q1, Q3] | 38.80 [38.20, 39.20] | 38.80 [38.20, 39.20] | 38.80 [38.20, 39.20] |
| Mean (SD) | 38.57 (0.98) | 38.55 (0.97) | 38.56 (0.98) |
| Missing | 72 (2.6%) | 76 (2.7%) | 148 (2.6%) |
tabyl <- linelist %>% # case linelist
tabyl(age_cat5, gender,show_na = FALSE) %>% # cross-tabulate counts
adorn_totals(where = "row") %>% # add a total row
adorn_percentages(denominator = "col") %>% # convert to proportions
adorn_pct_formatting() %>% # convert to percents
adorn_ns(position = "front") %>% # display as: "count (percent)"
adorn_title( # adjust titles
row_name = "Age Category",
col_name = "Gender")
knitr::kable(tabyl)
| Gender | ||
|---|---|---|
| Age Category | f | m |
| 0-4 | 640 (22.8%) | 416 (14.8%) |
| 5-9 | 641 (22.8%) | 412 (14.7%) |
| 10-14 | 518 (18.5%) | 383 (13.7%) |
| 15-19 | 359 (12.8%) | 364 (13.0%) |
| 20-24 | 305 (10.9%) | 316 (11.3%) |
| 25-29 | 163 (5.8%) | 259 (9.2%) |
| 30-34 | 104 (3.7%) | 213 (7.6%) |
| 35-39 | 42 (1.5%) | 157 (5.6%) |
| 40-44 | 25 (0.9%) | 107 (3.8%) |
| 45-49 | 8 (0.3%) | 80 (2.9%) |
| 50-54 | 2 (0.1%) | 37 (1.3%) |
| 55-59 | 0 (0.0%) | 30 (1.1%) |
| 60-64 | 0 (0.0%) | 12 (0.4%) |
| 65-69 | 0 (0.0%) | 12 (0.4%) |
| 70-74 | 0 (0.0%) | 4 (0.1%) |
| 75-79 | 0 (0.0%) | 0 (0.0%) |
| 80-84 | 0 (0.0%) | 1 (0.0%) |
| 85+ | 0 (0.0%) | 0 (0.0%) |
| Total | 2807 (100.0%) | 2803 (100.0%) |
rstatix: t_test()linelist %>% t_test(formula = ht_cm ~ gender)
## # A tibble: 1 × 10
## .y. group1 group2 n1 n2 statistic df p p.adj p.adj.si…¹
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 ht_cm f m 2807 2803 -26.3 5597. 3.52e-144 3.52e-144 ****
## # … with abbreviated variable name ¹p.adj.signif
rstatix: shapiro_wilk()linelist %>%
head(5000)%>%
shapiro_test(ht_cm)
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 ht_cm 0.990 3.54e-18
correlation_table = linelist%>% #Use package (corrr)
select(bmi,age) %>% #Select numeric variables of interest
correlate()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
correlation_table
## # A tibble: 2 × 3
## term bmi age
## <chr> <dbl> <dbl>
## 1 bmi NA -0.394
## 2 age -0.394 NA
A left or right join is commonly used to add information to a data frame.
In using these joins, the written order of the data frames in the command is important.
In a left join, the first data frame written is the baseline
In a right join, the second data frame written is the baseline
All rows of the baseline data frame are kept.
General syntax
# Join based on common values between column "ID" (first data frame) and column "identifier" (second data frame)
joined_data <- left_join(df1, df2, by = c("ID" = "identifier"))
# Joint based on common values in column "ID" in both data frames
joined_data <- left_join(df1, df2, by = "ID")
# join based on same first name, last name, and age
joined_data <- left_join(df1, df2,
by = c("name" = "firstname", "surname" = "lastname", "Age" = "age"))
# In the example below, df1 is is passed through the pipes, df2 is joined to it, and df is thus modified and re-defined.
df1 <- df1 %>%
left_join(df2, by = c("ID" = "identifier"), # join df2 to df1
suffix = c("_df1", ".df2") # put suffix in variable name
)
Example
glimpse(linelist_mini)
## Rows: 10
## Columns: 3
## $ case_id <chr> "5fe599", "8689b7", "11f8ea", "b8812a", "893f25", "be99c8",…
## $ date_onset <date> 2014-05-13, 2014-05-13, 2014-05-16, 2014-05-18, 2014-05-21…
## $ hospital <chr> "Other", "Missing", "St. Mark's Maternity Hospital (SMMH)",…
glimpse(hosp_info)
## Rows: 7
## Columns: 3
## $ hosp_name <chr> "Central Hospital", "Military Hospital", "Military Hospi…
## $ catchment_pop <dbl> 1950280, 40500, 10000, 50280, 12000, 5000, 4200
## $ level <chr> "Tertiary", "Secondary", "Primary", "Secondary", "Second…
linelist_mini %>%
left_join(hosp_info, by = c("hospital" = "hosp_name"))
## # A tibble: 11 × 5
## case_id date_onset hospital catchment_pop level
## <chr> <date> <chr> <dbl> <chr>
## 1 5fe599 2014-05-13 Other NA <NA>
## 2 8689b7 2014-05-13 Missing NA <NA>
## 3 11f8ea 2014-05-16 St. Mark's Maternity Hospital (SMMH) 12000 Second…
## 4 b8812a 2014-05-18 Port Hospital 50280 Second…
## 5 893f25 2014-05-21 Military Hospital 40500 Second…
## 6 893f25 2014-05-21 Military Hospital 10000 Primary
## 7 be99c8 2014-05-22 Port Hospital 50280 Second…
## 8 07e3e8 2014-05-27 Missing NA <NA>
## 9 369449 2014-06-02 Missing NA <NA>
## 10 f393b4 2014-06-05 Missing NA <NA>
## 11 1389ca 2014-06-05 Missing NA <NA>
linelist_mini %>%
right_join(hosp_info, by = c("hospital" = "hosp_name"))
## # A tibble: 8 × 5
## case_id date_onset hospital catchment_pop level
## <chr> <date> <chr> <dbl> <chr>
## 1 11f8ea 2014-05-16 St. Mark's Maternity Hospital (SMMH) 12000 Seconda…
## 2 b8812a 2014-05-18 Port Hospital 50280 Seconda…
## 3 893f25 2014-05-21 Military Hospital 40500 Seconda…
## 4 893f25 2014-05-21 Military Hospital 10000 Primary
## 5 be99c8 2014-05-22 Port Hospital 50280 Seconda…
## 6 <NA> NA Central Hospital 1950280 Tertiary
## 7 <NA> NA ignace 5000 Primary
## 8 <NA> NA sisters 4200 Primary
A full join is the most inclusive of the joins - it returns all rows from both data frames.
linelist_mini %>%
full_join(hosp_info, by = c("hospital" = "hosp_name"))
## # A tibble: 14 × 5
## case_id date_onset hospital catchment_pop level
## <chr> <date> <chr> <dbl> <chr>
## 1 5fe599 2014-05-13 Other NA <NA>
## 2 8689b7 2014-05-13 Missing NA <NA>
## 3 11f8ea 2014-05-16 St. Mark's Maternity Hospital (SMMH) 12000 Second…
## 4 b8812a 2014-05-18 Port Hospital 50280 Second…
## 5 893f25 2014-05-21 Military Hospital 40500 Second…
## 6 893f25 2014-05-21 Military Hospital 10000 Primary
## 7 be99c8 2014-05-22 Port Hospital 50280 Second…
## 8 07e3e8 2014-05-27 Missing NA <NA>
## 9 369449 2014-06-02 Missing NA <NA>
## 10 f393b4 2014-06-05 Missing NA <NA>
## 11 1389ca 2014-06-05 Missing NA <NA>
## 12 <NA> NA Central Hospital 1950280 Tertia…
## 13 <NA> NA ignace 5000 Primary
## 14 <NA> NA sisters 4200 Primary
An inner join is the most restrictive of the joins - it returns only rows with matches across both data frames.
linelist_mini %>%
inner_join(hosp_info, by = c("hospital" = "hosp_name"))
## # A tibble: 5 × 5
## case_id date_onset hospital catchment_pop level
## <chr> <date> <chr> <dbl> <chr>
## 1 11f8ea 2014-05-16 St. Mark's Maternity Hospital (SMMH) 12000 Seconda…
## 2 b8812a 2014-05-18 Port Hospital 50280 Seconda…
## 3 893f25 2014-05-21 Military Hospital 40500 Seconda…
## 4 893f25 2014-05-21 Military Hospital 10000 Primary
## 5 be99c8 2014-05-22 Port Hospital 50280 Seconda…
A semi-join keeps all observations in the baseline data frame that have a match in the secondary data frame. (but does not add new columns nor duplicate any rows for multiple matches).
hosp_info %>%
semi_join(linelist_mini, by = c("hosp_name" = "hospital"))
## # A tibble: 4 × 3
## hosp_name catchment_pop level
## <chr> <dbl> <chr>
## 1 Military Hospital 40500 Secondary
## 2 Military Hospital 10000 Primary
## 3 Port Hospital 50280 Secondary
## 4 St. Mark's Maternity Hospital (SMMH) 12000 Secondary
The anti join is another “filtering join” that returns rows in the baseline data frame that do not have a match in the secondary data frame.
hosp_info %>%
anti_join(linelist_mini, by = c("hosp_name" = "hospital"))
## # A tibble: 3 × 3
## hosp_name catchment_pop level
## <chr> <dbl> <chr>
## 1 Central Hospital 1950280 Tertiary
## 2 ignace 5000 Primary
## 3 sisters 4200 Primary
rm(list = ls(all.names = TRUE))