Cancer dataset

#Install the survival package
library(survival)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
#Load the cancer dataset
head(cancer)
##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0
tail(cancer)
##     inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 223    1  116      2  76   1       1       80        80       NA       0
## 224    1  188      1  77   1       1       80        60       NA       3
## 225   13  191      1  39   1       0       90        90     2350      -5
## 226   32  105      1  75   2       2       60        70     1025       5
## 227    6  174      1  66   1       1       90       100     1075       1
## 228   22  177      1  58   2       1       80        90     1060       0
summary(cancer)
##       inst            time            status           age       
##  Min.   : 1.00   Min.   :   5.0   Min.   :1.000   Min.   :39.00  
##  1st Qu.: 3.00   1st Qu.: 166.8   1st Qu.:1.000   1st Qu.:56.00  
##  Median :11.00   Median : 255.5   Median :2.000   Median :63.00  
##  Mean   :11.09   Mean   : 305.2   Mean   :1.724   Mean   :62.45  
##  3rd Qu.:16.00   3rd Qu.: 396.5   3rd Qu.:2.000   3rd Qu.:69.00  
##  Max.   :33.00   Max.   :1022.0   Max.   :2.000   Max.   :82.00  
##  NA's   :1                                                       
##       sex           ph.ecog          ph.karno        pat.karno     
##  Min.   :1.000   Min.   :0.0000   Min.   : 50.00   Min.   : 30.00  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 75.00   1st Qu.: 70.00  
##  Median :1.000   Median :1.0000   Median : 80.00   Median : 80.00  
##  Mean   :1.395   Mean   :0.9515   Mean   : 81.94   Mean   : 79.96  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 90.00   3rd Qu.: 90.00  
##  Max.   :2.000   Max.   :3.0000   Max.   :100.00   Max.   :100.00  
##                  NA's   :1        NA's   :1        NA's   :3       
##     meal.cal         wt.loss       
##  Min.   :  96.0   Min.   :-24.000  
##  1st Qu.: 635.0   1st Qu.:  0.000  
##  Median : 975.0   Median :  7.000  
##  Mean   : 928.8   Mean   :  9.832  
##  3rd Qu.:1150.0   3rd Qu.: 15.750  
##  Max.   :2600.0   Max.   : 68.000  
##  NA's   :47       NA's   :14
#Compute the mean, median of the survival times,irrespective of the censoring of survival times
mean(cancer$time)
## [1] 305.2325
median(cancer$time)
## [1] 255.5

Estriol dataset

#Import the Estriol dataset
estriol <- read.csv2('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Rpractical session1_student_files/Estriol.csv')
head(estriol)
##   NR ESTRIOL BIRTHWGT
## 1  1       7       25
## 2  2       9       25
## 3  3       9       25
## 4  4      12       27
## 5  5      14       27
## 6  6      16       27
hist(estriol$ESTRIOL,
     xlab = "Estriol level", ylab = "Relative frequency",
     freq = F, main = "Histogram of estriol lever")

Aml data set

data(aml)
## Warning in data(aml): data set 'aml' not found
#Calculate descriptive measures (mean, median, variance, standard deviation, quartiles and range) for the variable `time’
summary(aml$time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   12.50   23.00   29.48   33.50  161.00
par(mfrow = c(1,2))
hist(aml$time, main = "", xlab = "Time", freq = FALSE)
boxplot(aml$time, ylab = "Time")

How many observations are right-censored (calculate percentage).

table(aml$status)
## 
##  0  1 
##  5 18
sum(aml$status==0)/length(aml$status)
## [1] 0.2173913

Calculate the descriptive measure for all uncensored times (i.e. cens = 1) and seperately for treatment and control groups.

table(aml$status, aml$x)
##    
##     Maintained Nonmaintained
##   0          4             1
##   1          7            11
aml_1 <- aml %>% 
  rename(group=x) %>% 
  filter(status==1)

tapply(aml_1$time,aml_1$group, mean)
##    Maintained Nonmaintained 
##      25.14286      21.72727
tapply(aml_1$time,aml_1$group, median)
##    Maintained Nonmaintained 
##            23            23
tapply(aml_1$time,aml_1$group, var)
##    Maintained Nonmaintained 
##      183.1429      225.0182