Reading the data

# Load the marathon data set into a data frame. The file has a header row,
# uses semicolons as field separators and commas as decimal marks.
# NOTE(review): `encoding` only declares how input strings are marked; it does
# not re-encode the file on read (that is what `fileEncoding` is for) --
# confirm which of the two was intended.
data <- read.table("~/IMB MVA 2021/Example/UTF-8/Marathon.csv", 
                   header = TRUE,
                   sep = ";",
                   dec = ",",
                   encoding = "UTF-8")

# Print the complete data frame.
print(data)
##    Weight Height Pressure Heart_beat Hemoglobin Hematocrit
## 1      70  179.0      105         64        160         50
## 2      68  178.0      105         60        158         51
## 3      64  174.0      109         54        155         51
## 4      63  174.0      112         54        153         58
## 5      61  173.5      100         53        152         59
## 6      60  173.0       99         53        158         49
## 7      57  170.0       98         59        163         48
## 8      57  168.0       90         50        172         52
## 9      56  168.0      100         49        150         45
## 10     78  186.0      115         55        143         45
## 11     78  185.0      112         54        144         55
## 12     77  182.0      119         54        146         55
## 13     75  181.0      110         53        147         47
## 14     75  181.0      110         58        156         48
## 15     73  181.0      135         63        160         49
## 16     66  177.0      103         59        157         53
## 17     65  175.0      105         55        157         55
## 18     58  170.0       98         57        158         49
## 19     57  170.0      107         55        169         52
## 20     55  166.0      128         51        183         69
##    Cholesterol Glucoze Gender
## 1          4.9    4.70      0
## 2          4.8    4.90      0
## 3          4.5    7.00      0
## 4          8.0    7.20      0
## 5          4.6    6.70      0
## 6          3.9    6.00      0
## 7          4.1    5.90      0
## 8          4.8    5.70      0
## 9          4.9    5.60      0
## 10         4.0    4.20      1
## 11         4.0    4.70      1
## 12         4.9    3.80      1
## 13         3.5    4.10      1
## 14         5.1    4.60      1
## 15         5.7    4.80      1
## 16         4.7    5.30      1
## 17         4.6    3.80      1
## 18         3.9    6.00      1
## 19         4.4    5.95      1
## 20         5.3    6.00      1

Showing the data

# Preview only the first rows instead of printing the whole data frame.
head(data)
##   Weight Height Pressure Heart_beat Hemoglobin Hematocrit Cholesterol
## 1     70  179.0      105         64        160         50         4.9
## 2     68  178.0      105         60        158         51         4.8
## 3     64  174.0      109         54        155         51         4.5
## 4     63  174.0      112         54        153         58         8.0
## 5     61  173.5      100         53        152         59         4.6
## 6     60  173.0       99         53        158         49         3.9
##   Glucoze Gender
## 1     4.7      0
## 2     4.9      0
## 3     7.0      0
## 4     7.2      0
## 5     6.7      0
## 6     6.0      0

Creating a new variable

# Body mass index: weight divided by squared height, with height converted
# from centimetres to metres first; rounded to two decimals.
data$BMI <- with(data, round(Weight / (Height / 100)^2, 2))
# To remove the variable again, run: data$BMI <- NULL

Renaming variables

# Rename the fourth column (Heart_beat) by its position, then check the result.
names(data)[4] <- "HeartBeat"
head(data)
##   Weight Height Pressure HeartBeat Hemoglobin Hematocrit Cholesterol
## 1     70  179.0      105        64        160         50         4.9
## 2     68  178.0      105        60        158         51         4.8
## 3     64  174.0      109        54        155         51         4.5
## 4     63  174.0      112        54        153         58         8.0
## 5     61  173.5      100        53        152         59         4.6
## 6     60  173.0       99        53        158         49         3.9
##   Glucoze Gender   BMI
## 1     4.7      0 21.85
## 2     4.9      0 21.46
## 3     7.0      0 21.14
## 4     7.2      0 20.81
## 5     6.7      0 20.26
## 6     6.0      0 20.05
# Rename a column by looking it up under its current name, which keeps working
# even if the column order changes.
names(data)[names(data) == "Height"] <- "Height_cm"
head(data)
##   Weight Height_cm Pressure HeartBeat Hemoglobin Hematocrit
## 1     70     179.0      105        64        160         50
## 2     68     178.0      105        60        158         51
## 3     64     174.0      109        54        155         51
## 4     63     174.0      112        54        153         58
## 5     61     173.5      100        53        152         59
## 6     60     173.0       99        53        158         49
##   Cholesterol Glucoze Gender   BMI
## 1         4.9     4.7      0 21.85
## 2         4.8     4.9      0 21.46
## 3         4.5     7.0      0 21.14
## 4         8.0     7.2      0 20.81
## 5         4.6     6.7      0 20.26
## 6         3.9     6.0      0 20.05

Deleting rows and columns

# Negative indices drop elements: remove rows 2 and 3 and the fifth column
# (Hemoglobin), storing the result as a separate data frame.
data_new <- data[-c(2, 3), -5]
head(data_new)
##   Weight Height_cm Pressure HeartBeat Hematocrit Cholesterol Glucoze
## 1     70     179.0      105        64         50         4.9     4.7
## 4     63     174.0      112        54         58         8.0     7.2
## 5     61     173.5      100        53         59         4.6     6.7
## 6     60     173.0       99        53         49         3.9     6.0
## 7     57     170.0       98        59         48         4.1     5.9
## 8     57     168.0       90        50         52         4.8     5.7
##   Gender   BMI
## 1      0 21.85
## 4      0 20.81
## 5      0 20.26
## 6      0 20.05
## 7      0 19.72
## 8      0 20.20

Replacing the value

# Overwrite a single cell: the Pressure value (third column) in the second row
# of data_new.
data_new[2, "Pressure"] <- 100
head(data_new)
##   Weight Height_cm Pressure HeartBeat Hematocrit Cholesterol Glucoze
## 1     70     179.0      105        64         50         4.9     4.7
## 4     63     174.0      100        54         58         8.0     7.2
## 5     61     173.5      100        53         59         4.6     6.7
## 6     60     173.0       99        53         49         3.9     6.0
## 7     57     170.0       98        59         48         4.1     5.9
## 8     57     168.0       90        50         52         4.8     5.7
##   Gender   BMI
## 1      0 21.85
## 4      0 20.81
## 5      0 20.26
## 6      0 20.05
## 7      0 19.72
## 8      0 20.20

Creating factors

# Turn the 0/1 Gender code into a labelled factor (0 -> "F", 1 -> "M") and
# inspect the resulting column types.
data$GenderFactor <- with(
  data,
  factor(Gender, levels = 0:1, labels = c("F", "M"))
)
str(data)
## 'data.frame':    20 obs. of  11 variables:
##  $ Weight      : int  70 68 64 63 61 60 57 57 56 78 ...
##  $ Height_cm   : num  179 178 174 174 174 ...
##  $ Pressure    : int  105 105 109 112 100 99 98 90 100 115 ...
##  $ HeartBeat   : int  64 60 54 54 53 53 59 50 49 55 ...
##  $ Hemoglobin  : int  160 158 155 153 152 158 163 172 150 143 ...
##  $ Hematocrit  : int  50 51 51 58 59 49 48 52 45 45 ...
##  $ Cholesterol : num  4.9 4.8 4.5 8 4.6 3.9 4.1 4.8 4.9 4 ...
##  $ Glucoze     : num  4.7 4.9 7 7.2 6.7 6 5.9 5.7 5.6 4.2 ...
##  $ Gender      : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ BMI         : num  21.9 21.5 21.1 20.8 20.3 ...
##  $ GenderFactor: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 2 ...
# Side-by-side boxplots of height, one box per gender group.
boxplot(Height_cm ~ GenderFactor, data = data,
        xlab = "Gender",
        ylab = "Height in cm")

# Descriptive statistics

# Mean of a single variable.
mean(data$Pressure)
## [1] 108
# Mean of every column at once. GenderFactor is a factor, so mean() warns and
# returns NA for that column.
sapply(data, FUN = mean)
## Warning in mean.default(X[[i]], ...): argument is not numeric or
## logical: returning NA
##       Weight    Height_cm     Pressure    HeartBeat   Hemoglobin 
##      65.6500     175.5750     108.0000      55.5000     157.0500 
##   Hematocrit  Cholesterol      Glucoze       Gender          BMI 
##      52.0000       4.7300       5.3475       0.5500      21.2010 
## GenderFactor 
##           NA
# Keep every column except the GenderFactor factor, so only numeric columns
# remain for the column-wise statistics.
data_numeric <- data[, setdiff(colnames(data), "GenderFactor")]
# Variance of each remaining column.
vapply(data_numeric, var, numeric(1))
##      Weight   Height_cm    Pressure   HeartBeat  Hemoglobin 
##  65.2921053  34.8756579 111.8947368  15.9473684  93.8394737 
##  Hematocrit Cholesterol     Glucoze      Gender         BMI 
##  31.0526316   0.8811579   1.0361776   0.2605263   1.4918832
# Sort the rows by height, breaking ties by pressure (both ascending).
data_ordered <- data[with(data, order(Height_cm, Pressure)), ]
data_ordered
##    Weight Height_cm Pressure HeartBeat Hemoglobin Hematocrit
## 20     55     166.0      128        51        183         69
## 8      57     168.0       90        50        172         52
## 9      56     168.0      100        49        150         45
## 7      57     170.0       98        59        163         48
## 18     58     170.0       98        57        158         49
## 19     57     170.0      107        55        169         52
## 6      60     173.0       99        53        158         49
## 5      61     173.5      100        53        152         59
## 3      64     174.0      109        54        155         51
## 4      63     174.0      112        54        153         58
## 17     65     175.0      105        55        157         55
## 16     66     177.0      103        59        157         53
## 2      68     178.0      105        60        158         51
## 1      70     179.0      105        64        160         50
## 13     75     181.0      110        53        147         47
## 14     75     181.0      110        58        156         48
## 15     73     181.0      135        63        160         49
## 12     77     182.0      119        54        146         55
## 11     78     185.0      112        54        144         55
## 10     78     186.0      115        55        143         45
##    Cholesterol Glucoze Gender   BMI GenderFactor
## 20         5.3    6.00      1 19.96            M
## 8          4.8    5.70      0 20.20            F
## 9          4.9    5.60      0 19.84            F
## 7          4.1    5.90      0 19.72            F
## 18         3.9    6.00      1 20.07            M
## 19         4.4    5.95      1 19.72            M
## 6          3.9    6.00      0 20.05            F
## 5          4.6    6.70      0 20.26            F
## 3          4.5    7.00      0 21.14            F
## 4          8.0    7.20      0 20.81            F
## 17         4.6    3.80      1 21.22            M
## 16         4.7    5.30      1 21.07            M
## 2          4.8    4.90      0 21.46            F
## 1          4.9    4.70      0 21.85            F
## 13         3.5    4.10      1 22.89            M
## 14         5.1    4.60      1 22.89            M
## 15         5.7    4.80      1 22.28            M
## 12         4.9    3.80      1 23.25            M
## 11         4.0    4.70      1 22.79            M
## 10         4.0    4.20      1 22.55            M
# Sort the rows by ascending height, breaking ties by descending pressure
# (negating a numeric sort key reverses its direction).
data_ordered <- data[with(data, order(Height_cm, -Pressure)), ]
data_ordered
##    Weight Height_cm Pressure HeartBeat Hemoglobin Hematocrit
## 20     55     166.0      128        51        183         69
## 9      56     168.0      100        49        150         45
## 8      57     168.0       90        50        172         52
## 19     57     170.0      107        55        169         52
## 7      57     170.0       98        59        163         48
## 18     58     170.0       98        57        158         49
## 6      60     173.0       99        53        158         49
## 5      61     173.5      100        53        152         59
## 4      63     174.0      112        54        153         58
## 3      64     174.0      109        54        155         51
## 17     65     175.0      105        55        157         55
## 16     66     177.0      103        59        157         53
## 2      68     178.0      105        60        158         51
## 1      70     179.0      105        64        160         50
## 15     73     181.0      135        63        160         49
## 13     75     181.0      110        53        147         47
## 14     75     181.0      110        58        156         48
## 12     77     182.0      119        54        146         55
## 11     78     185.0      112        54        144         55
## 10     78     186.0      115        55        143         45
##    Cholesterol Glucoze Gender   BMI GenderFactor
## 20         5.3    6.00      1 19.96            M
## 9          4.9    5.60      0 19.84            F
## 8          4.8    5.70      0 20.20            F
## 19         4.4    5.95      1 19.72            M
## 7          4.1    5.90      0 19.72            F
## 18         3.9    6.00      1 20.07            M
## 6          3.9    6.00      0 20.05            F
## 5          4.6    6.70      0 20.26            F
## 4          8.0    7.20      0 20.81            F
## 3          4.5    7.00      0 21.14            F
## 17         4.6    3.80      1 21.22            M
## 16         4.7    5.30      1 21.07            M
## 2          4.8    4.90      0 21.46            F
## 1          4.9    4.70      0 21.85            F
## 15         5.7    4.80      1 22.28            M
## 13         3.5    4.10      1 22.89            M
## 14         5.1    4.60      1 22.89            M
## 12         4.9    3.80      1 23.25            M
## 11         4.0    4.70      1 22.79            M
## 10         4.0    4.20      1 22.55            M
# Per-column overview: minimum, quartiles, mean and maximum for the numeric
# columns, and counts per level for the factor column.
summary(data)
##      Weight        Height_cm        Pressure     HeartBeat    
##  Min.   :55.00   Min.   :166.0   Min.   : 90   Min.   :49.00  
##  1st Qu.:57.75   1st Qu.:170.0   1st Qu.:100   1st Qu.:53.00  
##  Median :64.50   Median :174.5   Median :106   Median :54.50  
##  Mean   :65.65   Mean   :175.6   Mean   :108   Mean   :55.50  
##  3rd Qu.:73.50   3rd Qu.:181.0   3rd Qu.:112   3rd Qu.:58.25  
##  Max.   :78.00   Max.   :186.0   Max.   :135   Max.   :64.00  
##    Hemoglobin      Hematocrit     Cholesterol       Glucoze     
##  Min.   :143.0   Min.   :45.00   Min.   :3.500   Min.   :3.800  
##  1st Qu.:151.5   1st Qu.:48.75   1st Qu.:4.075   1st Qu.:4.675  
##  Median :157.0   Median :51.00   Median :4.650   Median :5.450  
##  Mean   :157.1   Mean   :52.00   Mean   :4.730   Mean   :5.348  
##  3rd Qu.:160.0   3rd Qu.:55.00   3rd Qu.:4.900   3rd Qu.:6.000  
##  Max.   :183.0   Max.   :69.00   Max.   :8.000   Max.   :7.200  
##      Gender          BMI        GenderFactor
##  Min.   :0.00   Min.   :19.72   F: 9        
##  1st Qu.:0.00   1st Qu.:20.07   M:11        
##  Median :1.00   Median :21.11               
##  Mean   :0.55   Mean   :21.20               
##  3rd Qu.:1.00   3rd Qu.:22.35               
##  Max.   :1.00   Max.   :23.25
# describe() from the psych package reports a wider set of statistics per
# column (among others sd, trimmed mean, mad, skew, kurtosis and se).
# Uncomment the next line once if the package is not installed yet.
#install.packages("psych")
library(psych)
psych::describe(data)
##               vars  n   mean    sd median trimmed   mad    min    max
## Weight           1 20  65.65  8.08  64.50   65.38 11.12  55.00  78.00
## Height_cm        2 20 175.57  5.91 174.50  175.41  6.67 166.00 186.00
## Pressure         3 20 108.00 10.58 106.00  106.81  8.90  90.00 135.00
## HeartBeat        4 20  55.50  3.99  54.50   55.25  2.97  49.00  64.00
## Hemoglobin       5 20 157.05  9.69 157.00  156.19  6.67 143.00 183.00
## Hematocrit       6 20  52.00  5.57  51.00   51.38  4.45  45.00  69.00
## Cholesterol      7 20   4.73  0.94   4.65    4.59  0.52   3.50   8.00
## Glucoze          8 20   5.35  1.02   5.45    5.32  1.04   3.80   7.20
## Gender           9 20   0.55  0.51   1.00    0.56  0.00   0.00   1.00
## BMI             10 20  21.20  1.22  21.10   21.15  1.63  19.72  23.25
## GenderFactor*   11 20   1.55  0.51   2.00    1.56  0.00   1.00   2.00
##               range  skew kurtosis   se
## Weight        23.00  0.24    -1.53 1.81
## Height_cm     20.00  0.11    -1.27 1.32
## Pressure      45.00  0.83     0.39 2.37
## HeartBeat     15.00  0.50    -0.56 0.89
## Hemoglobin    40.00  0.82     0.53 2.17
## Hematocrit    24.00  1.31     1.86 1.25
## Cholesterol    4.50  1.94     4.68 0.21
## Glucoze        3.40  0.12    -1.15 0.23
## Gender         1.00 -0.19    -2.06 0.11
## BMI            3.53  0.29    -1.52 0.27
## GenderFactor*  1.00 -0.19    -2.06 0.11
# stat.desc() from the pastecs package returns another table of descriptive
# statistics (counts, range, sum, mean with SE and CI, variance, ...) for the
# numeric-only data frame; the result is rounded to two decimals.
# Uncomment the next line once if the package is not installed yet.
#install.packages("pastecs")
library(pastecs)
round(stat.desc(data_numeric), 2)
##               Weight Height_cm Pressure HeartBeat Hemoglobin
## nbr.val        20.00     20.00    20.00     20.00      20.00
## nbr.null        0.00      0.00     0.00      0.00       0.00
## nbr.na          0.00      0.00     0.00      0.00       0.00
## min            55.00    166.00    90.00     49.00     143.00
## max            78.00    186.00   135.00     64.00     183.00
## range          23.00     20.00    45.00     15.00      40.00
## sum          1313.00   3511.50  2160.00   1110.00    3141.00
## median         64.50    174.50   106.00     54.50     157.00
## mean           65.65    175.57   108.00     55.50     157.05
## SE.mean         1.81      1.32     2.37      0.89       2.17
## CI.mean.0.95    3.78      2.76     4.95      1.87       4.53
## var            65.29     34.88   111.89     15.95      93.84
## std.dev         8.08      5.91    10.58      3.99       9.69
## coef.var        0.12      0.03     0.10      0.07       0.06
##              Hematocrit Cholesterol Glucoze Gender    BMI
## nbr.val           20.00       20.00   20.00  20.00  20.00
## nbr.null           0.00        0.00    0.00   9.00   0.00
## nbr.na             0.00        0.00    0.00   0.00   0.00
## min               45.00        3.50    3.80   0.00  19.72
## max               69.00        8.00    7.20   1.00  23.25
## range             24.00        4.50    3.40   1.00   3.53
## sum             1040.00       94.60  106.95  11.00 424.02
## median            51.00        4.65    5.45   1.00  21.10
## mean              52.00        4.73    5.35   0.55  21.20
## SE.mean            1.25        0.21    0.23   0.11   0.27
## CI.mean.0.95       2.61        0.44    0.48   0.24   0.57
## var               31.05        0.88    1.04   0.26   1.49
## std.dev            5.57        0.94    1.02   0.51   1.22
## coef.var           0.11        0.20    0.19   0.93   0.06

Subsetting

# Keep only the rows whose GenderFactor is "M".
data_male <- data[with(data, GenderFactor == "M"), ]
# Third quartile of Pressure.
quantile(data$Pressure, probs = 0.75)
## 75% 
## 112
# Upper-quartile cut-off for Pressure, computed from the data instead of being
# copied by hand from the quantile() output (it evaluates to 112 here).
pressure_q3 <- quantile(data$Pressure, 0.75, names = FALSE)

# Row masks for the two conditions, built once and reused below.
is_male <- data$GenderFactor == "M"
is_high_pressure <- data$Pressure >= pressure_q3

# Rows meeting both conditions (logical AND) ...
data_maleandpressure <- data[is_male & is_high_pressure, ]
# ... and rows meeting at least one of them (logical OR).
data_maleorpressure <- data[is_male | is_high_pressure, ]

Describing by categories

# Descriptive statistics of height, reported separately for each gender group.
# psych is already attached earlier in this file; repeating library() here is
# harmless and lets the section run on its own.
library(psych)
describeBy(data$Height_cm, data$GenderFactor)
## 
##  Descriptive statistics by group 
## group: F
##    vars n   mean   sd median trimmed  mad min max range skew kurtosis
## X1    1 9 173.06 3.91  173.5  173.06 5.19 168 179    11 0.09    -1.46
##     se
## X1 1.3
## ---------------------------------------------------- 
## group: M
##    vars  n   mean   sd median trimmed  mad min max range  skew
## X1    1 11 177.64 6.61    181     178 5.93 166 186    20 -0.41
##    kurtosis   se
## X1    -1.39 1.99

Aggregation

# Sum of weight and height within each gender group, stored in data_agg.
data_agg <- aggregate(x = data[c("Weight", "Height_cm")],
                      by = list(data$GenderFactor),
                      FUN = sum)
# Median weight and height within each gender group, printed directly.
aggregate(x = data[c("Weight", "Height_cm")],
          by = list(data$GenderFactor),
          FUN = median)
##   Group.1 Weight Height_cm
## 1       F     61     173.5
## 2       M     73     181.0

Logical test

# Flag observations with Pressure of at least 112 as 1 and all others as 0.
# The mean of a 0/1 indicator is the share of flagged observations.
data$HighBloodPressure <- as.numeric(data$Pressure >= 112)
mean(data$HighBloodPressure)
## [1] 0.3