# Load Marathon.csv into a data frame: fields are separated by semicolons,
# the decimal mark is a comma, and the first row supplies the column names.
data <- read.table(
  file = "~/IMB MVA 2021/Example/UTF-8/Marathon.csv",
  header = TRUE,
  sep = ";",
  dec = ",",
  encoding = "UTF-8"
)
# Show the complete table.
print(data)
## Weight Height Pressure Heart_beat Hemoglobin Hematocrit
## 1 70 179.0 105 64 160 50
## 2 68 178.0 105 60 158 51
## 3 64 174.0 109 54 155 51
## 4 63 174.0 112 54 153 58
## 5 61 173.5 100 53 152 59
## 6 60 173.0 99 53 158 49
## 7 57 170.0 98 59 163 48
## 8 57 168.0 90 50 172 52
## 9 56 168.0 100 49 150 45
## 10 78 186.0 115 55 143 45
## 11 78 185.0 112 54 144 55
## 12 77 182.0 119 54 146 55
## 13 75 181.0 110 53 147 47
## 14 75 181.0 110 58 156 48
## 15 73 181.0 135 63 160 49
## 16 66 177.0 103 59 157 53
## 17 65 175.0 105 55 157 55
## 18 58 170.0 98 57 158 49
## 19 57 170.0 107 55 169 52
## 20 55 166.0 128 51 183 69
## Cholesterol Glucoze Gender
## 1 4.9 4.70 0
## 2 4.8 4.90 0
## 3 4.5 7.00 0
## 4 8.0 7.20 0
## 5 4.6 6.70 0
## 6 3.9 6.00 0
## 7 4.1 5.90 0
## 8 4.8 5.70 0
## 9 4.9 5.60 0
## 10 4.0 4.20 1
## 11 4.0 4.70 1
## 12 4.9 3.80 1
## 13 3.5 4.10 1
## 14 5.1 4.60 1
## 15 5.7 4.80 1
## 16 4.7 5.30 1
## 17 4.6 3.80 1
## 18 3.9 6.00 1
## 19 4.4 5.95 1
## 20 5.3 6.00 1
# Preview only the first six rows (the default of head(), spelled out here).
head(data, n = 6L)
## Weight Height Pressure Heart_beat Hemoglobin Hematocrit Cholesterol
## 1 70 179.0 105 64 160 50 4.9
## 2 68 178.0 105 60 158 51 4.8
## 3 64 174.0 109 54 155 51 4.5
## 4 63 174.0 112 54 153 58 8.0
## 5 61 173.5 100 53 152 59 4.6
## 6 60 173.0 99 53 158 49 3.9
## Glucoze Gender
## 1 4.7 0
## 2 4.9 0
## 3 7.0 0
## 4 7.2 0
## 5 6.7 0
## 6 6.0 0
# Body mass index: weight divided by the squared height in metres, rounded
# to two decimals. Height is stored in centimetres, hence the division by 100
# (Weight is presumably in kg -- confirm against the data source).
data$BMI <- round(data$Weight / (data$Height / 100)^2, digits = 2)
# data$BMI <- NULL  # uncomment to remove the BMI column again
# Rename by matching the current name instead of a hard-coded position, so
# the rename cannot hit the wrong column if the column order ever changes.
colnames(data)[colnames(data) == "Heart_beat"] <- "HeartBeat"
head(data)
## Weight Height Pressure HeartBeat Hemoglobin Hematocrit Cholesterol
## 1 70 179.0 105 64 160 50 4.9
## 2 68 178.0 105 60 158 51 4.8
## 3 64 174.0 109 54 155 51 4.5
## 4 63 174.0 112 54 153 58 8.0
## 5 61 173.5 100 53 152 59 4.6
## 6 60 173.0 99 53 158 49 3.9
## Glucoze Gender BMI
## 1 4.7 0 21.85
## 2 4.9 0 21.46
## 3 7.0 0 21.14
## 4 7.2 0 20.81
## 5 6.7 0 20.26
## 6 6.0 0 20.05
# Rename the Height column so that its unit is part of the name.
names(data)[names(data) == "Height"] <- "Height_cm"
head(data, n = 6L)
## Weight Height_cm Pressure HeartBeat Hemoglobin Hematocrit
## 1 70 179.0 105 64 160 50
## 2 68 178.0 105 60 158 51
## 3 64 174.0 109 54 155 51
## 4 63 174.0 112 54 153 58
## 5 61 173.5 100 53 152 59
## 6 60 173.0 99 53 158 49
## Cholesterol Glucoze Gender BMI
## 1 4.9 4.7 0 21.85
## 2 4.8 4.9 0 21.46
## 3 4.5 7.0 0 21.14
## 4 8.0 7.2 0 20.81
## 5 4.6 6.7 0 20.26
## 6 3.9 6.0 0 20.05
# Negative indices exclude: keep everything except rows 2 and 3 and the
# fifth column (Hemoglobin at this point in the script).
data_new <- data[-c(2, 3), -5]
head(data_new, n = 6L)
## Weight Height_cm Pressure HeartBeat Hematocrit Cholesterol Glucoze
## 1 70 179.0 105 64 50 4.9 4.7
## 4 63 174.0 112 54 58 8.0 7.2
## 5 61 173.5 100 53 59 4.6 6.7
## 6 60 173.0 99 53 49 3.9 6.0
## 7 57 170.0 98 59 48 4.1 5.9
## 8 57 168.0 90 50 52 4.8 5.7
## Gender BMI
## 1 0 21.85
## 4 0 20.81
## 5 0 20.26
## 6 0 20.05
## 7 0 19.72
## 8 0 20.20
# Overwrite a single cell: second row, third column (Pressure) of data_new.
# The index is positional, so this is the row labelled 4, not the row
# labelled 2 (which was dropped above).
data_new[2L, 3L] <- 100
head(data_new, n = 6L)
## Weight Height_cm Pressure HeartBeat Hematocrit Cholesterol Glucoze
## 1 70 179.0 105 64 50 4.9 4.7
## 4 63 174.0 100 54 58 8.0 7.2
## 5 61 173.5 100 53 59 4.6 6.7
## 6 60 173.0 99 53 49 3.9 6.0
## 7 57 170.0 98 59 48 4.1 5.9
## 8 57 168.0 90 50 52 4.8 5.7
## Gender BMI
## 1 0 21.85
## 4 0 20.81
## 5 0 20.26
## 6 0 20.05
## 7 0 19.72
## 8 0 20.20
# Recode the 0/1 Gender column as a labelled factor: 0 -> "F", 1 -> "M".
data$GenderFactor <- factor(
  data$Gender,
  levels = c(0, 1),
  labels = c("F", "M")
)
# Inspect the column types after adding the factor.
str(data)
## 'data.frame': 20 obs. of 11 variables:
## $ Weight : int 70 68 64 63 61 60 57 57 56 78 ...
## $ Height_cm : num 179 178 174 174 174 ...
## $ Pressure : int 105 105 109 112 100 99 98 90 100 115 ...
## $ HeartBeat : int 64 60 54 54 53 53 59 50 49 55 ...
## $ Hemoglobin : int 160 158 155 153 152 158 163 172 150 143 ...
## $ Hematocrit : int 50 51 51 58 59 49 48 52 45 45 ...
## $ Cholesterol : num 4.9 4.8 4.5 8 4.6 3.9 4.1 4.8 4.9 4 ...
## $ Glucoze : num 4.7 4.9 7 7.2 6.7 6 5.9 5.7 5.6 4.2 ...
## $ Gender : int 0 0 0 0 0 0 0 0 0 1 ...
## $ BMI : num 21.9 21.5 21.1 20.8 20.3 ...
## $ GenderFactor: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 2 ...
# Side-by-side box plots of height, one box per gender level.
boxplot(
  Height_cm ~ GenderFactor,
  data = data,
  ylab = "Height in cm",
  xlab = "Gender"
)
# Descriptive statistics
# Mean of a single column.
mean(data[["Pressure"]])
## [1] 108
# Column-wise means over every column. GenderFactor is a factor, so mean()
# returns NA for it and emits a warning (visible in the output below).
sapply(X = data, FUN = mean)
## Warning in mean.default(X[[i]], ...): argument is not numeric or
## logical: returning NA
## Weight Height_cm Pressure HeartBeat Hemoglobin
## 65.6500 175.5750 108.0000 55.5000 157.0500
## Hematocrit Cholesterol Glucoze Gender BMI
## 52.0000 4.7300 5.3475 0.5500 21.2010
## GenderFactor
## NA
# Keep only the numeric columns. Selecting by type instead of excluding
# "GenderFactor" by name keeps this correct if further non-numeric columns
# are ever added to the data frame.
data_numeric <- data[vapply(data, is.numeric, logical(1))]
# Sample variance of each numeric column. vapply() guarantees a named
# numeric vector, unlike sapply(), whose return type depends on the input.
vapply(data_numeric, var, numeric(1))
## Weight Height_cm Pressure HeartBeat Hemoglobin
## 65.2921053 34.8756579 111.8947368 15.9473684 93.8394737
## Hematocrit Cholesterol Glucoze Gender BMI
## 31.0526316 0.8811579 1.0361776 0.2605263 1.4918832
# Sort rows by height (ascending); ties are broken by pressure (ascending).
data_ordered <- data[with(data, order(Height_cm, Pressure)), ]
data_ordered
## Weight Height_cm Pressure HeartBeat Hemoglobin Hematocrit
## 20 55 166.0 128 51 183 69
## 8 57 168.0 90 50 172 52
## 9 56 168.0 100 49 150 45
## 7 57 170.0 98 59 163 48
## 18 58 170.0 98 57 158 49
## 19 57 170.0 107 55 169 52
## 6 60 173.0 99 53 158 49
## 5 61 173.5 100 53 152 59
## 3 64 174.0 109 54 155 51
## 4 63 174.0 112 54 153 58
## 17 65 175.0 105 55 157 55
## 16 66 177.0 103 59 157 53
## 2 68 178.0 105 60 158 51
## 1 70 179.0 105 64 160 50
## 13 75 181.0 110 53 147 47
## 14 75 181.0 110 58 156 48
## 15 73 181.0 135 63 160 49
## 12 77 182.0 119 54 146 55
## 11 78 185.0 112 54 144 55
## 10 78 186.0 115 55 143 45
## Cholesterol Glucoze Gender BMI GenderFactor
## 20 5.3 6.00 1 19.96 M
## 8 4.8 5.70 0 20.20 F
## 9 4.9 5.60 0 19.84 F
## 7 4.1 5.90 0 19.72 F
## 18 3.9 6.00 1 20.07 M
## 19 4.4 5.95 1 19.72 M
## 6 3.9 6.00 0 20.05 F
## 5 4.6 6.70 0 20.26 F
## 3 4.5 7.00 0 21.14 F
## 4 8.0 7.20 0 20.81 F
## 17 4.6 3.80 1 21.22 M
## 16 4.7 5.30 1 21.07 M
## 2 4.8 4.90 0 21.46 F
## 1 4.9 4.70 0 21.85 F
## 13 3.5 4.10 1 22.89 M
## 14 5.1 4.60 1 22.89 M
## 15 5.7 4.80 1 22.28 M
## 12 4.9 3.80 1 23.25 M
## 11 4.0 4.70 1 22.79 M
## 10 4.0 4.20 1 22.55 M
# Sort rows by height (ascending); ties are broken by pressure in
# descending order, obtained by negating the numeric key.
data_ordered <- data[with(data, order(Height_cm, -Pressure)), ]
data_ordered
## Weight Height_cm Pressure HeartBeat Hemoglobin Hematocrit
## 20 55 166.0 128 51 183 69
## 9 56 168.0 100 49 150 45
## 8 57 168.0 90 50 172 52
## 19 57 170.0 107 55 169 52
## 7 57 170.0 98 59 163 48
## 18 58 170.0 98 57 158 49
## 6 60 173.0 99 53 158 49
## 5 61 173.5 100 53 152 59
## 4 63 174.0 112 54 153 58
## 3 64 174.0 109 54 155 51
## 17 65 175.0 105 55 157 55
## 16 66 177.0 103 59 157 53
## 2 68 178.0 105 60 158 51
## 1 70 179.0 105 64 160 50
## 15 73 181.0 135 63 160 49
## 13 75 181.0 110 53 147 47
## 14 75 181.0 110 58 156 48
## 12 77 182.0 119 54 146 55
## 11 78 185.0 112 54 144 55
## 10 78 186.0 115 55 143 45
## Cholesterol Glucoze Gender BMI GenderFactor
## 20 5.3 6.00 1 19.96 M
## 9 4.9 5.60 0 19.84 F
## 8 4.8 5.70 0 20.20 F
## 19 4.4 5.95 1 19.72 M
## 7 4.1 5.90 0 19.72 F
## 18 3.9 6.00 1 20.07 M
## 6 3.9 6.00 0 20.05 F
## 5 4.6 6.70 0 20.26 F
## 4 8.0 7.20 0 20.81 F
## 3 4.5 7.00 0 21.14 F
## 17 4.6 3.80 1 21.22 M
## 16 4.7 5.30 1 21.07 M
## 2 4.8 4.90 0 21.46 F
## 1 4.9 4.70 0 21.85 F
## 15 5.7 4.80 1 22.28 M
## 13 3.5 4.10 1 22.89 M
## 14 5.1 4.60 1 22.89 M
## 12 4.9 3.80 1 23.25 M
## 11 4.0 4.70 1 22.79 M
## 10 4.0 4.20 1 22.55 M
# Per-column summary: quartiles and mean for numeric columns, level counts
# for the factor column.
summary(object = data)
## Weight Height_cm Pressure HeartBeat
## Min. :55.00 Min. :166.0 Min. : 90 Min. :49.00
## 1st Qu.:57.75 1st Qu.:170.0 1st Qu.:100 1st Qu.:53.00
## Median :64.50 Median :174.5 Median :106 Median :54.50
## Mean :65.65 Mean :175.6 Mean :108 Mean :55.50
## 3rd Qu.:73.50 3rd Qu.:181.0 3rd Qu.:112 3rd Qu.:58.25
## Max. :78.00 Max. :186.0 Max. :135 Max. :64.00
## Hemoglobin Hematocrit Cholesterol Glucoze
## Min. :143.0 Min. :45.00 Min. :3.500 Min. :3.800
## 1st Qu.:151.5 1st Qu.:48.75 1st Qu.:4.075 1st Qu.:4.675
## Median :157.0 Median :51.00 Median :4.650 Median :5.450
## Mean :157.1 Mean :52.00 Mean :4.730 Mean :5.348
## 3rd Qu.:160.0 3rd Qu.:55.00 3rd Qu.:4.900 3rd Qu.:6.000
## Max. :183.0 Max. :69.00 Max. :8.000 Max. :7.200
## Gender BMI GenderFactor
## Min. :0.00 Min. :19.72 F: 9
## 1st Qu.:0.00 1st Qu.:20.07 M:11
## Median :1.00 Median :21.11
## Mean :0.55 Mean :21.20
## 3rd Qu.:1.00 3rd Qu.:22.35
## Max. :1.00 Max. :23.25
# install.packages("psych")  # one-time setup if the package is missing
library(psych)
# Extended descriptive statistics for every column; the factor column is
# reported with a trailing "*" in the output.
psych::describe(x = data)
## vars n mean sd median trimmed mad min max
## Weight 1 20 65.65 8.08 64.50 65.38 11.12 55.00 78.00
## Height_cm 2 20 175.57 5.91 174.50 175.41 6.67 166.00 186.00
## Pressure 3 20 108.00 10.58 106.00 106.81 8.90 90.00 135.00
## HeartBeat 4 20 55.50 3.99 54.50 55.25 2.97 49.00 64.00
## Hemoglobin 5 20 157.05 9.69 157.00 156.19 6.67 143.00 183.00
## Hematocrit 6 20 52.00 5.57 51.00 51.38 4.45 45.00 69.00
## Cholesterol 7 20 4.73 0.94 4.65 4.59 0.52 3.50 8.00
## Glucoze 8 20 5.35 1.02 5.45 5.32 1.04 3.80 7.20
## Gender 9 20 0.55 0.51 1.00 0.56 0.00 0.00 1.00
## BMI 10 20 21.20 1.22 21.10 21.15 1.63 19.72 23.25
## GenderFactor* 11 20 1.55 0.51 2.00 1.56 0.00 1.00 2.00
## range skew kurtosis se
## Weight 23.00 0.24 -1.53 1.81
## Height_cm 20.00 0.11 -1.27 1.32
## Pressure 45.00 0.83 0.39 2.37
## HeartBeat 15.00 0.50 -0.56 0.89
## Hemoglobin 40.00 0.82 0.53 2.17
## Hematocrit 24.00 1.31 1.86 1.25
## Cholesterol 4.50 1.94 4.68 0.21
## Glucoze 3.40 0.12 -1.15 0.23
## Gender 1.00 -0.19 -2.06 0.11
## BMI 3.53 0.29 -1.52 0.27
## GenderFactor* 1.00 -0.19 -2.06 0.11
# install.packages("pastecs")  # one-time setup if the package is missing
library(pastecs)
# Descriptive statistics table for the numeric columns, rounded to two
# decimals for readability.
round(stat.desc(data_numeric), digits = 2)
## Weight Height_cm Pressure HeartBeat Hemoglobin
## nbr.val 20.00 20.00 20.00 20.00 20.00
## nbr.null 0.00 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00
## min 55.00 166.00 90.00 49.00 143.00
## max 78.00 186.00 135.00 64.00 183.00
## range 23.00 20.00 45.00 15.00 40.00
## sum 1313.00 3511.50 2160.00 1110.00 3141.00
## median 64.50 174.50 106.00 54.50 157.00
## mean 65.65 175.57 108.00 55.50 157.05
## SE.mean 1.81 1.32 2.37 0.89 2.17
## CI.mean.0.95 3.78 2.76 4.95 1.87 4.53
## var 65.29 34.88 111.89 15.95 93.84
## std.dev 8.08 5.91 10.58 3.99 9.69
## coef.var 0.12 0.03 0.10 0.07 0.06
## Hematocrit Cholesterol Glucoze Gender BMI
## nbr.val 20.00 20.00 20.00 20.00 20.00
## nbr.null 0.00 0.00 0.00 9.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00
## min 45.00 3.50 3.80 0.00 19.72
## max 69.00 8.00 7.20 1.00 23.25
## range 24.00 4.50 3.40 1.00 3.53
## sum 1040.00 94.60 106.95 11.00 424.02
## median 51.00 4.65 5.45 1.00 21.10
## mean 52.00 4.73 5.35 0.55 21.20
## SE.mean 1.25 0.21 0.23 0.11 0.27
## CI.mean.0.95 2.61 0.44 0.48 0.24 0.57
## var 31.05 0.88 1.04 0.26 1.49
## std.dev 5.57 0.94 1.02 0.51 1.22
## coef.var 0.11 0.20 0.19 0.93 0.06
# Rows whose gender label is "M".
data_male <- data[data[["GenderFactor"]] == "M", ]
# Third quartile (75th percentile) of pressure.
quantile(data[["Pressure"]], probs = 0.75)
## 75%
## 112
# 112 equals the 75th percentile of Pressure printed by the preceding
# quantile() call.
# Rows that are male AND have pressure at or above that value.
data_maleandpressure <- data[data$GenderFactor == "M" & data$Pressure >= 112, ]
# Rows that are male OR have pressure at or above that value.
data_maleorpressure <- data[data$GenderFactor == "M" | data$Pressure >= 112, ]
# Descriptive statistics of height, reported separately per gender.
# psych is already attached earlier in the script, so it is not loaded a
# second time; the call is namespace-qualified like psych::describe() above.
psych::describeBy(data$Height_cm, group = data$GenderFactor)
##
## Descriptive statistics by group
## group: F
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 9 173.06 3.91 173.5 173.06 5.19 168 179 11 0.09 -1.46
## se
## X1 1.3
## ----------------------------------------------------
## group: M
## vars n mean sd median trimmed mad min max range skew
## X1 1 11 177.64 6.61 181 178 5.93 166 186 20 -0.41
## kurtosis se
## X1 -1.39 1.99
# Per-gender totals of the first two columns (Weight and Height_cm).
data_agg <- aggregate(x = data[, 1:2], by = list(data$GenderFactor), FUN = sum)
# Per-gender medians of the same two columns. The grouping list is unnamed,
# so the group column is labelled "Group.1" in the result.
aggregate(x = data[, 1:2], by = list(data$GenderFactor), FUN = median)
## Group.1 Weight Height_cm
## 1 F 61 173.5
## 2 M 73 181.0
# 0/1 indicator: 1 when pressure is at least 112, otherwise 0.
data$HighBloodPressure <- as.numeric(data$Pressure >= 112)
# The mean of a 0/1 indicator is the proportion of rows flagged with 1.
mean(data[["HighBloodPressure"]])
## [1] 0.3