# Load the marathon data set: semicolon-separated fields, comma as the
# decimal mark, and column names taken from the first row.
mydata <- read.table(
  file = "./Marathon.csv",
  header = TRUE,
  sep = ";",
  dec = ","
)
# Preview the first rows to check that the file was parsed as expected.
head(x = mydata)
## ID Weight Height Pressure Beat Hemoglobin Hematocrit Cholesterol
## 1 1 72 179.0 105 64 160 50 4.9
## 2 2 68 178.0 105 60 158 51 4.8
## 3 3 64 174.0 109 54 155 51 4.5
## 4 4 63 174.0 112 54 153 58 8.0
## 5 5 61 173.5 100 53 152 59 4.6
## 6 6 60 173.0 99 53 158 49 3.9
## Glucose Gender
## 1 4.7 1
## 2 4.9 0
## 3 7.0 0
## 4 7.2 0
## 5 6.7 0
## 6 6.0 0
Unit: Athlete
Variables: Height, Weight, Glucose, Hematocrit, Gender, …
All variables are numerical, except Gender, which is categorical.
Explanation of the variable:
# Arithmetic mean of Height, ignoring missing values.
with(mydata, mean(Height, na.rm = TRUE))
## [1] 176.9571
# Sample standard deviation of Height, ignoring missing values.
with(mydata, sd(Height, na.rm = TRUE))
## [1] 5.85156
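Average: the arithmetic mean; on average, the athletes are about 177 cm tall.
SD: the standard deviation additionally describes the data by measuring how spread out the heights are around the mean.
Assuming heights are approximately normally distributed, about 68.3% of all athletes fall within one standard deviation of the mean: 177 ± 5.85 cm (roughly 171.1 cm to 182.8 cm).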
# Recode Gender from its numeric codes (0, 1) into a factor labelled "F", "M".
mydata$Gender <- factor(mydata$Gender, levels = 0:1, labels = c("F", "M"))
# Summary statistics of Glucose, computed separately for each Gender level.
by(data = mydata$Glucose, INDICES = mydata$Gender, FUN = summary)
## mydata$Gender: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.600 5.300 5.800 5.957 6.925 7.200
## ----------------------------------------------------
## mydata$Gender: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.800 4.000 4.600 4.536 4.700 6.000
# Side-by-side boxplots of Glucose, one box per Gender level.
boxplot(Glucose ~ Gender, data = mydata)
# Same comparison with a title. The formula interface is what groups Glucose
# by Gender; passing the two vectors positionally would instead draw each
# vector as its own box, and `data` is only understood by the formula method.
boxplot(Glucose ~ Gender, data = mydata, main = "Glucose by Gender")
library(ggplot2)
# Boxplot of Glucose by Gender, filled with one colour per Gender level
# (the manual fill values are supplied in the order "pink", "lightblue").
ggplot(
  data = mydata,
  mapping = aes(x = Gender, y = Glucose, fill = Gender)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("pink", "lightblue")) +
  labs(title = "Glucose Levels by Gender", x = "Gender", y = "Glucose")
Drop the units with ID 18, 19 and 20
Example 1
# Keep every row whose ID is not 18, 19 or 20.
mydata2 <- subset(x = mydata, subset = !(ID %in% 18:20))
Example 2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Same row removal with dplyr: keep rows whose ID is not 18, 19 or 20.
excluded_ids <- c(18, 19, 20)
mydata4 <- filter(mydata, !(ID %in% excluded_ids))
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Descriptive statistics for every column except the 1st (ID) and the
# 10th (Gender), rounded to two decimal places.
round(stat.desc(mydata[, -c(1, 10)]), digits = 2)
## Weight Height Pressure Beat Hemoglobin Hematocrit
## nbr.val 35.00 35.00 35.00 35.00 35.00 35.00
## nbr.null 0.00 0.00 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00 0.00
## min 55.00 166.00 90.00 49.00 143.00 45.00
## max 81.00 189.00 135.00 64.00 183.00 69.00
## range 26.00 23.00 45.00 15.00 40.00 24.00
## sum 2375.00 6193.50 3838.00 1967.00 5445.00 1801.00
## median 68.00 177.00 108.00 55.00 157.00 51.00
## mean 67.86 176.96 109.66 56.20 155.57 51.46
## SE.mean 1.30 0.99 1.79 0.67 1.45 0.82
## CI.mean.0.95 2.64 2.01 3.64 1.37 2.94 1.66
## var 59.01 34.24 112.47 15.81 73.13 23.49
## std.dev 7.68 5.85 10.61 3.98 8.55 4.85
## coef.var 0.11 0.03 0.10 0.07 0.05 0.09
## Cholesterol Glucose
## nbr.val 35.00 35.00
## nbr.null 0.00 0.00
## nbr.na 0.00 0.00
## min 3.40 3.80
## max 8.00 7.20
## range 4.60 3.40
## sum 167.60 178.65
## median 4.70 4.80
## mean 4.79 5.10
## SE.mean 0.17 0.18
## CI.mean.0.95 0.34 0.36
## var 1.00 1.12
## std.dev 1.00 1.06
## coef.var 0.21 0.21
95% confidence interval for the mean weight: 67.857 ± 2.639, i.e. from 65.218 to 70.496
Standard error of the mean: se(ȳ) = 1.298
# One-sample, two-sided t-test of Weight against a mean of 0; the output
# includes the 95% confidence interval for the mean.
t.test(
  x = mydata$Weight,
  alternative = "two.sided",
  mu = 0,
  conf.level = 0.95
)
##
## One Sample t-test
##
## data: mydata$Weight
## t = 52.26, df = 34, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 65.21839 70.49590
## sample estimates:
## mean of x
## 67.85714
We are 95% confident that the population mean weight lies within this range (65.22 to 70.50).
library(ggplot2)
# Histogram of Hematocrit with bins 2 units wide.
ggplot(data = mydata, mapping = aes(x = Hematocrit)) +
  geom_histogram(fill = "lightblue", colour = "black", binwidth = 2) +
  labs(
    x = "Hematocrit (%)",
    y = "Frequency",
    title = "Hematocrit distribution"
  )
Explanation: the distribution is asymmetrical, skewed to the right (positively skewed); there may also be some outliers.