Homework 1

Anja Kadič

a)

# Load the marathon data set: semicolon-separated file with a header row
# and a decimal comma.
mydata <- read.table(
  file = "./Marathon.csv",
  header = TRUE,
  sep = ";",
  dec = ","
)
# Show the first rows to check that the import worked.
head(mydata)
##   ID Weight Height Pressure Beat Hemoglobin Hematocrit Cholesterol
## 1  1     72  179.0      105   64        160         50         4.9
## 2  2     68  178.0      105   60        158         51         4.8
## 3  3     64  174.0      109   54        155         51         4.5
## 4  4     63  174.0      112   54        153         58         8.0
## 5  5     61  173.5      100   53        152         59         4.6
## 6  6     60  173.0       99   53        158         49         3.9
##   Glucose Gender
## 1     4.7      1
## 2     4.9      0
## 3     7.0      0
## 4     7.2      0
## 5     6.7      0
## 6     6.0      0

Unit: Athlete

Variables: Height, Weight, Glucose, Hematocrit, Gender, …

All variables are numerical, except Gender, which is categorical.

Explanation of the variable:

  • Gender: 0:F, 1:M

b)

# Arithmetic mean of athlete height, ignoring missing values.
mean(mydata[["Height"]], na.rm = TRUE)
## [1] 176.9571
# Sample standard deviation of athlete height, ignoring missing values.
sd(mydata[["Height"]], na.rm = TRUE)
## [1] 5.85156

Average: the arithmetic mean; on average, an athlete is about 177 cm tall.

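SD: the standard deviation additionally describes the data: it measures how far the heights typically deviate from the mean (about 5.85 cm).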

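Interval in which about 68.3% of all athletes fall (assuming height is approximately normally distributed): 177 ± 5.85 cm (mean ± 1 SD), i.e. from about 171.1 cm to 182.8 cm.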

c)

# Recode the 0/1 gender indicator as a labelled factor: 0 -> "F", 1 -> "M".
mydata[["Gender"]] <- factor(
  mydata[["Gender"]],
  levels = c(0, 1),
  labels = c("F", "M")
)

d)

# Five-number summary plus mean of glucose, computed separately per gender.
by(data = mydata$Glucose, INDICES = mydata$Gender, FUN = summary)
## mydata$Gender: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.600   5.300   5.800   5.957   6.925   7.200 
## ---------------------------------------------------- 
## mydata$Gender: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.800   4.000   4.600   4.536   4.700   6.000

h)

# Base-graphics box plot of glucose level split by gender.
boxplot(Glucose ~ Gender, data = mydata)

# Same plot with a title. The formula interface is required here: passing
# mydata$Glucose and mydata$Gender as two separate vectors would draw one
# box per vector (glucose values next to the gender codes) instead of one
# glucose box per gender.
boxplot(Glucose ~ Gender, data = mydata, main = "Glucose by Gender")

library(ggplot2)
# ggplot2 box plot of glucose by gender, one fill colour per gender level.
ggplot(data = mydata, mapping = aes(x = Gender, y = Glucose, fill = Gender)) +
  geom_boxplot() +
  ggtitle("Glucose Levels by Gender") +
  xlab("Gender") +
  ylab("Glucose") +
  scale_fill_manual(values = c("pink", "lightblue"))

Additional exercise

Drop the units ID 18, 19 and 20

Example 1

# Keep every row whose ID is not 18, 19 or 20 (base R logical indexing).
mydata2 <- mydata[!(mydata$ID %in% c(18, 19, 20)), ]

Example 2

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Same row removal with dplyr: drop the units with ID 18, 19 and 20.
mydata4 <- dplyr::filter(mydata, !(ID %in% c(18, 19, 20)))

f)

library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
# Descriptive statistics for all columns except the 1st and the 10th
# (ID and Gender in the data shown above), rounded to two decimals.
round(stat.desc(mydata[, -c(1, 10)]), digits = 2)
##               Weight  Height Pressure    Beat Hemoglobin Hematocrit
## nbr.val        35.00   35.00    35.00   35.00      35.00      35.00
## nbr.null        0.00    0.00     0.00    0.00       0.00       0.00
## nbr.na          0.00    0.00     0.00    0.00       0.00       0.00
## min            55.00  166.00    90.00   49.00     143.00      45.00
## max            81.00  189.00   135.00   64.00     183.00      69.00
## range          26.00   23.00    45.00   15.00      40.00      24.00
## sum          2375.00 6193.50  3838.00 1967.00    5445.00    1801.00
## median         68.00  177.00   108.00   55.00     157.00      51.00
## mean           67.86  176.96   109.66   56.20     155.57      51.46
## SE.mean         1.30    0.99     1.79    0.67       1.45       0.82
## CI.mean.0.95    2.64    2.01     3.64    1.37       2.94       1.66
## var            59.01   34.24   112.47   15.81      73.13      23.49
## std.dev         7.68    5.85    10.61    3.98       8.55       4.85
## coef.var        0.11    0.03     0.10    0.07       0.05       0.09
##              Cholesterol Glucose
## nbr.val            35.00   35.00
## nbr.null            0.00    0.00
## nbr.na              0.00    0.00
## min                 3.40    3.80
## max                 8.00    7.20
## range               4.60    3.40
## sum               167.60  178.65
## median              4.70    4.80
## mean                4.79    5.10
## SE.mean             0.17    0.18
## CI.mean.0.95        0.34    0.36
## var                 1.00    1.12
## std.dev             1.00    1.06
## coef.var            0.21    0.21

95% confidence interval for weight: 67.857 ± 2.639, i.e. from 65.218 to 70.496

se(y_hat) = 1.298 (the standard error of the mean, SE.mean in the table above)

# One-sample t-test on weight; used here for its 95% confidence interval
# of the mean (the default null hypothesis is a true mean of 0).
t.test(x = mydata$Weight, conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  mydata$Weight
## t = 52.26, df = 34, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  65.21839 70.49590
## sample estimates:
## mean of x 
##  67.85714

We are 95% confident that the population mean weight lies within this range (65.22 to 70.50).

g)

library(ggplot2)
# Histogram of hematocrit using bins of width 2.
ggplot(data = mydata, mapping = aes(x = Hematocrit)) +
  geom_histogram(binwidth = 2, color = "black", fill = "lightblue") +
  ggtitle("Hematocrit distribution") +
  xlab("Hematocrit (%)") +
  ylab("Frequency")

Explanation: the distribution is asymmetrical to the right (right-skewed); there may also be some outliers.