Task 10 is meant to be a little more advanced and combines many aspects you have learnt today.
# Load the saved workspace file; the ls() output below shows the object `mlb`
load("../data/mlb.RData")
# List the objects that are now available in the workspace
ls()
## [1] "mlb"
# Number of rows in the data set
nrow(mlb)
## [1] 1034
# Preview the first rows together with the column names
head(mlb)
## player team position height weight age
## 1 Adam_Donachie BAL Catcher 74 180 22.99
## 2 Paul_Bako BAL Catcher 74 215 34.69
## 3 Ramon_Hernandez BAL Catcher 72 210 30.78
## 4 Kevin_Millar BAL First_Baseman 72 210 35.43
## 5 Chris_Gomez BAL First_Baseman 73 188 35.71
## 6 Brian_Roberts BAL Second_Baseman 69 176 29.39
# Full row of the tallest player: which.max() gives the index of the largest height
mlb[which.max(mlb$height), ]
## player team position height weight age
## 929 Jon_Rauch WAS Relief_Pitcher 83 260 28.42
# The same height passed through cm(), which converts inches to centimetres
cm(mlb[which.max(mlb$height), "height"])
## [1] 210.8
# Work on a copy so that `mlb` keeps its original height and weight values
mlb_eu_measure <- mlb
#' Convert weights from pounds to kilograms
#'
#' @param lbs Numeric vector of weights in pounds.
#' @param b Conversion factor applied to `lbs`; defaults to 0.45359237
#'   (kilograms per pound).
#' @return `lbs` multiplied by `b`, i.e. the weights in kilograms when the
#'   default factor is used.
lbs_to_kg <- function(lbs, b = 0.45359237) {
  lbs * b
}
# Overwrite the weight column with its lbs_to_kg() conversion (pounds -> kilograms)
mlb_eu_measure$weight <- lbs_to_kg(mlb_eu_measure$weight)
# Overwrite the height column with its cm() conversion (inches -> centimetres)
mlb_eu_measure$height <- cm(mlb_eu_measure$height)
# First break the converted data set into one data frame per position level
pos <- split(mlb_eu_measure, f = mlb_eu_measure$position)
# Then, within each position, keep only the row holding the minimum weight
lapply(
  pos,
  function(players) {
    lightest <- which.min(players$weight)
    players[lightest, ]
  }
)
## $Catcher
## player team position height weight age
## 796 Carlos_Ruiz PHI Catcher 182.9 77.11 28.1
##
## $Designated_Hitter
## player team position height weight age
## 53 Jerry_Owens CWS Designated_Hitter 190.5 88.45 26.03
##
## $First_Baseman
## player team position height weight age
## 76 Howie_Kendrick ANA First_Baseman 177.8 81.65 23.64
##
## $Outfielder
## player team position height weight age
## 14 Brandon_Fahey BAL Outfielder 188 72.57 26.11
##
## $Relief_Pitcher
## player team position height weight age
## 828 Fabio_Castro PHI Relief_Pitcher 172.7 68.04 22.11
##
## $Second_Baseman
## player team position height weight age
## 457 Alexi_Casilla MIN Second_Baseman 175.3 72.57 22.61
##
## $Shortstop
## player team position height weight age
## 288 Oswaldo_Navarro SEA Shortstop 182.9 68.04 22.41
##
## $Starting_Pitcher
## player team position height weight age
## 368 Odalis_Perez KC Starting_Pitcher 182.9 68.04 29.73
##
## $Third_Baseman
## player team position height weight age
## 80 Maicer_Izturis ANA Third_Baseman 172.7 70.31 26.46
# One data frame per team
tm <- split(mlb_eu_measure, mlb_eu_measure$team)
# Mean age per team, computed once and reused below; vapply() guarantees a
# named numeric vector with exactly one value per team
team_age <- vapply(tm, function(x) mean(x$age), numeric(1))
# Name and position (within the vector) of the team with the highest mean age
which.max(team_age)
## NYM
## 18
# The highest mean age itself
max(team_age)
## [1] 30.52
# Mean age per position (reuses the per-position split stored in `pos`)
a <- vapply(pos, function(x) mean(x$age), numeric(1))
# Subsetting by which.min() keeps the name, so the position is printed too
a[which.min(a)]
## Starting_Pitcher
## 28.24
# NOTE(review): `nms` is not used in the code visible here — confirm it is
# needed further down before removing it
nms <- levels(mlb$position)
# let's do a boxplot
# mar is c(bottom, left, top, right): the wide left margin leaves room for the
# horizontally printed position labels
par(mar = c(3, 10, 3, 3))
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned
boxplot(height ~ position, data = mlb_eu_measure, las = 1, horizontal = TRUE)
boxplot(weight ~ position, data = mlb_eu_measure, las = 1, horizontal = TRUE)
# NOTE(review): this repeats the first height plot — confirm it is intentional
boxplot(height ~ position, data = mlb_eu_measure, las = 1, horizontal = TRUE)
!! *10. Write a function that compares the mean height of Starting Pitchers pairwise to another position by running a t-test. Why could such a function be useful? * !!
# Shorten every position label to its first five characters
# (e.g. "Starting_Pitcher" becomes "Start")
levels(mlb_eu_measure$position) <- substr(
  x = levels(mlb_eu_measure$position),
  start = 1,
  stop = 5
)
# define a simple function
#' Compare a numeric variable between a reference position and other positions
#'
#' Runs `t.test()` (by default a Welch two-sample test) on the values of
#' `variable` for the players in `ref_pos` versus the players in `var_pos`.
#'
#' @param var_pos Character vector of position label(s) to compare against
#'   the reference.
#' @param ref_pos Character vector of reference position label(s); defaults
#'   to "Start".
#' @param data Data frame with a `position` column and the column named in
#'   `variable`; defaults to `mlb_eu_measure`.
#' @param variable Name of the numeric column to compare; defaults to
#'   "height", which reproduces the original behaviour.
#' @param ... Further arguments forwarded to `t.test()`
#'   (e.g. `alternative`, `conf.level`).
#' @return The `htest` object returned by `t.test()`; the first sample is
#'   the reference group, the second the comparison group.
compare <- function(var_pos, ref_pos = "Start", data = mlb_eu_measure,
                    variable = "height", ...) {
  stopifnot(is.character(variable), length(variable) == 1L)
  if (!variable %in% names(data)) {
    stop("Column '", variable, "' not found in `data`.", call. = FALSE)
  }
  # `[[` returns a plain vector for data frames and tibbles alike
  values <- data[[variable]]
  # The local names `ref` and `var` are kept on purpose: t.test() prints them
  # in its "data: ref and var" line
  ref <- values[data$position %in% ref_pos]
  var <- values[data$position %in% var_pos]
  # Fail early with a readable message instead of t.test()'s generic
  # "not enough observations" error when a label matches no rows
  if (length(ref) == 0L) {
    stop("No observations found for position(s): ",
         paste(ref_pos, collapse = ", "), call. = FALSE)
  }
  if (length(var) == 0L) {
    stop("No observations found for position(s): ",
         paste(var_pos, collapse = ", "), call. = FALSE)
  }
  t.test(ref, var, ...)
}
# call the function once: the default reference "Start" is compared with
# "Desig" (the truncated label of Designated_Hitter)
compare("Desig")
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 1.103, df = 21.58, p-value = 0.282
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.113 3.639
## sample estimates:
## mean of x mean of y
## 189.8 188.5
# Show the truncated position labels that can be passed to compare()
levels(mlb_eu_measure$position)
## [1] "Catch" "Desig" "First" "Outfi" "Relie" "Secon" "Short" "Start" "Third"
# ok let's run a pairwise comparison of starting pitchers with ANY position
# setNames() labels each element with its position, so every t-test result is
# printed under a `$<position>` heading.
# Note: the reference level "Start" is itself one of the levels, so that entry
# compares the starting pitchers with themselves (t = 0, p-value = 1).
lapply(setNames(levels(mlb_eu_measure$position), levels(mlb_eu_measure$position)),
compare)
## $Catch
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 7.899, df = 164.9, p-value = 3.761e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.802 6.336
## sample estimates:
## mean of x mean of y
## 189.8 184.7
##
##
## $Desig
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 1.103, df = 21.58, p-value = 0.282
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.113 3.639
## sample estimates:
## mean of x mean of y
## 189.8 188.5
##
##
## $First
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 2.368, df = 93.1, p-value = 0.01994
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2952 3.3597
## sample estimates:
## mean of x mean of y
## 189.8 188.0
##
##
## $Outfi
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 8.161, df = 412.8, p-value = 4.066e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.296 5.387
## sample estimates:
## mean of x mean of y
## 189.8 185.4
##
##
## $Relie
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 1.762, df = 466.8, p-value = 0.07867
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1008 1.8526
## sample estimates:
## mean of x mean of y
## 189.8 188.9
##
##
## $Secon
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 12.48, df = 115.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.175 9.881
## sample estimates:
## mean of x mean of y
## 189.8 181.3
##
##
## $Short
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 9.66, df = 92.72, p-value = 1.104e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5.681 8.622
## sample estimates:
## mean of x mean of y
## 189.8 182.6
##
##
## $Start
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 0, df = 440, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.069 1.069
## sample estimates:
## mean of x mean of y
## 189.8 189.8
##
##
## $Third
##
## Welch Two Sample t-test
##
## data: ref and var
## t = 4.759, df = 65.57, p-value = 1.112e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.469 6.040
## sample estimates:
## mean of x mean of y
## 189.8 185.5