# One-time setup: install readr. The package name must be wrapped in plain
# ASCII quotes; typographic (curly) quotes are not valid R syntax.
install.packages("readr")

Question 1

library(readr)
# Location of the decathlon results on disk.
file_path <- "/Users/nicoleborunda/Downloads/decath.csv"
# Parse the CSV with readr; the result is not stored, only auto-printed.
read_csv(file = file_path)
## Rows: 33 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): 100, long, poid, haut, 400, 110, disq, perc, jave, 1500
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 33 × 10
##    `100`  long  poid  haut `400` `110`  disq  perc  jave `1500`
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1  11.2  7.43  15.5  2.27  48.9  15.1  49.3   4.7  61.3   269.
##  2  10.9  7.45  15.0  1.97  47.7  14.5  44.4   5.1  61.8   273.
##  3  11.2  7.44  14.2  1.97  48.3  14.8  43.7   5.2  64.2   263.
##  4  10.6  7.38  15.0  2.03  49.1  14.7  44.8   4.9  64.0   285.
##  5  11.0  7.43  12.9  1.97  47.4  14.4  41.2   5.2  57.5   257.
##  6  10.8  7.72  13.6  2.12  48.3  14.2  43.1   4.9  52.2   274.
##  7  11.2  7.05  14.1  2.06  49.3  14.4  41.7   5.7  61.6   291.
##  8  11.0  6.95  15.3  2     48.2  14.4  41.3   4.8  63     266.
##  9  11.2  7.12  14.5  2.03  49.2  14.7  42.4   4.9  66.5   270.
## 10  11.2  7.28  15.2  1.97  48.6  14.8  48.0   5.2  59.5   292.
## # ℹ 23 more rows
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.1
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# file_path is only the character string holding the path, so testing it
# reports FALSE regardless of what the file contains. is_tibble() replaces the
# deprecated is.tibble().
is_tibble(file_path)
## [1] FALSE
# The check has to be run on the object that read_csv() returns.
is_tibble(read_csv(file_path, show_col_types = FALSE))
## [1] TRUE

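It says “a tibble” in the top left, and the object returned by read_csv() really is a tibble. The FALSE above comes from passing file_path (a character string holding the path) to is.tibble()/is_tibble() instead of the data itself; running is_tibble(read_csv(file_path)) returns TRUE.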

# Base R reader: returns a plain data.frame and prefixes column names that
# start with a digit with "X" (`100` becomes X100). Reuses file_path rather
# than repeating the literal path.
data <- read.csv(file_path)
# print.data.frame() has no nrow/ncol arguments (they were silently ignored);
# a table of this size is printed in full by default.
print(data)
##     X100 long  poid haut  X400  X110  disq perc  jave  X1500
## 1  11.25 7.43 15.48 2.27 48.90 15.13 49.28  4.7 61.32 268.95
## 2  10.87 7.45 14.97 1.97 47.71 14.46 44.36  5.1 61.76 273.02
## 3  11.18 7.44 14.20 1.97 48.29 14.81 43.66  5.2 64.16 263.20
## 4  10.62 7.38 15.02 2.03 49.06 14.72 44.80  4.9 64.04 285.11
## 5  11.02 7.43 12.92 1.97 47.44 14.40 41.20  5.2 57.46 256.64
## 6  10.83 7.72 13.58 2.12 48.34 14.18 43.06  4.9 52.18 274.07
## 7  11.18 7.05 14.12 2.06 49.34 14.39 41.68  5.7 61.60 291.20
## 8  11.05 6.95 15.34 2.00 48.21 14.36 41.32  4.8 63.00 265.86
## 9  11.15 7.12 14.52 2.03 49.15 14.66 42.36  4.9 66.46 269.62
## 10 11.23 7.28 15.25 1.97 48.60 14.76 48.02  5.2 59.48 292.24
## 11 10.94 7.45 15.34 1.97 49.94 14.25 41.86  4.8 66.64 295.89
## 12 11.18 7.34 14.48 1.94 49.02 15.11 42.76  4.7 65.84 256.74
## 13 11.02 7.29 12.92 2.06 48.23 14.94 39.54  5.0 56.80 257.85
## 14 10.99 7.37 13.61 1.97 47.83 14.70 43.88  4.3 66.54 268.97
## 15 11.03 7.45 14.20 1.97 48.94 15.44 41.66  4.7 64.00 267.48
## 16 11.09 7.08 14.51 2.03 49.89 14.78 43.20  4.9 57.18 268.54
## 17 11.46 6.75 16.07 2.00 51.28 16.06 50.66  4.8 72.60 302.42
## 18 11.57 7.00 16.60 1.94 49.84 15.00 46.66  4.9 60.20 286.04
## 19 11.07 7.04 13.41 1.94 47.97 14.96 40.38  4.5 51.50 262.41
## 20 10.89 7.07 15.84 1.79 49.68 15.38 45.32  4.9 60.48 277.84
## 21 11.52 7.36 13.93 1.94 49.99 15.64 38.82  4.6 67.04 266.42
## 22 11.49 7.02 13.80 2.03 50.60 15.22 39.08  4.7 60.92 262.93
## 23 11.38 7.08 14.31 2.00 50.24 14.97 46.34  4.4 55.68 272.68
## 24 11.30 6.97 13.23 2.15 49.98 15.38 38.72  4.6 54.34 277.84
## 25 11.00 7.23 13.15 2.03 49.73 14.96 38.06  4.5 52.82 285.57
## 26 11.33 6.83 11.63 2.06 48.37 15.39 37.52  4.6 55.42 270.07
## 27 11.10 6.98 12.69 1.82 48.63 15.13 38.04  4.7 49.52 261.90
## 28 11.51 7.01 14.17 1.94 51.16 15.18 45.84  4.6 56.28 303.17
## 29 11.26 6.90 12.41 1.88 48.24 15.61 38.02  4.4 52.68 272.06
## 30 11.50 7.09 12.94 1.82 49.27 15.56 42.32  4.5 53.50 293.85
## 31 11.43 6.22 13.98 1.91 51.25 15.88 46.18  4.6 57.84 294.99
## 32 11.47 6.43 12.33 1.94 50.30 15.00 38.72  4.0 57.26 293.72
## 33 11.57 7.19 10.27 1.91 50.71 16.20 34.36  4.1 54.94 269.98

Question 2

# Mean result per event across all athletes. read.csv() adds no row-name or ID
# column, so every column is an event; indexing with [, -1] would leave out
# the 100 m (X100).
mean_scores <- colMeans(data)
mean_scores
##       X100       long       poid       haut       X400       X110       disq 
##  11.196364   7.133333  13.976364   1.982727  49.276667  15.048788  42.353939 
##       perc       jave      X1500 
##   4.739394  59.438788 276.038485
# Sample standard deviation per event, again over every column. vapply()
# iterates the data.frame column by column and guarantees one number each.
sd_scores <- vapply(data, sd, numeric(1))
sd_scores
##       X100       long       poid       haut       X400       X110       disq 
##  0.2433210  0.3043401  1.3319906  0.0939838  1.0696602  0.5067652  3.7191312 
##       perc       jave      X1500 
##  0.3344206  5.4959984 13.6570975

Question 3

# Pairwise covariance and correlation across all ten events.
cov_matrix <- cov(data)
cor_matrix <- cor(data)
# Display the correlations to two decimal places.
rounded_cor_matrix <- round(cor_matrix, digits = 2)
print(rounded_cor_matrix)
##        X100  long  poid  haut  X400  X110  disq  perc  jave X1500
## X100   1.00 -0.54 -0.21 -0.15  0.61  0.64 -0.05 -0.39 -0.06  0.26
## long  -0.54  1.00  0.14  0.27 -0.52 -0.48  0.04  0.35  0.18 -0.40
## poid  -0.21  0.14  1.00  0.12  0.09 -0.30  0.81  0.48  0.60  0.27
## haut  -0.15  0.27  0.12  1.00 -0.09 -0.31  0.15  0.21  0.12 -0.11
## X400   0.61 -0.52  0.09 -0.09  1.00  0.55  0.14 -0.32  0.12  0.59
## X110   0.64 -0.48 -0.30 -0.31  0.55  1.00 -0.11 -0.52 -0.06  0.14
## disq  -0.05  0.04  0.81  0.15  0.14 -0.11  1.00  0.34  0.44  0.40
## perc  -0.39  0.35  0.48  0.21 -0.32 -0.52  0.34  1.00  0.27 -0.03
## jave  -0.06  0.18  0.60  0.12  0.12 -0.06  0.44  0.27  1.00  0.10
## X1500  0.26 -0.40  0.27 -0.11  0.59  0.14  0.40 -0.03  0.10  1.00

Explanations of pairwise correlations that have an absolute value > 0.5:

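  1. X100 and long (-0.54): At first glance these look like different events requiring different skills: the X100 is purely about speed, whereas the long jump is about distance jumped, which requires more dimensions of control. However, X100 is a time (lower is better) while long is a distance (higher is better), so the negative sign actually means that faster sprinters tend to jump farther, which is plausible because the long jump depends on run-up speed.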

  2. X100 and X400 (0.61): The positive relationship here makes sense. Although 400 meters requires a different pacing than the 100 meter sprint, both are short distance races.

  3. X100 and X110 (0.64): I think you could argue this either way, but I think it makes sense given the short sprint distances. Someone who can sprint the 110 m hurdles would probably also perform well at the 100 meters, even though the events take different skills.

  4. X400 and X110 (0.55): Similarly to the X100 and X110 explanation, I think this moderate correlation probably makes sense because of the short-distance sprinting component of both races.

  5. disq and poid (0.81): Shot put and discus show a strong positive correlation. This makes sense given that both events require upper body strength and hand-eye coordination.

  6. poid and jave (0.60): Like the discus and shot put, I think this probably makes sense given the skills and strength required. The correlation is probably lower for this pair because the javelin differs much more from the shot put than the discus does.

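  7. long and X400 (-0.52): Similar to the X100 and long pair, these look like different events requiring different skills: the X400 is purely about speed, whereas the long jump is about distance jumped, which requires more dimensions of control. Because X400 is a time (lower is better) and long is a distance (higher is better), the negative sign means that faster 400 m runners tend to jump farther, which again points to shared running speed.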

  8. X1500 and X400 (0.59): A moderate correlation makes sense given that these are both just running events. It’s logical that it isn’t very strong, given that the 1500 meters requires different pacing than a 400 meter sprint.

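  9. perc and X110 (-0.52): A negative correlation could reflect that the body build and skills associated with these events are very different. Note, though, that perc is a height (higher is better) while X110 is a time (lower is better), so the negative sign means that faster hurdlers tend to vault higher.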

Question 4

# One statement per line, with plain ASCII quotes around the package name;
# the fused, curly-quoted form does not parse.
install.packages("corrplot")
library(corrplot)
library(ggplot2)

# Correlate every event with every other one. read.csv() adds no ID column,
# so all ten columns are kept; data[, -1] would leave the 100 m out of the map.
correlation_matrix <- cor(data)

# Order the events by hierarchical clustering of their correlation profiles so
# that events which behave alike end up next to each other in the heat map.
hc <- hclust(dist(correlation_matrix))
ordered_matrix <- correlation_matrix[hc$order, hc$order]

# melt() turns the matrix into long form (Var1, Var2, value), one tile per pair.
ggplot(
  data = reshape2::melt(ordered_matrix),
  aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white") +
  theme_minimal() +
  labs(x = "Events X", y = "Events Y")

There are clusters around negative and positive correlations related to how similar events are. For example, the blue clusters show negative correlations between sprint events and events where people throw things or jump. This makes sense given the events require different skills and training. Another example is a red cluster where the events are related such as the throwing events: javelin, disc, and shot put.

Question 5

Scatterplot of discus and shot put

# read.csv() already returns a data.frame; the copy is kept so data_df stays
# available under its existing name.
data_df <- as.data.frame(data)

# Take every athlete. The header is not stored as a row, so indexing with -1
# would discard the first competitor. The columns are already numeric.
disq <- data_df[["disq"]]
poid <- data_df[["poid"]]

# x axis = shot put (poid), y axis = discus (disq).
plot(poid, disq, main = "Scatterplot of Discus vs Shot Put",
     xlab = "Shot Put", ylab = "Discus")

shot_put <- data_df[["poid"]]
javelin <- data_df[["jave"]]

# x axis = shot put, y axis = javelin.
plot(shot_put, javelin, main = "Scatterplot of Shot Put vs Javelin",
     xlab = "Shot Put", ylab = "Javelin")

# Scatterplot of Discus vs Javelin (disq is reused from above).
jave <- data_df[["jave"]]

plot(disq, jave, main = "Scatterplot of Discus vs Javelin",
     xlab = "Discus", ylab = "Javelin")

# Set the margins (bottom, left, top, right) used by every panel.
par(mar = c(4, 4, 2, 2))

# 2 x 2 grid filled row by row; the top row is twice as tall as the bottom row.
layout(matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE), heights = c(2, 1))

# Panel 1: discus (y) against shot put (x); labels follow the plotted vectors.
plot(poid, disq, main = "Scatterplot of Discus vs Shot Put",
     xlab = "Shot Put", ylab = "Discus")

# Panel 2: discus (x) against javelin (y).
plot(disq, jave, main = "Scatterplot of Discus vs Javelin",
     xlab = "Discus", ylab = "Javelin")

# Panel 3: shot put (x) against javelin (y).
plot(poid, jave, main = "Scatterplot of Shot Put vs Javelin",
     xlab = "Shot Put", ylab = "Javelin")

# Panel 4: distribution of shot put results.
hist(poid, main = "Histogram of Shot Put Scores",
     xlab = "Shot Put")

## Question 7

# Two panels side by side on the top row and one panel spanning the bottom row.
# Changing layout_matrix after layout() has been called does not alter the
# layout, so the former post-hoc scaling of its second row is dropped.
layout_matrix <- matrix(c(1, 2, 3, 3), nrow = 2, ncol = 2, byrow = TRUE)
layout(layout_matrix)


# Scatterplot of Discus vs Shot Put: shot put (poid) on x, discus (disq) on y.
plot(poid, disq, main = "Scatterplot of Discus vs Shot Put",
     xlab = "Shot Put", ylab = "Discus")

# Shot Put vs Javelin: shot put (poid) on x, javelin (jave) on y.
plot(poid, jave, main = "Scatterplot of Shot Put vs Javelin",
     xlab = "Shot Put", ylab = "Javelin")

# Histogram of Shot Put scores
hist(poid, main = "Histogram of Shot Put Scores",
     xlab = "Shot Put")