# Install readr only when it is missing, so re-running the script does not
# reinstall the package every time. (The original call used curly quotes,
# which R cannot parse.)
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
library(readr)
# Absolute path to the decathlon results CSV.
file_path <- "/Users/nicoleborunda/Downloads/decath.csv"
# Read and print the data; the result is not stored.
read_csv(file_path)
## Rows: 33 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): 100, long, poid, haut, 400, 110, disq, perc, jave, 1500
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 33 × 10
## `100` long poid haut `400` `110` disq perc jave `1500`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11.2 7.43 15.5 2.27 48.9 15.1 49.3 4.7 61.3 269.
## 2 10.9 7.45 15.0 1.97 47.7 14.5 44.4 5.1 61.8 273.
## 3 11.2 7.44 14.2 1.97 48.3 14.8 43.7 5.2 64.2 263.
## 4 10.6 7.38 15.0 2.03 49.1 14.7 44.8 4.9 64.0 285.
## 5 11.0 7.43 12.9 1.97 47.4 14.4 41.2 5.2 57.5 257.
## 6 10.8 7.72 13.6 2.12 48.3 14.2 43.1 4.9 52.2 274.
## 7 11.2 7.05 14.1 2.06 49.3 14.4 41.7 5.7 61.6 291.
## 8 11.0 6.95 15.3 2 48.2 14.4 41.3 4.8 63 266.
## 9 11.2 7.12 14.5 2.03 49.2 14.7 42.4 4.9 66.5 270.
## 10 11.2 7.28 15.2 1.97 48.6 14.8 48.0 5.2 59.5 292.
## # ℹ 23 more rows
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.1
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# is.tibble(file_path) returned FALSE because file_path is only a character
# string holding the path, not the data read from it. Test the object that
# read_csv() returns instead, and use is_tibble() because is.tibble() was
# deprecated in tibble 2.0.0.
decath_tbl <- read_csv(file_path, show_col_types = FALSE)
is_tibble(decath_tbl)
## [1] TRUE
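It says “A tibble” in the top left of the output, and the data returned by read_csv() is indeed a tibble. When I ran is.tibble(file_path) (or is_tibble(file_path)) it said FALSE, but that is because file_path is only a character string containing the path to the file, not the data itself; running is_tibble() on the result of read_csv(file_path) returns TRUE.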
# Read the same file with base R. read.csv() makes the numeric column names
# syntactic by prefixing them with "X" (X100, X400, X110, X1500).
data <- read.csv("/Users/nicoleborunda/Downloads/decath.csv")
# print.data.frame() has no `nrow`/`ncol` arguments (they were silently
# ignored), and a data frame of this size prints in full by default.
print(data)
## X100 long poid haut X400 X110 disq perc jave X1500
## 1 11.25 7.43 15.48 2.27 48.90 15.13 49.28 4.7 61.32 268.95
## 2 10.87 7.45 14.97 1.97 47.71 14.46 44.36 5.1 61.76 273.02
## 3 11.18 7.44 14.20 1.97 48.29 14.81 43.66 5.2 64.16 263.20
## 4 10.62 7.38 15.02 2.03 49.06 14.72 44.80 4.9 64.04 285.11
## 5 11.02 7.43 12.92 1.97 47.44 14.40 41.20 5.2 57.46 256.64
## 6 10.83 7.72 13.58 2.12 48.34 14.18 43.06 4.9 52.18 274.07
## 7 11.18 7.05 14.12 2.06 49.34 14.39 41.68 5.7 61.60 291.20
## 8 11.05 6.95 15.34 2.00 48.21 14.36 41.32 4.8 63.00 265.86
## 9 11.15 7.12 14.52 2.03 49.15 14.66 42.36 4.9 66.46 269.62
## 10 11.23 7.28 15.25 1.97 48.60 14.76 48.02 5.2 59.48 292.24
## 11 10.94 7.45 15.34 1.97 49.94 14.25 41.86 4.8 66.64 295.89
## 12 11.18 7.34 14.48 1.94 49.02 15.11 42.76 4.7 65.84 256.74
## 13 11.02 7.29 12.92 2.06 48.23 14.94 39.54 5.0 56.80 257.85
## 14 10.99 7.37 13.61 1.97 47.83 14.70 43.88 4.3 66.54 268.97
## 15 11.03 7.45 14.20 1.97 48.94 15.44 41.66 4.7 64.00 267.48
## 16 11.09 7.08 14.51 2.03 49.89 14.78 43.20 4.9 57.18 268.54
## 17 11.46 6.75 16.07 2.00 51.28 16.06 50.66 4.8 72.60 302.42
## 18 11.57 7.00 16.60 1.94 49.84 15.00 46.66 4.9 60.20 286.04
## 19 11.07 7.04 13.41 1.94 47.97 14.96 40.38 4.5 51.50 262.41
## 20 10.89 7.07 15.84 1.79 49.68 15.38 45.32 4.9 60.48 277.84
## 21 11.52 7.36 13.93 1.94 49.99 15.64 38.82 4.6 67.04 266.42
## 22 11.49 7.02 13.80 2.03 50.60 15.22 39.08 4.7 60.92 262.93
## 23 11.38 7.08 14.31 2.00 50.24 14.97 46.34 4.4 55.68 272.68
## 24 11.30 6.97 13.23 2.15 49.98 15.38 38.72 4.6 54.34 277.84
## 25 11.00 7.23 13.15 2.03 49.73 14.96 38.06 4.5 52.82 285.57
## 26 11.33 6.83 11.63 2.06 48.37 15.39 37.52 4.6 55.42 270.07
## 27 11.10 6.98 12.69 1.82 48.63 15.13 38.04 4.7 49.52 261.90
## 28 11.51 7.01 14.17 1.94 51.16 15.18 45.84 4.6 56.28 303.17
## 29 11.26 6.90 12.41 1.88 48.24 15.61 38.02 4.4 52.68 272.06
## 30 11.50 7.09 12.94 1.82 49.27 15.56 42.32 4.5 53.50 293.85
## 31 11.43 6.22 13.98 1.91 51.25 15.88 46.18 4.6 57.84 294.99
## 32 11.47 6.43 12.33 1.94 50.30 15.00 38.72 4.0 57.26 293.72
## 33 11.57 7.19 10.27 1.91 50.71 16.20 34.36 4.1 54.94 269.98
# Mean of every event. All ten columns are events (there is no ID column), so
# no column is dropped; the earlier data[, -1] silently excluded X100, and any
# previously rendered output that lacks X100 predates this fix.
# vapply() avoids apply()'s coercion of the data frame to a matrix and
# guarantees one numeric value per column.
mean_scores <- vapply(data, mean, numeric(1))
mean_scores
## long poid haut X400 X110 disq perc
## 7.133333 13.976364 1.982727 49.276667 15.048788 42.353939 4.739394
## jave X1500
## 59.438788 276.038485
# Standard deviation of every event. All ten columns are events (there is no
# ID column), so no column is dropped; the earlier data[, -1] silently
# excluded X100, and any previously rendered output that lacks X100 predates
# this fix.
# vapply() avoids apply()'s coercion of the data frame to a matrix and
# guarantees one numeric value per column.
sd_scores <- vapply(data, sd, numeric(1))
sd_scores
## long poid haut X400 X110 disq perc
## 0.3043401 1.3319906 0.0939838 1.0696602 0.5067652 3.7191312 0.3344206
## jave X1500
## 5.4959984 13.6570975
# Pairwise covariances between all ten events.
cov_matrix <- cov(x = data)
# Pairwise correlations between all ten events.
cor_matrix <- cor(x = data)
# Round to two decimals so the matrix is easy to scan by eye.
rounded_cor_matrix <- round(cor_matrix, digits = 2)
print(x = rounded_cor_matrix)
## X100 long poid haut X400 X110 disq perc jave X1500
## X100 1.00 -0.54 -0.21 -0.15 0.61 0.64 -0.05 -0.39 -0.06 0.26
## long -0.54 1.00 0.14 0.27 -0.52 -0.48 0.04 0.35 0.18 -0.40
## poid -0.21 0.14 1.00 0.12 0.09 -0.30 0.81 0.48 0.60 0.27
## haut -0.15 0.27 0.12 1.00 -0.09 -0.31 0.15 0.21 0.12 -0.11
## X400 0.61 -0.52 0.09 -0.09 1.00 0.55 0.14 -0.32 0.12 0.59
## X110 0.64 -0.48 -0.30 -0.31 0.55 1.00 -0.11 -0.52 -0.06 0.14
## disq -0.05 0.04 0.81 0.15 0.14 -0.11 1.00 0.34 0.44 0.40
## perc -0.39 0.35 0.48 0.21 -0.32 -0.52 0.34 1.00 0.27 -0.03
## jave -0.06 0.18 0.60 0.12 0.12 -0.06 0.44 0.27 1.00 0.10
## X1500 0.26 -0.40 0.27 -0.11 0.59 0.14 0.40 -0.03 0.10 1.00
Explanations of pairwise correlations that have an absolute value > 0.5:
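X100 and long (-0.54): At first I couldn’t think of a reason these would be negatively correlated other than that they are simply different events requiring different skills: the X100 is purely about speed, whereas the long jump is about distance jumped, which requires more dimensions of control. However, X100 is a time (lower is better) while long is a distance (higher is better), so a negative correlation actually means that faster sprinters tend to jump farther, which makes sense because the long jump depends on run-up speed.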
X100 and X400 (0.61): The positive relationship here makes sense. Although 400 meters requires a different pacing than the 100 meter sprint, both are short distance races.
X100 and X110 (0.64): I think you could argue this either way, but I think it makes sense given the short sprint distances. Someone who can sprint the 110 meter hurdles would probably be able to perform well at the 100 meters, even though the events take different skills.
X400 and X110 (0.55): Similarly to the X100 and X110 explanation, I think this moderate correlation probably makes sense because of the short-distance sprinting component of both races.
disq and poid (0.81): Shot put and discus show a strong positive correlation. This makes sense given that both events require upper body strength and hand-eye coordination.
poid and jave (0.60): Like the discus and shot put, I think this probably makes sense given the skills and strength required. The correlation may be lower for this pair because the javelin differs from the shot put much more than the discus does.
long and X400 (-0.52): Similar to X100 and long, these are different events requiring different skills: the X400 is purely about speed, whereas the long jump is about distance jumped, which requires more dimensions of control. As with X100, X400 is a time (lower is better) while long is a distance (higher is better), so the negative correlation means that faster 400 meter runners tend to jump farther.
X1500 and X400 (0.59): A moderate correlation makes sense given that these are both just running events. It’s logical that it isn’t very strong given that the 1500 meters requires different pacing than a 400 meter sprint.
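perc and X110 (-0.52): A negative correlation makes sense given that the body build and skills associated with these events are very different. Note also that perc is a height (higher is better) while X110 is a time (lower is better), so the negative sign means that athletes who run the hurdles faster tend to vault higher.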
# Install corrplot only when it is missing, then attach the plotting packages.
# (The original put three statements on one line with curly quotes, which R
# cannot parse.)
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
library(ggplot2)

# Correlations between all ten events; data[, -1] would drop X100, which is an
# event column rather than an ID column.
correlation_matrix <- cor(data)
# Reorder the events by hierarchical clustering of their correlation profiles
# so that similar events end up next to each other in the heatmap.
hc <- hclust(dist(correlation_matrix))
ordered_matrix <- correlation_matrix[hc$order, hc$order]
# melt() reshapes the matrix into long form (Var1, Var2, value) for geom_tile().
ggplot(
  data = reshape2::melt(ordered_matrix),
  aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white") +
  theme_minimal() +
  labs(x = "Events X", y = "Events Y")
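There are clusters of negative and positive correlations related to how similar the events are. For example, the blue clusters show negative correlations between sprint events and events where people throw things or jump. This makes sense given that the events require different skills and training; in addition, sprint results are times (lower is better) while throws and jumps are distances or heights (higher is better), so some of the negative sign simply reflects that difference in scale direction. Another example is a red cluster where the events are related, such as the throwing events: javelin, discus, and shot put.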
# Convert to data frame (a no-op safeguard: read.csv() already returns one).
data_df <- as.data.frame(data)

# Use every athlete. Indexing rows with -1 dropped the first observation; the
# header line is not a data row, so nothing needs to be removed.
disq <- data_df[["disq"]]  # discus
poid <- data_df[["poid"]]  # shot put
# x is shot put and y is discus, so the axis labels follow that order.
plot(poid, disq, main = "Scatterplot of Discus vs Shot Put",
     xlab = "Shot Put", ylab = "Discus")

# Scatterplot of Shot Put vs Javelin
shot_put <- data_df[["poid"]]
javelin <- data_df[["jave"]]
plot(shot_put, javelin, main = "Scatterplot of Shot Put vs Javelin",
     xlab = "Shot Put", ylab = "Javelin")

# Scatterplot of Discus vs Javelin
disq <- data_df[["disq"]]
jave <- data_df[["jave"]]
plot(disq, jave, main = "Scatterplot of Discus vs Javelin",
     xlab = "Discus", ylab = "Javelin")
# Set the margins (bottom, left, top, right) for the panels below.
par(mar = c(4, 4, 2, 2))
# 2 x 2 grid of panels filled row by row; the top row is twice as tall as the
# bottom row.
layout(matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE), heights = c(2, 1))
# Discus (y) vs Shot Put (x): poid is shot put and disq is discus, so the
# axis labels must follow that order.
plot(poid, disq, main = "Scatterplot of Discus vs Shot Put",
     xlab = "Shot Put", ylab = "Discus")
# Discus (x) vs Javelin (y): plot disq so the data match the title and labels.
plot(disq, jave, main = "Scatterplot of Discus vs Javelin",
     xlab = "Discus", ylab = "Javelin")
# Shot Put (x) vs Javelin (y): plot poid so the data match the title and labels.
plot(poid, jave, main = "Scatterplot of Shot Put vs Javelin",
     xlab = "Shot Put", ylab = "Javelin")
# Histogram of Shot Put scores
hist(poid, main = "Histogram of Shot Put Scores",
     xlab = "Shot Put")
## Question 7
# Top row: two scatterplots side by side; bottom row: one full-width panel
# (region 3 spans both columns).
layout_matrix <- matrix(c(1, 2, 3, 3), nrow = 2, ncol = 2, byrow = TRUE)
layout(layout_matrix)
# Removed `layout_matrix[2, ] <- layout_matrix[2, ] * 2`: changing the matrix
# after layout() has been called does not affect the figure. To resize rows,
# pass `heights = ...` to layout() instead.
# Discus (y) vs Shot Put (x): poid is shot put and disq is discus, so the
# axis labels must follow that order.
plot(poid, disq, main = "Scatterplot of Discus vs Shot Put",
     xlab = "Shot Put", ylab = "Discus")
# Shot Put (x) vs Javelin (y): plot poid so the data match the title and labels.
plot(poid, jave, main = "Scatterplot of Shot Put vs Javelin",
     xlab = "Shot Put", ylab = "Javelin")
# Histogram of Shot Put scores
hist(poid, main = "Histogram of Shot Put Scores",
     xlab = "Shot Put")