library(MASS)
## Warning: package 'MASS' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data("Boston")
Boston_Chas <- Boston %>%
filter(chas == 1) %>%
select(dis, age, indus, nox, medv)
head(Boston_Chas)
## dis age indus nox medv
## 1 1.3216 100.0 19.58 0.871 13.4
## 2 1.6102 88.0 19.58 0.871 15.3
## 3 1.7494 96.0 19.58 0.871 17.0
## 4 1.7455 82.6 19.58 0.871 15.6
## 5 1.7984 92.6 19.58 0.605 27.0
## 6 2.0407 98.2 19.58 0.605 50.0
with(Boston_Chas, {
plot(indus, nox,
pch = 16, cex = 0.8,
xlab = "Industrial land (percent of acres)",
ylab = "NOx concentration (per 10 million)",
main = "NOx vs. Industrial Land (Charles River Tracts)",
cex.axis = 1.2, cex.lab = 1.2,
ylim = c(0, 1))
rug(indus, side = 1)
rug(nox, side = 2)
})
What is misleading with the rug plot for the indus variable? Explain with a full sentence.
It is misleading because there is overlap in the ticks. So it seems like there are less values at a given value than there are. Jittering allows this overlap to be displayed.
set.seed(180)
indus_j <- jitter(Boston_Chas$indus, amount = 0.2)
nox_j <- jitter(Boston_Chas$nox, amount = 0.025)
plot(indus_j, nox_j,
pch = 16, cex = 0.8,
xlab = "Industrial land (percent of acres, jittered)",
ylab = "NOx concentration (per 10 million, jittered)",
main = "NOx vs. Industrial Land (Jittered)",
cex.axis = 1.2, cex.lab = 1.2,
ylim = c(0, 1))
rug(indus_j, side = 1)
rug(nox_j, side = 2)
What is the relationship between the median value in the suburb and the two predictor variables? I’m not suggesting it is a simple relationship; what can you say about the more valuable suburbs (in general)?
The high value suburbs(ones with a large radius) tend to be closer to Boston and have newer housing.
op <- par(mar = c(5.1, 4.1, 4.1, 3.1)) # set before plotting
x <- Boston_Chas$dis
y <- Boston_Chas$age
r <- Boston_Chas$medv
plot(x, y,
xlab = "Distance to Boston employment centers (index units)",
ylab = "Owner-occupied units built prior to 1940 (%)",
main = "Bubble Plot: Value vs. Distance & Age (Charles River Tracts)",
cex.axis = 1.2, cex.lab = 1.2)
symbols(x, y,
circles = r, # medv as radius per instructions
inches = 0.3, # scale control (per spec)
bg = "lightblue",
lwd = 2,
add = TRUE)
lm_rm <- lm(medv ~ rm, data = Boston)
summary(lm_rm)
##
## Call:
## lm(formula = medv ~ rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.346 -2.547 0.090 2.986 39.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.671 2.650 -13.08 <2e-16 ***
## rm 9.102 0.419 21.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.616 on 504 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.4825
## F-statistic: 471.8 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$rm, Boston$medv,
pch = 16,
xlab = "Rooms per dwelling (rm)",
ylab = "Median home value (medv, $1000s)",
main = "medv vs rm with Fitted Line",
cex.axis = 1.2, cex.lab = 1.2)
abline(lm_rm, lwd = 2)
The slope of the regression line is around 9.1 per additional room. The number of rooms per dwelling increases with the median value.
coef(lm_rm)[2]
## rm
## 9.102109
The R² value is around .48 which indicates the variation in from the line that exists in the graph. Values closer to 1 imply a better fit so this graph doesn’t have the best fit but it’s not to bad.
summary(lm_rm)$r.squared
## [1] 0.4835255
The predicted median value would be around 19.94.
predict(lm_rm, newdata = data.frame(rm = 6))
## 1
## 19.94203
plot(Boston$rm, resid(lm_rm),
pch = 16,
xlab = "Rooms per dwelling (rm)",
ylab = "Residuals",
main = "Residuals vs rm",
cex.axis = 1.2, cex.lab = 1.2)
abline(h = 0, lwd = 2)
The residuals are centered for the most part and there is no strong curvature. There are a few outliers, so the linear model is mostly reasonable, but there are definitely some irregularities.
List the athlete(s) and countries printed above. Briefly note if any should get special treatment (e.g., true exceptional performance vs. data entry error).
Heptathlon88 <- read.csv("heptathlon.csv", stringsAsFactors = FALSE)
if (ncol(Heptathlon88) >= 2) {
Heptathlon88$athlete <- Heptathlon88[[1]]
Heptathlon88 <- Heptathlon88[ , -1, drop = FALSE]
}
boxplot(Heptathlon88$score,
horizontal = TRUE,
main = "Distribution of Total Heptathlon Scores (1988)",
col = "lightblue",
lwd = 2,
whisklty = 1,
xlab = "Total points",
cex.lab = 1.25, cex.axis = 1.2)
There are two outliers
Joyner-Kersee - strong performers
Launa - weak performer (should be removed)
bp <- boxplot.stats(Heptathlon88$score)
out_vals <- bp$out
out_rows <- which(Heptathlon88$score %in% out_vals)
cols_to_show <- c("athlete", "country", "score")
cols_to_show <- cols_to_show[cols_to_show %in% names(Heptathlon88)]
out_table <- Heptathlon88[out_rows, cols_to_show, drop = FALSE]
out_table
## athlete score
## 1 Joyner-Kersee (USA) 7291
## 25 Launa (PNG) 4566
if (!"athlete" %in% names(Heptathlon88)) {
Heptathlon88$athlete <- rownames(Heptathlon88)
rownames(Heptathlon88) <- NULL
}
if ("country" %in% names(Heptathlon88)) {
Heptathlon88_noPNG <- subset(Heptathlon88, toupper(country) != "PNG")
} else {
Heptathlon88_noPNG <- subset(Heptathlon88, !grepl("^launa$", athlete, ignore.case = TRUE))
}
panel_lowess <- function(x, y, ...) {
panel.smooth(x, y, lwd = 2, ...)
points(x, y, pch = 16, cex = 0.8)
}
num_cols <- sapply(Heptathlon88_noPNG, is.numeric)
hept_pairs <- Heptathlon88_noPNG[ , num_cols, drop = FALSE]
pairs(hept_pairs,
panel = panel_lowess,
pch = 16,
cex = 0.8,
main = "Heptathlon (1988): Events vs Total Score — PNG Removed")
Hurdles - Negative relationship between hurdles time and score. Athletes that run faster have lower hurdle times and their total scores increase, the pattern appears fairly linear. This indicates the hurdles event is a good predictor of overall performance.
Long Jump - Positive relationship with jump distance and score. Athletes with longer jump distances tend to achieve higher total scores. The pattern is tight which suggests long jump performance is strongly associated with overall success.
Javelin - Positive relationship with javelin distance and score. Javelin distance is positively associated with score, meaning athletes who throw farther generally score more points overall.
800m Run - Negative relationship between run time and score. This makes sense because a faster athlete will have lower run times which should and does correspond a score increases.
The event with the strongest linear relationship to total score is long jump.
event_cols <- setdiff(names(Heptathlon88_noPNG)[sapply(Heptathlon88_noPNG, is.numeric)], "score")
cors <- sapply(Heptathlon88_noPNG[ , event_cols], function(z)
cor(z, Heptathlon88_noPNG$score, use = "complete.obs"))
best_var <- names(which.max(abs(cors)))
best_var
## [1] "longjump"
lm_best <- lm(reformulate(best_var, response = "score"), data = Heptathlon88_noPNG)
summary(lm_best)
##
## Call:
## lm(formula = reformulate(best_var, response = "score"), data = Heptathlon88_noPNG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -329.34 -99.34 12.48 160.71 338.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -919.15 479.51 -1.917 0.0678 .
## longjump 1139.35 77.72 14.660 3.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 180.5 on 23 degrees of freedom
## Multiple R-squared: 0.9033, Adjusted R-squared: 0.8991
## F-statistic: 214.9 on 1 and 23 DF, p-value: 3.694e-13
plot(Heptathlon88_noPNG[[best_var]], Heptathlon88_noPNG$score,
pch = 16,
xlab = best_var,
ylab = "Total Score",
main = paste("Total Score vs", best_var, "(PNG Removed)"),
cex.lab = 1.2,
cex.axis = 1.2)
abline(lm_best, lwd = 2)
She under performed with an actual score of 7291 and a predicted of 7363.94
jk_row <- grep("joyner|kersee", Heptathlon88_noPNG$athlete, ignore.case = TRUE)
if (length(jk_row) > 0) {
jk <- Heptathlon88_noPNG[jk_row[1], , drop = FALSE]
# Prediction
jk_pred <- predict(lm_best, newdata = jk)
# Residual
jk_resid <- jk$score - jk_pred
# Output results
data.frame(
athlete = jk$athlete,
actual_score = jk$score,
predicted_score = round(as.numeric(jk_pred), 2),
residual = round(as.numeric(jk_resid), 2),
performance = ifelse(jk_resid > 0, "over-performed", "under-performed")
)
} else {
message("Jackie Joyner-Kersee not found in dataset.")
}
## athlete actual_score predicted_score residual performance
## 1 Joyner-Kersee (USA) 7291 7363.94 -72.94 under-performed