Andrew Stein - HW4 Data 180

Question 1a

library(MASS)

## Warning: package 'MASS' was built under R version 4.5.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data("Boston")

Boston_Chas <- Boston %>%
  filter(chas == 1) %>%
  select(dis, age, indus, nox, medv)

head(Boston_Chas)

##      dis   age indus   nox medv
## 1 1.3216 100.0 19.58 0.871 13.4
## 2 1.6102  88.0 19.58 0.871 15.3
## 3 1.7494  96.0 19.58 0.871 17.0
## 4 1.7455  82.6 19.58 0.871 15.6
## 5 1.7984  92.6 19.58 0.605 27.0
## 6 2.0407  98.2 19.58 0.605 50.0

Question 1b

with(Boston_Chas, {
  plot(indus, nox,
       pch = 16, cex = 0.8,
       xlab = "Industrial land (percent of acres)",
       ylab = "NOx concentration (per 10 million)",
       main = "NOx vs. Industrial Land (Charles River Tracts)",
       cex.axis = 1.2, cex.lab = 1.2,
       ylim = c(0, 1))
  rug(indus, side = 1)
  rug(nox, side = 2)
})

Question 1c

What is misleading with the rug plot for the indus variable? Explain with a full sentence.

It is misleading because there is overlap in the ticks. So it seems like there are less values at a given value than there are. Jittering allows this overlap to be displayed.

set.seed(180)
indus_j <- jitter(Boston_Chas$indus, amount = 0.2)
nox_j <- jitter(Boston_Chas$nox, amount = 0.025)

plot(indus_j, nox_j,
     pch = 16, cex = 0.8,
     xlab = "Industrial land (percent of acres, jittered)",
     ylab = "NOx concentration (per 10 million, jittered)",
     main = "NOx vs. Industrial Land (Jittered)",
     cex.axis = 1.2, cex.lab = 1.2,
     ylim = c(0, 1))
rug(indus_j, side = 1)
rug(nox_j, side = 2)

Question 1d

What is the relationship between the median value in the suburb and the two predictor variables? I’m not suggesting it is a simple relationship; what can you say about the more valuable suburbs (in general)?

The high value suburbs(ones with a large radius) tend to be closer to Boston and have newer housing.

op <- par(mar = c(5.1, 4.1, 4.1, 3.1)) # set before plotting

x <- Boston_Chas$dis
y <- Boston_Chas$age
r <- Boston_Chas$medv

plot(x, y,
  xlab = "Distance to Boston employment centers (index units)",
  ylab = "Owner-occupied units built prior to 1940 (%)",
  main = "Bubble Plot: Value vs. Distance & Age (Charles River Tracts)",
  cex.axis = 1.2, cex.lab = 1.2)

symbols(x, y,
  circles = r, # medv as radius per instructions
  inches = 0.3, # scale control (per spec)
  bg = "lightblue",
  lwd = 2,
  add = TRUE)

Question 2a

lm_rm <- lm(medv ~ rm, data = Boston)
summary(lm_rm)

## 
## Call:
## lm(formula = medv ~ rm, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.346  -2.547   0.090   2.986  39.433 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -34.671      2.650  -13.08   <2e-16 ***
## rm             9.102      0.419   21.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.616 on 504 degrees of freedom
## Multiple R-squared:  0.4835, Adjusted R-squared:  0.4825 
## F-statistic: 471.8 on 1 and 504 DF,  p-value: < 2.2e-16

Question 2b

plot(Boston$rm, Boston$medv,
pch = 16,
xlab = "Rooms per dwelling (rm)",
ylab = "Median home value (medv, $1000s)",
main = "medv vs rm with Fitted Line",
cex.axis = 1.2, cex.lab = 1.2)
abline(lm_rm, lwd = 2)

Question 2c

The slope of the regression line is around 9.1 per additional room. The number of rooms per dwelling increases with the median value.

coef(lm_rm)[2]

##       rm 
## 9.102109

Question 2d

The R² value is around .48 which indicates the variation in from the line that exists in the graph. Values closer to 1 imply a better fit so this graph doesn’t have the best fit but it’s not to bad.

summary(lm_rm)$r.squared

## [1] 0.4835255

Question 2e

The predicted median value would be around 19.94.

predict(lm_rm, newdata = data.frame(rm = 6))

##        1 
## 19.94203

Question 2f

plot(Boston$rm, resid(lm_rm),
pch = 16,
xlab = "Rooms per dwelling (rm)",
ylab = "Residuals",
main = "Residuals vs rm",
cex.axis = 1.2, cex.lab = 1.2)
abline(h = 0, lwd = 2)

Question 2g

The residuals are centered for the most part and there is no strong curvature. There are a few outliers, so the linear model is mostly reasonable, but there are definitely some irregularities.

Question 3a

List the athlete(s) and countries printed above. Briefly note if any should get special treatment (e.g., true exceptional performance vs. data entry error).

Heptathlon88 <- read.csv("heptathlon.csv", stringsAsFactors = FALSE)

if (ncol(Heptathlon88) >= 2) {
  Heptathlon88$athlete <- Heptathlon88[[1]]
  Heptathlon88 <- Heptathlon88[ , -1, drop = FALSE]
}

boxplot(Heptathlon88$score,
        horizontal = TRUE,
        main = "Distribution of Total Heptathlon Scores (1988)",
        col = "lightblue",
        lwd = 2,
        whisklty = 1,
        xlab = "Total points",
        cex.lab = 1.25, cex.axis = 1.2)

There are two outliers

Joyner-Kersee - strong performers
Launa - weak performer (should be removed)

bp <- boxplot.stats(Heptathlon88$score)
out_vals <- bp$out
out_rows <- which(Heptathlon88$score %in% out_vals)

cols_to_show <- c("athlete", "country", "score")
cols_to_show <- cols_to_show[cols_to_show %in% names(Heptathlon88)]
out_table <- Heptathlon88[out_rows, cols_to_show, drop = FALSE]
out_table

##                athlete score
## 1  Joyner-Kersee (USA)  7291
## 25         Launa (PNG)  4566

Question 3b

if (!"athlete" %in% names(Heptathlon88)) {
  Heptathlon88$athlete <- rownames(Heptathlon88)
  rownames(Heptathlon88) <- NULL
}

if ("country" %in% names(Heptathlon88)) {
  Heptathlon88_noPNG <- subset(Heptathlon88, toupper(country) != "PNG")
} else {
  Heptathlon88_noPNG <- subset(Heptathlon88, !grepl("^launa$", athlete, ignore.case = TRUE))
}

panel_lowess <- function(x, y, ...) {
  panel.smooth(x, y, lwd = 2, ...)
  points(x, y, pch = 16, cex = 0.8)
}

num_cols <- sapply(Heptathlon88_noPNG, is.numeric)
hept_pairs <- Heptathlon88_noPNG[ , num_cols, drop = FALSE]

pairs(hept_pairs,
      panel = panel_lowess,
      pch = 16,               
      cex = 0.8,
      main = "Heptathlon (1988): Events vs Total Score — PNG Removed")

Question 3c

Hurdles - Negative relationship between hurdles time and score. Athletes that run faster have lower hurdle times and their total scores increase, the pattern appears fairly linear. This indicates the hurdles event is a good predictor of overall performance.

Long Jump - Positive relationship with jump distance and score. Athletes with longer jump distances tend to achieve higher total scores. The pattern is tight which suggests long jump performance is strongly associated with overall success.

Javelin - Positive relationship with javelin distance and score. Javelin distance is positively associated with score, meaning athletes who throw farther generally score more points overall.

800m Run - Negative relationship between run time and score. This makes sense because a faster athlete will have lower run times which should and does correspond a score increases.

Question 3d i

The event with the strongest linear relationship to total score is long jump.

event_cols <- setdiff(names(Heptathlon88_noPNG)[sapply(Heptathlon88_noPNG, is.numeric)], "score")
cors <- sapply(Heptathlon88_noPNG[ , event_cols], function(z)
  cor(z, Heptathlon88_noPNG$score, use = "complete.obs"))
best_var <- names(which.max(abs(cors)))
best_var

## [1] "longjump"

lm_best <- lm(reformulate(best_var, response = "score"), data = Heptathlon88_noPNG)
summary(lm_best)

## 
## Call:
## lm(formula = reformulate(best_var, response = "score"), data = Heptathlon88_noPNG)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -329.34  -99.34   12.48  160.71  338.20 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -919.15     479.51  -1.917   0.0678 .  
## longjump     1139.35      77.72  14.660 3.69e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 180.5 on 23 degrees of freedom
## Multiple R-squared:  0.9033, Adjusted R-squared:  0.8991 
## F-statistic: 214.9 on 1 and 23 DF,  p-value: 3.694e-13

Question 3d ii

plot(Heptathlon88_noPNG[[best_var]], Heptathlon88_noPNG$score,
     pch = 16,
     xlab = best_var,
     ylab = "Total Score",
     main = paste("Total Score vs", best_var, "(PNG Removed)"),
     cex.lab = 1.2,
     cex.axis = 1.2)

abline(lm_best, lwd = 2)

Question 3d iii

She under performed with an actual score of 7291 and a predicted of 7363.94

jk_row <- grep("joyner|kersee", Heptathlon88_noPNG$athlete, ignore.case = TRUE)

if (length(jk_row) > 0) {
  jk <- Heptathlon88_noPNG[jk_row[1], , drop = FALSE]

  # Prediction
  jk_pred <- predict(lm_best, newdata = jk)
  
  # Residual
  jk_resid <- jk$score - jk_pred
  
  # Output results
  data.frame(
    athlete = jk$athlete,
    actual_score = jk$score,
    predicted_score = round(as.numeric(jk_pred), 2),
    residual = round(as.numeric(jk_resid), 2),
    performance = ifelse(jk_resid > 0, "over-performed", "under-performed")
  )
} else {
  message("Jackie Joyner-Kersee not found in dataset.")
}

##               athlete actual_score predicted_score residual     performance
## 1 Joyner-Kersee (USA)         7291         7363.94   -72.94 under-performed