#A Dataset Overview
library(MASS)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
dim(Boston)
## [1] 506 14
# ?Boston
#B Pairwise Scatter Plots
pairs(Boston)

Boston$chas <- factor(Boston$chas)
predictor_df <- data.frame(response = character(),
varname = character(),
R2 = numeric(),
R2_log = numeric(),
R2_quad = numeric(),
max_R2 = numeric(),
stringsAsFactors = F)
best_predictor <- function(data, response) {
}
for (i in setdiff(1:ncol(Boston), 4)) { # excluding 'chas' as it uses different eval metric
response <- names(Boston)[i]
predictor_info <- best_predictor(Boston, names(Boston)[i])[1, -2]
predictor_df <- rbind(predictor_df, cbind(response, predictor_info))
}
best_predictor(Boston, "chas")[1, ]
## NULL
g1 <- ggplot(Boston, aes(x = tax, y = rad)) +
geom_jitter(alpha = 0.1) +
geom_smooth(method = "lm", formula = "y ~ x + I(x^2)") +
scale_x_continuous(labels = scales::comma_format()) +
labs(title = "Tax vs Rad - Jitter Plot",
x = "Tax",
y = "Rad")
g2 <- Boston %>%
filter(rad < 20) %>%
ggplot(aes(x = tax, y = rad)) +
geom_jitter(alpha = 0.1) +
geom_smooth(method = "lm", formula = "y ~ x + I(x^2)") +
scale_x_continuous(labels = scales::comma_format()) +
labs(title = "Tax vs Rad - Jitter Plot",
subtitle = "Filtered for rad < 20 (132/506 obs. removed)",
x = "Tax",
y = "Rad")
grid.arrange(g1, g2, ncol = 2)

Boston_2 <- Boston %>%
filter(rad < 20)
summary(lm(rad ~ tax + I(tax^2), data = Boston_2))
##
## Call:
## lm(formula = rad ~ tax + I(tax^2), data = Boston_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0642 -0.7413 -0.0235 0.7961 3.6424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.778e-02 8.026e-01 -0.122 0.903
## tax 2.280e-02 4.284e-03 5.323 1.78e-07 ***
## I(tax^2) -2.504e-05 5.484e-06 -4.567 6.75e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.565 on 371 degrees of freedom
## Multiple R-squared: 0.08678, Adjusted R-squared: 0.08185
## F-statistic: 17.63 on 2 and 371 DF, p-value: 4.863e-08
ggplot(Boston, aes(x = nox, y = dis)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", formula = "y ~ x + I(x^2)") +
labs(title = "Nox vs Dis - Scatter Plot",
subtitle = "Quadratic fit",
x = "Nox",
y = "Dis")

#C Predictors of crim
best_predictor(Boston, "crim")
## NULL
ggplot(Boston, aes(x = medv, y = crim)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", formula = "y ~ x + I(x^2)", col = "green") +
labs(title = "Medv vs Crim - Scatter Plot",
x = "Medv",
y = "Crim")

ggplot(Boston, aes(x = dis, y = crim)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", formula = "y ~ x + I(x^2)", col = "green") +
labs(title = "Dis vs Crim - Scatter Plot",
x = "Dis",
y = "Crim")

ggplot(Boston, aes(x = lstat, y = crim)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", formula = "y ~ x + I(x^2)", col = "green") +
labs(title = "Lstat vs Crim - Scatter Plot",
x = "Lstat",
y = "Crim")

#D High Crime, Tax Rate & Pupil-Teacher Ratio Suburbs
ggplot(Boston, aes(x = "", y = crim)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 100, 10)) +
labs(y = "Crim",
title = "Crim - Boxplot") +
coord_flip() +
theme(axis.title.y = element_blank(),
axis.ticks.y = element_blank())

range(Boston$crim)
## [1] 0.00632 88.97620
range(Boston$tax)
## [1] 187 711
ggplot(Boston, aes(x = tax)) +
geom_histogram(binwidth = 10, fill = "mediumseagreen") +
scale_y_continuous(breaks = seq(0, 200, 20)) +
labs(title = "Tax - Histogram",
x = "Tax",
y = "Count")

range(Boston$ptratio)
## [1] 12.6 22.0
ggplot(Boston, aes(x = ptratio)) +
geom_histogram(binwidth = 0.25, fill = "deepskyblue") +
scale_x_continuous(breaks = seq(12, 23, 0.5)) +
scale_y_continuous(breaks = seq(0, 200, 20)) +
labs(title = "Ptratio - Histogram",
x = "Ptratio",
y = "Count")

Boston %>%
group_by(ptratio) %>%
count() %>%
arrange(desc(n))
## # A tibble: 46 × 2
## # Groups: ptratio [46]
## ptratio n
## <dbl> <int>
## 1 20.2 140
## 2 14.7 34
## 3 21 27
## 4 17.8 23
## 5 19.2 19
## 6 17.4 18
## 7 18.6 17
## 8 19.1 17
## 9 16.6 16
## 10 18.4 16
## # ℹ 36 more rows
Boston %>%
group_by(ptratio, rad, tax, zn, indus) %>%
count() %>%
arrange(desc(n))
## # A tibble: 78 × 6
## # Groups: ptratio, rad, tax, zn, indus [78]
## ptratio rad tax zn indus n
## <dbl> <int> <dbl> <dbl> <dbl> <int>
## 1 20.2 24 666 0 18.1 132
## 2 14.7 5 403 0 19.6 30
## 3 21 4 307 0 8.14 22
## 4 17.4 8 307 0 6.2 18
## 5 21.2 4 437 0 21.9 15
## 6 13 5 264 20 3.97 12
## 7 18.4 4 304 0 9.9 12
## 8 18.6 4 277 0 10.6 11
## 9 20.9 5 384 0 8.56 11
## 10 19.1 7 330 22 5.86 10
## # ℹ 68 more rows
Boston %>%
group_by(ptratio, rad, tax, zn, indus, black) %>%
count() %>%
arrange(desc(n))
## # A tibble: 430 × 7
## # Groups: ptratio, rad, tax, zn, indus, black [430]
## ptratio rad tax zn indus black n
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 20.2 24 666 0 18.1 397. 29
## 2 19.2 6 391 0 9.69 397. 6
## 3 19.6 5 287 0 7.38 397. 6
## 4 21.2 4 437 0 21.9 397. 5
## 5 16 4 289 0 13.9 397. 4
## 6 17.9 3 233 0 6.91 397. 4
## 7 20.2 5 224 0 5.19 397. 4
## 8 14.7 5 403 0 19.6 397. 3
## 9 17.6 4 254 40 6.41 397. 3
## 10 17.8 3 193 0 2.46 397. 3
## # ℹ 420 more rows
#E Charles River
#How many of the suburbs in this data set bound the Charles river?
table(Boston$chas)
##
## 0 1
## 471 35
#F Median ptratio
#What is the median pupil-teacher ratio among the towns in this data set?
median(Boston$ptratio)
## [1] 19.05
#G Lowest medv Suburbs
#Which suburb of Boston has lowest median value of owneroccupied homes? What are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors? Comment on your findings.
Boston[Boston$medv == min(Boston$medv), ]
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 396.90 30.59
## 406 67.9208 0 18.1 0 0.693 5.683 100 1.4254 24 666 20.2 384.97 22.98
## medv
## 399 5
## 406 5
Boston_percentiles <- sapply(Boston[ ,-4], function(x) rank(x)/length(x)) %>%
as.data.frame()
Boston_percentiles[c(399, 406),]
## crim zn indus nox rm age dis
## 399 0.9881423 0.3685771 0.7579051 0.8448617 0.0770751 0.958498 0.05731225
## 406 0.9960474 0.3685771 0.7579051 0.8448617 0.1363636 0.958498 0.04150198
## rad tax ptratio black lstat medv
## 399 0.8705534 0.8606719 0.7519763 0.8814229 0.9782609 0.002964427
## 406 0.8705534 0.8606719 0.7519763 0.3498024 0.8992095 0.002964427
#H Many Rooms Per Dwelling: rm > 8
#In this data set, how many of the suburbs average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the suburbs that average more than eight rooms per dwelling
#More than seven rooms per dwelling:
sum(Boston$rm > 7)
## [1] 64
#More than eight rooms per dwelling:
Boston_gt_8rooms <- Boston[Boston$rm > 8, ]
nrow(Boston_gt_8rooms)
## [1] 13
#2/13 were located near the Charles river:
prop.table(table(Boston_gt_8rooms$chas))
##
## 0 1
## 0.8461538 0.1538462
Boston_gt_8rooms_perc <- Boston_percentiles[as.numeric(rownames(Boston_gt_8rooms)), ]
glimpse(Boston_gt_8rooms_perc)
## Rows: 13
## Columns: 13
## $ crim <dbl> 0.34387352, 0.69367589, 0.03557312, 0.52766798, 0.58893281, 0.…
## $ zn <dbl> 0.3685771, 0.3685771, 0.9950593, 0.3685771, 0.3685771, 0.36857…
## $ indus <dbl> 0.09683794, 0.91798419, 0.08992095, 0.34090909, 0.34090909, 0.…
## $ nox <dbl> 0.21541502, 0.68873518, 0.08893281, 0.38833992, 0.38833992, 0.…
## $ rm <dbl> 0.9802372, 0.9920949, 0.9762846, 0.9861660, 0.9980237, 0.97826…
## $ age <dbl> 0.48418972, 0.74604743, 0.14130435, 0.50988142, 0.55830040, 0.…
## $ dis <dbl> 0.5454545, 0.2766798, 0.7480237, 0.4634387, 0.4634387, 0.50296…
## $ rad <dbl> 0.06422925, 0.49407115, 0.27173913, 0.71640316, 0.71640316, 0.…
## $ tax <dbl> 0.21541502, 0.63932806, 0.07806324, 0.41600791, 0.41600791, 0.…
## $ ptratio <dbl> 0.37154150, 0.06818182, 0.06818182, 0.26778656, 0.26778656, 0.…
## $ black <dbl> 0.8814229, 0.4100791, 0.4624506, 0.3537549, 0.3201581, 0.39328…
## $ lstat <dbl> 0.075098814, 0.035573123, 0.011857708, 0.071146245, 0.09881422…
## $ medv <dbl> 0.9367589, 0.9851779, 0.9851779, 0.9565217, 0.9851779, 0.93280…
#Taking the mean of each column to simplify:
sapply(Boston_gt_8rooms_perc, mean)
## crim zn indus nox rm age dis
## 0.53815749 0.54651870 0.33612040 0.47514442 0.98814229 0.48008513 0.47202797
## rad tax ptratio black lstat medv
## 0.57236242 0.37473396 0.24407115 0.43987534 0.09007297 0.93227425