#A Dataset Overview
library(MASS)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
dim(Boston)
## [1] 506  14
# ?Boston
#B Pairwise Scatter Plots
pairs(Boston)

Boston$chas <- factor(Boston$chas)
predictor_df <- data.frame(response = character(), 
                             varname = character(), 
                             R2 = numeric(), 
                             R2_log = numeric(), 
                             R2_quad = numeric(), 
                             max_R2 = numeric(), 
                             stringsAsFactors = F)
best_predictor <- function(data, response) {

}

for (i in setdiff(1:ncol(Boston), 4)) { # excluding 'chas' as it uses different eval metric
  response <- names(Boston)[i]
  predictor_info <- best_predictor(Boston, names(Boston)[i])[1, -2]
  predictor_df <- rbind(predictor_df, cbind(response, predictor_info))
}
best_predictor(Boston, "chas")[1, ]
## NULL
g1 <- ggplot(Boston, aes(x = tax, y = rad)) + 
  geom_jitter(alpha = 0.1) + 
  geom_smooth(method = "lm", formula = "y ~ x + I(x^2)") + 
  scale_x_continuous(labels = scales::comma_format()) +
  labs(title = "Tax vs Rad - Jitter Plot", 
       x = "Tax", 
       y = "Rad")

g2 <- Boston %>%
  filter(rad < 20) %>%
  ggplot(aes(x = tax, y = rad)) + 
  geom_jitter(alpha = 0.1) + 
  geom_smooth(method = "lm", formula = "y ~ x + I(x^2)") + 
  scale_x_continuous(labels = scales::comma_format()) +
  labs(title = "Tax vs Rad - Jitter Plot", 
       subtitle = "Filtered for rad < 20 (132/506 obs. removed)",
       x = "Tax", 
       y = "Rad")

grid.arrange(g1, g2, ncol = 2)

Boston_2 <- Boston %>%
  filter(rad < 20)

summary(lm(rad ~ tax + I(tax^2), data = Boston_2))
## 
## Call:
## lm(formula = rad ~ tax + I(tax^2), data = Boston_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0642 -0.7413 -0.0235  0.7961  3.6424 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.778e-02  8.026e-01  -0.122    0.903    
## tax          2.280e-02  4.284e-03   5.323 1.78e-07 ***
## I(tax^2)    -2.504e-05  5.484e-06  -4.567 6.75e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.565 on 371 degrees of freedom
## Multiple R-squared:  0.08678,    Adjusted R-squared:  0.08185 
## F-statistic: 17.63 on 2 and 371 DF,  p-value: 4.863e-08
ggplot(Boston, aes(x = nox, y = dis)) + 
  geom_point(alpha = 0.2) + 
  geom_smooth(method = "lm", formula = "y ~ x + I(x^2)") + 
    labs(title = "Nox vs Dis - Scatter Plot", 
         subtitle = "Quadratic fit",
         x = "Nox",
         y = "Dis")

#C Predictors of crim
best_predictor(Boston, "crim")
## NULL
ggplot(Boston, aes(x = medv, y = crim)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm", formula = "y ~ x + I(x^2)", col = "green") +
    labs(title = "Medv vs Crim - Scatter Plot", 
       x = "Medv", 
       y = "Crim")

ggplot(Boston, aes(x = dis, y = crim)) + 
  geom_point(alpha = 0.5) + 
    geom_smooth(method = "lm", formula = "y ~ x + I(x^2)", col = "green") +
    labs(title = "Dis vs Crim - Scatter Plot", 
       x = "Dis", 
       y = "Crim")

ggplot(Boston, aes(x = lstat, y = crim)) + 
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = "y ~ x + I(x^2)", col = "green") +
  labs(title = "Lstat vs Crim - Scatter Plot",
       x = "Lstat", 
       y = "Crim")

#D High Crime, Tax Rate & Pupil-Teacher Ratio Suburbs
ggplot(Boston, aes(x = "", y = crim)) + 
  geom_boxplot() + 
  scale_y_continuous(breaks = seq(0, 100, 10)) + 
  labs(y = "Crim", 
       title = "Crim - Boxplot") + 
  coord_flip() + 
  theme(axis.title.y = element_blank(), 
        axis.ticks.y = element_blank())

range(Boston$crim)
## [1]  0.00632 88.97620
range(Boston$tax)
## [1] 187 711
ggplot(Boston, aes(x = tax)) + 
  geom_histogram(binwidth = 10, fill = "mediumseagreen") + 
  scale_y_continuous(breaks = seq(0, 200, 20)) +
  labs(title = "Tax - Histogram", 
       x = "Tax", 
       y = "Count")

range(Boston$ptratio)
## [1] 12.6 22.0
ggplot(Boston, aes(x = ptratio)) + 
  geom_histogram(binwidth = 0.25, fill = "deepskyblue") + 
  scale_x_continuous(breaks = seq(12, 23, 0.5)) + 
  scale_y_continuous(breaks = seq(0, 200, 20)) + 
  labs(title = "Ptratio - Histogram", 
       x = "Ptratio", 
       y = "Count")

Boston %>%
  group_by(ptratio) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 46 × 2
## # Groups:   ptratio [46]
##    ptratio     n
##      <dbl> <int>
##  1    20.2   140
##  2    14.7    34
##  3    21      27
##  4    17.8    23
##  5    19.2    19
##  6    17.4    18
##  7    18.6    17
##  8    19.1    17
##  9    16.6    16
## 10    18.4    16
## # ℹ 36 more rows
Boston %>%
  group_by(ptratio, rad, tax, zn, indus) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 78 × 6
## # Groups:   ptratio, rad, tax, zn, indus [78]
##    ptratio   rad   tax    zn indus     n
##      <dbl> <int> <dbl> <dbl> <dbl> <int>
##  1    20.2    24   666     0 18.1    132
##  2    14.7     5   403     0 19.6     30
##  3    21       4   307     0  8.14    22
##  4    17.4     8   307     0  6.2     18
##  5    21.2     4   437     0 21.9     15
##  6    13       5   264    20  3.97    12
##  7    18.4     4   304     0  9.9     12
##  8    18.6     4   277     0 10.6     11
##  9    20.9     5   384     0  8.56    11
## 10    19.1     7   330    22  5.86    10
## # ℹ 68 more rows
Boston %>%
  group_by(ptratio, rad, tax, zn, indus, black) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 430 × 7
## # Groups:   ptratio, rad, tax, zn, indus, black [430]
##    ptratio   rad   tax    zn indus black     n
##      <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
##  1    20.2    24   666     0 18.1   397.    29
##  2    19.2     6   391     0  9.69  397.     6
##  3    19.6     5   287     0  7.38  397.     6
##  4    21.2     4   437     0 21.9   397.     5
##  5    16       4   289     0 13.9   397.     4
##  6    17.9     3   233     0  6.91  397.     4
##  7    20.2     5   224     0  5.19  397.     4
##  8    14.7     5   403     0 19.6   397.     3
##  9    17.6     4   254    40  6.41  397.     3
## 10    17.8     3   193     0  2.46  397.     3
## # ℹ 420 more rows
#E Charles River
#How many of the suburbs in this data set bound the Charles river?

table(Boston$chas)
## 
##   0   1 
## 471  35
#F Median ptratio
#What is the median pupil-teacher ratio among the towns in this data set?
median(Boston$ptratio)
## [1] 19.05
#G Lowest medv Suburbs
#Which suburb of Boston has lowest median value of owneroccupied homes? What are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors? Comment on your findings.
Boston[Boston$medv == min(Boston$medv), ]
##        crim zn indus chas   nox    rm age    dis rad tax ptratio  black lstat
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 396.90 30.59
## 406 67.9208  0  18.1    0 0.693 5.683 100 1.4254  24 666    20.2 384.97 22.98
##     medv
## 399    5
## 406    5
Boston_percentiles <- sapply(Boston[ ,-4], function(x) rank(x)/length(x)) %>%
  as.data.frame()

Boston_percentiles[c(399, 406),]
##          crim        zn     indus       nox        rm      age        dis
## 399 0.9881423 0.3685771 0.7579051 0.8448617 0.0770751 0.958498 0.05731225
## 406 0.9960474 0.3685771 0.7579051 0.8448617 0.1363636 0.958498 0.04150198
##           rad       tax   ptratio     black     lstat        medv
## 399 0.8705534 0.8606719 0.7519763 0.8814229 0.9782609 0.002964427
## 406 0.8705534 0.8606719 0.7519763 0.3498024 0.8992095 0.002964427
#H Many Rooms Per Dwelling: rm > 8
#In this data set, how many of the suburbs average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the suburbs that average more than eight rooms per dwelling
#More than seven rooms per dwelling:
sum(Boston$rm > 7)
## [1] 64
#More than eight rooms per dwelling:
Boston_gt_8rooms <- Boston[Boston$rm > 8, ]

nrow(Boston_gt_8rooms)
## [1] 13
#2/13 were located near the Charles river:
prop.table(table(Boston_gt_8rooms$chas))
## 
##         0         1 
## 0.8461538 0.1538462
Boston_gt_8rooms_perc <- Boston_percentiles[as.numeric(rownames(Boston_gt_8rooms)), ]

glimpse(Boston_gt_8rooms_perc)
## Rows: 13
## Columns: 13
## $ crim    <dbl> 0.34387352, 0.69367589, 0.03557312, 0.52766798, 0.58893281, 0.…
## $ zn      <dbl> 0.3685771, 0.3685771, 0.9950593, 0.3685771, 0.3685771, 0.36857…
## $ indus   <dbl> 0.09683794, 0.91798419, 0.08992095, 0.34090909, 0.34090909, 0.…
## $ nox     <dbl> 0.21541502, 0.68873518, 0.08893281, 0.38833992, 0.38833992, 0.…
## $ rm      <dbl> 0.9802372, 0.9920949, 0.9762846, 0.9861660, 0.9980237, 0.97826…
## $ age     <dbl> 0.48418972, 0.74604743, 0.14130435, 0.50988142, 0.55830040, 0.…
## $ dis     <dbl> 0.5454545, 0.2766798, 0.7480237, 0.4634387, 0.4634387, 0.50296…
## $ rad     <dbl> 0.06422925, 0.49407115, 0.27173913, 0.71640316, 0.71640316, 0.…
## $ tax     <dbl> 0.21541502, 0.63932806, 0.07806324, 0.41600791, 0.41600791, 0.…
## $ ptratio <dbl> 0.37154150, 0.06818182, 0.06818182, 0.26778656, 0.26778656, 0.…
## $ black   <dbl> 0.8814229, 0.4100791, 0.4624506, 0.3537549, 0.3201581, 0.39328…
## $ lstat   <dbl> 0.075098814, 0.035573123, 0.011857708, 0.071146245, 0.09881422…
## $ medv    <dbl> 0.9367589, 0.9851779, 0.9851779, 0.9565217, 0.9851779, 0.93280…
#Taking the mean of each column to simplify:
sapply(Boston_gt_8rooms_perc, mean)
##       crim         zn      indus        nox         rm        age        dis 
## 0.53815749 0.54651870 0.33612040 0.47514442 0.98814229 0.48008513 0.47202797 
##        rad        tax    ptratio      black      lstat       medv 
## 0.57236242 0.37473396 0.24407115 0.43987534 0.09007297 0.93227425