# Per-column summary statistics for every variable in Boston.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
`summary()` reports the minimum, first quartile, median, mean, third quartile, and maximum of each variable.
# Mean of medv within each level of the chas indicator.
Boston %>%
  group_by(chas) %>%
  summarise(avg_medv = mean(medv))
## # A tibble: 2 × 2
## chas avg_medv
## <int> <dbl>
## 1 0 22.1
## 2 1 28.4
Houses along the Charles River (chas = 1) have a higher average medv (28.4 vs. 22.1).
# The five rows with the largest crim values, highest first.
Boston %>%
  arrange(desc(crim)) %>%
  head(n = 5)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 88.9762 0 18.1 0 0.671 6.968 91.9 1.4165 24 666 20.2 396.90 17.21
## 2 73.5341 0 18.1 0 0.679 5.957 100.0 1.8026 24 666 20.2 16.45 20.62
## 3 67.9208 0 18.1 0 0.693 5.683 100.0 1.4254 24 666 20.2 384.97 22.98
## 4 51.1358 0 18.1 0 0.597 5.757 100.0 1.4130 24 666 20.2 2.60 10.11
## 5 45.7461 0 18.1 0 0.693 4.519 100.0 1.6582 24 666 20.2 88.27 36.98
## medv
## 1 10.4
## 2 8.8
## 3 5.0
## 4 15.0
## 5 7.0
These are the five areas with the highest crime rate (crim); all five also have a medv well below the dataset mean of 22.53.
# Label each row "High" when its tax is strictly above the median tax,
# and "Low" otherwise (rows exactly at the median are "Low").
Boston$tax_level <- ifelse(Boston$tax > median(Boston$tax), "High", "Low")
A new column, tax_level, is added: "High" when tax is above the median tax and "Low" otherwise.
# Keep only rows where rm exceeds 7 and medv exceeds 30.
Boston %>%
  filter(rm > 7 & medv > 30)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.02729 0.0 7.07 0 0.4690 7.185 61.1 4.9671 2 242 17.8 392.83
## 2 0.06905 0.0 2.18 0 0.4580 7.147 54.2 6.0622 3 222 18.7 396.90
## 3 0.03359 75.0 2.95 0 0.4280 7.024 15.8 5.4011 3 252 18.3 395.62
## 4 0.01311 90.0 1.22 0 0.4030 7.249 21.9 8.6966 5 226 17.9 395.93
## 5 0.01951 17.5 1.38 0 0.4161 7.104 59.5 9.2229 3 216 18.6 393.24
## 6 0.12083 0.0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 396.90
## 7 0.08187 0.0 2.89 0 0.4450 7.820 36.9 3.4952 2 276 18.0 393.53
## 8 0.06860 0.0 2.89 0 0.4450 7.416 62.5 3.4952 2 276 18.0 396.90
## 9 1.46336 0.0 19.58 0 0.6050 7.489 90.8 1.9709 5 403 14.7 374.43
## 10 1.83377 0.0 19.58 1 0.6050 7.802 98.2 2.0407 5 403 14.7 389.61
## 11 1.51902 0.0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 388.45
## 12 2.01019 0.0 19.58 0 0.6050 7.929 96.2 2.0459 5 403 14.7 369.30
## 13 0.06588 0.0 2.46 0 0.4880 7.765 83.3 2.7410 3 193 17.8 395.56
## 14 0.09103 0.0 2.46 0 0.4880 7.155 92.2 2.7006 3 193 17.8 394.12
## 15 0.05602 0.0 2.46 0 0.4880 7.831 53.6 3.1992 3 193 17.8 392.63
## 16 0.08370 45.0 3.44 0 0.4370 7.185 38.9 4.5667 5 398 15.2 396.90
## 17 0.08664 45.0 3.44 0 0.4370 7.178 26.3 6.4798 5 398 15.2 390.49
## 18 0.01381 80.0 0.46 0 0.4220 7.875 32.0 5.6484 4 255 14.4 394.23
## 19 0.04011 80.0 1.52 0 0.4040 7.287 34.1 7.3090 2 329 12.6 396.90
## 20 0.04666 80.0 1.52 0 0.4040 7.107 36.6 7.3090 2 329 12.6 354.31
## 21 0.03768 80.0 1.52 0 0.4040 7.274 38.3 7.3090 2 329 12.6 392.20
## 22 0.01778 95.0 1.47 0 0.4030 7.135 13.9 7.6534 3 402 17.0 384.30
## 23 0.02177 82.5 2.03 0 0.4150 7.610 15.7 6.2700 2 348 14.7 395.38
## 24 0.03510 95.0 2.68 0 0.4161 7.853 33.2 5.1180 4 224 14.7 392.78
## 25 0.02009 95.0 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 390.55
## 26 0.31533 0.0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 385.05
## 27 0.52693 0.0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 382.00
## 28 0.38214 0.0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 387.38
## 29 0.41238 0.0 6.20 0 0.5040 7.163 79.9 3.2157 8 307 17.4 372.08
## 30 0.29819 0.0 6.20 0 0.5040 7.686 17.0 3.3751 8 307 17.4 377.51
## 31 0.46296 0.0 6.20 0 0.5040 7.412 76.9 3.6715 8 307 17.4 376.14
## 32 0.57529 0.0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 385.91
## 33 0.33147 0.0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 378.95
## 34 0.51183 0.0 6.20 0 0.5070 7.358 71.6 4.1480 8 307 17.4 390.07
## 35 0.36894 22.0 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 396.90
## 36 0.01538 90.0 3.75 0 0.3940 7.454 34.2 6.3361 3 244 15.9 386.34
## 37 0.61154 20.0 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 389.70
## 38 0.66351 20.0 3.97 0 0.6470 7.333 100.0 1.8946 5 264 13.0 383.29
## 39 0.54011 20.0 3.97 0 0.6470 7.203 81.8 2.1121 5 264 13.0 392.80
## 40 0.53412 20.0 3.97 0 0.6470 7.520 89.4 2.1398 5 264 13.0 388.37
## 41 0.52014 20.0 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 386.86
## 42 0.82526 20.0 3.97 0 0.6470 7.327 94.5 2.0788 5 264 13.0 393.42
## 43 0.55007 20.0 3.97 0 0.6470 7.206 91.6 1.9301 5 264 13.0 387.89
## 44 0.78570 20.0 3.97 0 0.6470 7.014 84.6 2.1329 5 264 13.0 384.07
## 45 0.57834 20.0 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 384.54
## 46 0.54050 20.0 3.97 0 0.5750 7.470 52.6 2.8720 5 264 13.0 390.30
## 47 0.22188 20.0 6.96 1 0.4640 7.691 51.8 4.3665 3 223 18.6 390.77
## 48 0.10469 40.0 6.41 1 0.4470 7.267 49.0 4.7872 4 254 17.6 389.25
## 49 0.03578 20.0 3.33 0 0.4429 7.820 64.5 4.6947 5 216 14.9 387.31
## 50 0.06129 20.0 3.33 1 0.4429 7.645 49.7 5.2119 5 216 14.9 377.07
## 51 0.01501 90.0 1.21 1 0.4010 7.923 24.8 5.8850 1 198 13.6 395.52
## 52 0.00906 90.0 2.97 0 0.4000 7.088 20.8 7.3073 1 285 15.3 394.72
## 53 0.07886 80.0 4.95 0 0.4110 7.148 27.7 5.1167 4 245 19.2 396.90
## 54 0.05515 33.0 2.18 0 0.4720 7.236 41.1 4.0220 7 222 18.4 393.68
## 55 0.07503 33.0 2.18 0 0.4720 7.420 71.9 3.0992 7 222 18.4 396.90
## 56 0.01301 35.0 1.52 0 0.4420 7.241 49.3 7.0379 1 284 15.5 394.74
## 57 6.53876 0.0 18.10 1 0.6310 7.016 97.5 1.2024 24 666 20.2 392.05
## lstat medv tax_level
## 1 4.03 34.7 Low
## 2 5.33 36.2 Low
## 3 1.98 34.9 Low
## 4 4.81 35.4 Low
## 5 8.05 33.0 Low
## 6 4.21 38.7 Low
## 7 3.57 43.8 Low
## 8 6.19 33.2 Low
## 9 1.73 50.0 High
## 10 1.92 50.0 High
## 11 3.32 50.0 High
## 12 3.70 50.0 High
## 13 7.56 39.8 Low
## 14 4.82 37.9 Low
## 15 4.45 50.0 Low
## 16 5.39 34.9 High
## 17 2.87 36.4 High
## 18 2.97 50.0 Low
## 19 4.08 33.3 Low
## 20 8.61 30.3 Low
## 21 6.62 34.6 Low
## 22 4.45 32.9 High
## 23 3.11 42.3 High
## 24 3.81 48.5 Low
## 25 2.88 50.0 Low
## 26 4.14 44.8 Low
## 27 4.63 50.0 Low
## 28 3.13 37.6 Low
## 29 6.36 31.6 Low
## 30 3.92 46.7 Low
## 31 5.25 31.7 Low
## 32 2.47 41.7 Low
## 33 3.95 48.3 Low
## 34 4.73 31.5 Low
## 35 3.54 42.8 Low
## 36 3.11 44.0 Low
## 37 5.12 50.0 Low
## 38 7.79 36.0 Low
## 39 9.59 33.8 Low
## 40 7.26 43.1 Low
## 41 5.91 48.8 Low
## 42 11.25 31.0 Low
## 43 8.10 36.5 Low
## 44 14.79 30.7 Low
## 45 7.44 50.0 Low
## 46 3.16 43.5 Low
## 47 6.58 35.2 Low
## 48 6.05 33.2 Low
## 49 3.76 45.4 Low
## 50 3.01 46.0 Low
## 51 3.16 50.0 Low
## 52 7.85 32.2 Low
## 53 3.56 37.3 Low
## 54 6.93 36.1 Low
## 55 6.47 33.4 Low
## 56 5.49 32.7 Low
## 57 2.96 50.0 High
These 57 rows are the high-value areas with large homes: rm above 7 and medv above 30.
# Bar chart: number of rows at each value of chas.
ggplot(data = Boston, mapping = aes(x = factor(chas))) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Count of houses near Charles River",
    x = "Chas (1 = yes)"
  )

# Histogram of crim using 30 bins.
ggplot(data = Boston, mapping = aes(x = crim)) +
  geom_histogram(bins = 30, fill = "coral", color = "black") +
  labs(title = "Crime Rate Distribution")

# Box plot of medv for each tax_level group.
ggplot(data = Boston, mapping = aes(x = tax_level, y = medv)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Boxplot of medv by tax level")

# Scatter plot of medv against rm.
ggplot(data = Boston, mapping = aes(x = rm, y = medv)) +
  geom_point(color = "blue") +
  labs(title = "Rooms vs Median Value")
# Order the rows by medv (ascending) and attach a 1..n rank column so the
# sorted values can be drawn as a line. The x-axis is therefore a rank
# position, not a time axis.
Boston_sorted <- Boston[order(Boston$medv), ]
# seq_len() is used instead of 1:nrow(): on a zero-row frame 1:0 would
# yield c(1, 0) and the assignment would fail, whereas seq_len(0) is empty.
Boston_sorted$index <- seq_len(nrow(Boston_sorted))
ggplot(Boston_sorted, aes(x = index, y = medv)) +
  geom_line(color = "purple") +
  labs(title = "Trend of Median House Values")
# Pairwise plot matrix for crim, rm, tax, and medv.
ggpairs(Boston[c("crim", "rm", "tax", "medv")])
# Simple linear regression: medv explained by rm alone.
model_slr <- lm(formula = medv ~ rm, data = Boston)
summary(model_slr)
##
## Call:
## lm(formula = medv ~ rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.346 -2.547 0.090 2.986 39.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.671 2.650 -13.08 <2e-16 ***
## rm 9.102 0.419 21.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.616 on 504 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.4825
## F-statistic: 471.8 on 1 and 504 DF, p-value: < 2.2e-16
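Each additional room is associated with an increase of about 9.1 in medv, and rm alone explains about 48% of the variance in medv (R-squared = 0.48).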
# Multiple linear regression: medv on rm, lstat, ptratio, and crim.
model_mlr <- lm(
  formula = medv ~ rm + lstat + ptratio + crim,
  data = Boston
)
summary(model_mlr)
##
## Call:
## lm(formula = medv ~ rm + lstat + ptratio + crim, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5492 -3.1454 -0.9357 1.6894 30.1563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.92330 3.97564 4.257 2.48e-05 ***
## rm 4.61862 0.42716 10.812 < 2e-16 ***
## lstat -0.53431 0.04564 -11.708 < 2e-16 ***
## ptratio -0.88969 0.11883 -7.487 3.19e-13 ***
## crim -0.06544 0.03081 -2.124 0.0342 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.211 on 501 degrees of freedom
## Multiple R-squared: 0.6815, Adjusted R-squared: 0.6789
## F-statistic: 268 on 4 and 501 DF, p-value: < 2.2e-16
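All four predictors are statistically significant: rm has a positive association with medv, while lstat, ptratio, and crim have negative ones. Together they explain about 68% of the variance in medv (adjusted R-squared = 0.68).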
# Store the square of rm as its own column so it can enter the model as a
# separate term, then fit medv on both rm and rm2 (a quadratic in rm).
Boston[["rm2"]] <- Boston[["rm"]]^2
model_poly <- lm(formula = medv ~ rm + rm2, data = Boston)
summary(model_poly)
##
## Call:
## lm(formula = medv ~ rm + rm2, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.769 -2.752 0.619 3.003 35.464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.0588 12.1040 5.458 7.59e-08 ***
## rm -22.6433 3.7542 -6.031 3.15e-09 ***
## rm2 2.4701 0.2905 8.502 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.193 on 503 degrees of freedom
## Multiple R-squared: 0.5484, Adjusted R-squared: 0.5466
## F-statistic: 305.4 on 2 and 503 DF, p-value: < 2.2e-16
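The quadratic term rm2 is highly significant, and R-squared rises from 0.48 for the linear model to 0.55, indicating a non-linear relationship between rm and medv.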
# Binary response: 1 when medv is above 25, 0 otherwise.
Boston[["medv_bin"]] <- as.numeric(Boston[["medv"]] > 25)
# Logistic regression of that indicator on rm, lstat, and ptratio.
model_log <- glm(
  formula = medv_bin ~ rm + lstat + ptratio,
  data = Boston,
  family = "binomial"
)
summary(model_log)
##
## Call:
## glm(formula = medv_bin ~ rm + lstat + ptratio, family = "binomial",
## data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.06555 3.38355 -2.975 0.00293 **
## rm 2.44936 0.42613 5.748 9.04e-09 ***
## lstat -0.27378 0.05329 -5.137 2.79e-07 ***
## ptratio -0.24468 0.08418 -2.907 0.00365 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 563.52 on 505 degrees of freedom
## Residual deviance: 252.84 on 502 degrees of freedom
## AIC: 260.84
##
## Number of Fisher Scoring iterations: 7
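This logistic regression models the probability that medv exceeds 25. A higher rm raises the odds of a high-value area, while higher lstat and ptratio lower them; all three predictors are significant.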
# Correlation matrix of every numeric column in Boston, rounded to two
# decimals. Columns added earlier in the analysis (e.g. rm2, medv_bin) are
# numeric and are therefore included; non-numeric columns are dropped.
# vapply() guarantees a logical vector of the right length, whereas
# sapply()'s return type depends on its input.
numeric_cols <- vapply(Boston, is.numeric, logical(1))
cor_matrix <- round(cor(Boston[, numeric_cols]), 2)
cor_matrix
## crim zn indus chas nox rm age dis rad tax ptratio
## crim 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29
## zn -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39
## indus 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38
## chas -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12
## nox 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19
## rm -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36
## age 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26
## dis -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23
## rad 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46
## tax 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46
## ptratio 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
## black -0.39 0.18 -0.36 0.05 -0.38 0.13 -0.27 0.29 -0.44 -0.44 -0.18
## lstat 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
## medv -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
## rm2 -0.20 0.31 -0.38 0.10 -0.28 0.99 -0.22 0.18 -0.20 -0.28 -0.37
## medv_bin -0.19 0.35 -0.42 0.13 -0.31 0.62 -0.27 0.15 -0.23 -0.33 -0.43
## black lstat medv rm2 medv_bin
## crim -0.39 0.46 -0.39 -0.20 -0.19
## zn 0.18 -0.41 0.36 0.31 0.35
## indus -0.36 0.60 -0.48 -0.38 -0.42
## chas 0.05 -0.05 0.18 0.10 0.13
## nox -0.38 0.59 -0.43 -0.28 -0.31
## rm 0.13 -0.61 0.70 0.99 0.62
## age -0.27 0.60 -0.38 -0.22 -0.27
## dis 0.29 -0.50 0.25 0.18 0.15
## rad -0.44 0.49 -0.38 -0.20 -0.23
## tax -0.44 0.54 -0.47 -0.28 -0.33
## ptratio -0.18 0.37 -0.51 -0.37 -0.43
## black 1.00 -0.37 0.33 0.13 0.18
## lstat -0.37 1.00 -0.74 -0.60 -0.53
## medv 0.33 -0.74 1.00 0.72 0.79
## rm2 0.13 -0.60 0.72 1.00 0.64
## medv_bin 0.18 -0.53 0.79 0.64 1.00
# One-way ANOVA: does mean medv differ between the tax_level groups?
anova_model <- aov(formula = medv ~ tax_level, data = Boston)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## tax_level 1 6157 6157 84.89 <2e-16 ***
## Residuals 504 36559 73
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Mean medv differs significantly between the High and Low tax groups (F = 84.89, p < 2e-16).