#setwd("~/Downloads/R Studio Downloads/BostonHousing.csv")
# Q1. Load the housing data from the CSV file (relative path) into a data frame
boston <- read.csv(file = "BostonHousing.csv")
#Problem 2 ##
# Q2. Show the first 10 rows of the data frame
head(x = boston, n = 10)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 12.43 22.9
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 19.15 27.1
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 29.93 16.5
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 17.10 18.9
#Problem 3 ##
# Q3. Descriptive statistics for every column of the data frame
summary(object = boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
#Problem 4 ##
# Q4. Mean of medv: store it once (it is reused for the histogram line and
#     legend), then print the stored value
mean_medv <- mean(boston[["medv"]])
mean_medv
## [1] 22.53281
#Problem 5 ##
# Q5. Histogram of medv with a dashed vertical line marking the mean
hist(
  boston$medv,
  col = "lightblue",
  border = "black",
  main = "Histogram of Median Home Values (medv)",
  xlab = "Median Value ($1000s)"
)
# Dashed red reference line at the mean computed in Q4
abline(v = mean_medv, lty = 2, lwd = 2, col = "red")
# Legend entry showing the mean rounded to 2 decimals, drawn with the same
# line style as the reference line
legend(
  x = "topright",
  legend = paste("Mean =", round(mean_medv, digits = 2)),
  lty = 2,
  lwd = 2,
  col = "red"
)
# Q6. Indicator column: 1 when medv is above 30, 0 otherwise
boston[["cat.medv"]] <- ifelse(test = boston[["medv"]] > 30, yes = 1, no = 0)
# Q7. Mean of the 0/1 indicator, stored and then printed
mean_cat_medv <- mean(boston[["cat.medv"]])
mean_cat_medv
## [1] 0.1660079
## cat.medv is 1 when a tract's median home value (medv) exceeds $30,000 and 0 otherwise, so its mean is the proportion of rows in the dataset whose median home value exceeds $30,000.
#Task 3: Bar Chart ##
#Task 3: Bar Chart
# Q8. Mean of cat.medv for tracts next to the Charles River (chas == 1),
#     i.e. the share of those tracts with medv above 30
mean_chas1 <- mean(boston$cat.medv[boston$chas == 1], na.rm = TRUE)
# Q9. Mean of cat.medv for tracts not next to the Charles River (chas == 0)
mean_chas0 <- mean(boston$cat.medv[boston$chas == 0], na.rm = TRUE)
# Q10. Vector of the two means, in the order the bars are drawn (chas=1, chas=0)
means_vector <- c(mean_chas1, mean_chas0)
# Q11. Bar chart of the two proportions; the y-axis is fixed to [0, 1] because
#      the values are proportions
bp <- barplot(
  means_vector,
  names.arg = c(
    "Bounds Charles River (chas=1)",
    "Doesn't Bound Charles River (chas=0)"
  ),
  col = c("blue", "pink"),
  ylim = c(0, 1),
  main = "Proportion of tracts with medv > $30k by Proximity to River",
  ylab = "Mean of cat.medv"
)
# barplot() returns the bar midpoints; use them to print each rounded value
# slightly above its bar
text(x = bp, y = means_vector + 0.05, labels = round(means_vector, 2))
# TODO: discuss the findings from the bar chart.
#Task 4, Box Plot ##
# Q13. Boxplot of medv by Charles River proximity
# The formula interface draws one box per chas value in sorted order
# (chas = 0 first, then chas = 1), so the labels and colours below are listed
# in that same order: orange = chas 0, purple = chas 1.
boxplot(
  medv ~ chas,
  data = boston,
  col = c("orange", "purple"),
  names = c(
    "Doesn't Bound Charles River (chas=0)",
    "Bounds Charles River (chas=1)"
  ),
  main = "Boxplot of Median Value (medv) by Charles River Proximity",
  ylab = "Median Value of Homes ($1000s)"
)
#Task 5, Scatter Plot ##
# Task 5: Scatter Plot
# Q15. Scatter plot of rm (x-axis) vs medv (y-axis)
# The title and x-axis label name rm, the variable actually plotted.
plot(
  boston$rm, boston$medv,
  main = "MEDV vs RM",
  xlab = "Average Number of Rooms per Dwelling (rm)",
  ylab = "Median Value of Homes (medv)",
  pch = 19,
  col = "blue"
)
# Q16. Add regression line: fit the simple linear regression of medv on rm and
#      print its summary; the fitted line is drawn on the plot with abline()
model <- lm(medv ~ rm, data = boston)
summary(model)
##
## Call:
## lm(formula = medv ~ rm, data = boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.346 -2.547 0.090 2.986 39.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.671 2.650 -13.08 <2e-16 ***
## rm 9.102 0.419 21.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.616 on 504 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.4825
## F-statistic: 471.8 on 1 and 504 DF, p-value: < 2.2e-16
# Overlay the fitted regression line on the scatter plot
abline(reg = model, lwd = 2, col = "red")