### Setup: load the Boston Housing data
# Read the file through an explicit path instead of calling setwd().
# The first setwd() used to be overridden immediately by the second one, and
# it stops the script on any machine that does not have that folder.
# Changing the working directory is also a global side effect that the
# visible parts of this script never rely on.
# Adjust data_dir if BostonHousing.csv lives somewhere else.
data_dir <- "~/Downloads"
df <- read.csv(file.path(data_dir, "BostonHousing.csv"))
###1.2: Show the first 10 rows
# Display the first ten observations of the data frame.
head(x = df, n = 10L)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 12.43 22.9
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 19.15 27.1
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 29.93 16.5
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 17.10 18.9
###1.3: Summary statistics for every variable
# summary() on a data frame reports one summary per column (min, quartiles,
# median, mean and max for the numeric columns shown in the output below).
summary(object = df)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
###1.4: Mean of medv
# Stored in a variable because the histogram step marks this value with a
# vertical line.
mean_medv <- mean(x = df[["medv"]])
mean_medv
## [1] 22.53281
###1.5: Histogram of medv with the mean marked by a vertical line
hist(df$medv,
     main = "Histogram of MEDV Aka Median Home Value",
     xlab = "medv($1,000s)",
     border = "blue")
# Dashed red line at the mean of medv.
abline(v = mean_medv, col = "red", lwd = 2, lty = 2)
# bty only accepts "o" (draw a box) or "n" (no box). The previous value "y"
# is not a documented option; it only drew a box because anything other than
# "n" does. "o" states that intent explicitly and renders the same.
legend("topright",
       legend = paste0("Mean=", round(mean_medv, 2)),
       lty = 2, lwd = 2, col = "red", bty = "o")
###2.6: Create cat.medv, a 0/1 indicator for medv above 30
# medv is recorded in $1,000s, so 30 corresponds to $30,000. The logical
# comparison is converted to integer (TRUE -> 1, FALSE -> 0).
df[["cat.medv"]] <- as.integer(df[["medv"]] > 30)
###2.7: Mean of cat.medv and what it means
# cat.medv is a 0/1 indicator, so its mean is the proportion of tracts with
# medv above 30 (i.e. above $30,000).
mean_cat_medv <- mean(df$cat.medv, na.rm = TRUE)
mean_cat_medv
## [1] 0.1660079
# sprintf() controls the spacing and rounding. Passing the pieces straight to
# cat() put a separator space around every number ("is  0.166...", "16.6 %").
cat(sprintf(
  paste0(
    "The mean of cat.medv is %.4f, which means %.1f%% of the census ",
    "tracts have a median home value above $30,000.\n"
  ),
  mean_cat_medv, 100 * mean_cat_medv
))
## The mean of cat.medv is 0.1660, which means 16.6% of the census tracts have a median home value above $30,000.
###3.8: Share of river-bounding tracts (chas == 1) with medv above $30,000
# Restrict cat.medv to the chas == 1 rows, then average the 0/1 indicator.
m_chas1 <- with(df, mean(cat.medv[chas == 1], na.rm = TRUE))
m_chas1
## [1] 0.3142857
###3.9: Mean of cat.medv for tracts that do not bound the river (chas == 0)
# Fixed typo: the argument was spelled no.rm, which mean() silently swallows
# through `...`, so missing values were never dropped. na.rm = TRUE makes this
# calculation handle NAs the same way as the chas == 1 one.
m_chas0 <- mean(df$cat.medv[df$chas == 0], na.rm = TRUE)
m_chas0
## [1] 0.1549894
###3.10: Combine the two means into one vector (river first, non-river second)
comb_chas <- c(m_chas1, m_chas0)
comb_chas
## [1] 0.3142857 0.1549894
###3.11: Bar chart of the two proportions
# barplot() returns the bar midpoints; they are kept in bc so the value
# labels can be centred above each bar.
bc <- barplot(
  height = comb_chas,
  main = "Chart of Tracts with MEDV > $30k by the Charles River Boundary",
  ylab = "proportion with MEDV>30k",
  names.arg = c("chas = 1 (Bounds River)", "chas = 0 (Does Not Bound)"),
  ylim = c(0, 1),
  border = "black"
)
# pos = 3 places each rounded value just above its bar.
text(x = bc, y = comb_chas, labels = round(comb_chas, 3), pos = 3)
###3.12: What the bar chart shows
# The percentages are computed from m_chas1 and m_chas0 instead of being typed
# in by hand, and the wording refers to census tracts rather than individual
# homes, because each proportion is a share of tracts. The message now ends
# with a newline so later console output starts on its own line.
cat(sprintf(
  paste0(
    "From the bar chart I can tell that a larger share of the tracts that ",
    "bound the Charles River have a median home value above $30,000: ",
    "%.1f%% of the tracts that bound the river are above $30,000, while ",
    "only %.1f%% of the tracts that do not bound it are.\n"
  ),
  100 * m_chas1, 100 * m_chas0
))
## From the bar chart I can tell that a larger share of the tracts that bound the Charles River have a median home value above $30,000: 31.4% of the tracts that bound the river are above $30,000, while only 15.5% of the tracts that do not bound it are.
###4.13: Side-by-side boxplots of medv by Charles River indicator
# chas is converted to a factor so each value gets its own box. The factor
# levels sort as 0 then 1, which is the order the box names are given in.
boxplot(
  medv ~ as.factor(chas),
  data = df,
  main = "MEDV by Charles River",
  xlab = "Tract bounds Charles River?",
  ylab = "MEDV ($1,000s)",
  names = c("chas = 0 (No)", "chas = 1 (Yes)")
)
###5.14: Scatter plot of medv against lstat
# Small solid black points (pch = 19, cex = 0.7); lstat on the x-axis and
# medv on the y-axis.
plot(
  x = df[["lstat"]],
  y = df[["medv"]],
  main = "Scatterplot of MEDV vs LSTAT",
  xlab = "lstat (% lower socioeconomic status)",
  ylab = "medv ($1,000s)",
  col = "black",
  pch = 19,
  cex = 0.7
)
###5.15: Simple linear regression of medv on lstat
# Fit medv as a linear function of lstat and print the model summary
# (coefficients, residual standard error, R-squared, F-statistic).
fit <- lm(formula = medv ~ lstat, data = df)
summary(object = fit)
##
## Call:
## lm(formula = medv ~ lstat, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
###5.16: Add the fitted line and a legend to the scatter plot
# abline() takes the intercept and slope from the fitted model and draws the
# line onto the current plot.
abline(reg = fit, lwd = 2, col = "blue")
legend(
  x = "topright",
  legend = "Fitted line: medv ~ lstat",
  lwd = 2,
  col = "blue",
  bty = "n"
)