library(tidyverse) #to do visuals and cleanup
library(e1071) #for naive bayes
library(caret) #to create a correlation matrix
library(plotly) #to make interactive visuals
library(ggcorrplot) #to make our correlation matrix
library (dplyr)
library (forcats)
housing_new <- read_csv("housing_new.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## id = col_double(),
## crime = col_double(),
## zone = col_double(),
## industry = col_double(),
## water = col_double(),
## pollutant = col_double(),
## room = col_double(),
## home_age_relative = col_double(),
## distance_from_business = col_double(),
## highway_accessible = col_double(),
## tax_rate = col_double(),
## teacher_ratio = col_double(),
## unemployment_percentage = col_double(),
## price = col_double()
## )
#view(housing_new)
housing_cleaned = subset(housing_new, select = -c(X1,id) )
ggcorrplot(cor(housing_cleaned), p.mat = cor_pmat (housing_cleaned), hc.order = TRUE, lab = TRUE, type = "lower")
The unemployment percentage(-.81) is the strongest predictor of price, followed by the amount of rooms the house has(.62). The negative correlation between unemployment and home price suggests the intuitive idea that neighborhoods with expensive homes tend to also have high earning families with low unemployment rates.
housing_cleaned %>%
ggplot(aes(x= room))+
geom_histogram(bins = 6)+
labs(x= "Number of Rooms", y = "Number of Observations" , title = "Distribution of the Number of Rooms per House")
3 and 3.5 rooms are the most common observations for the number of rooms a house has. The histogram resembles a bell curve indicating a normal distribution.
correlation_equation_roo <- function(x, y) {
correlation_coefficient_roo <- round(cor(x, y), digits = 2)
paste("italic(r) == ", correlation_coefficient_roo)
}
roo_disp_label <- data.frame(x = 375000, y = 3.25 , label = correlation_equation_roo(housing_cleaned$price, housing_cleaned$room))
housing_new%>%
mutate (Price= housing_new$price * 100000) %>%
ggplot() +
aes(x = Price, y = room) +
geom_point() +
geom_smooth(se = FALSE, method = "lm") +
geom_text(data = roo_disp_label, aes(x = x, y = y, label = label), parse = TRUE) +
labs(X = "Price of Home", Y = "Number of Rooms", title = "Correlation Between Price and Number of Rooms")
## `geom_smooth()` using formula 'y ~ x'
There is a fairly strong correlation between the price of a home and the number of rooms it has. To be exact, the correlation here is .62.
correlation_equation_une <- function(x, y) {
correlation_coefficient_une <- round(cor(x, y), digits = 2)
paste("italic(r) == ", correlation_coefficient_une)
}
une_disp_label <- data.frame(x = 200000, y = 10.5 , label = correlation_equation_une(housing_cleaned$price, housing_cleaned$unemployment_percentage))
housing_new%>%
mutate (Price= housing_new$price * 100000) %>%
ggplot() +
aes(x = Price, y = unemployment_percentage) +
geom_point() +
geom_smooth(se = FALSE, method = "lm") +
geom_text(data = une_disp_label, aes(x = x, y = y, label = label), parse = TRUE) +
labs(x = "Price of Home", y = "% of Zipcode Currently Unemployed", title = "Correlation Between Price and Unempployment Percent")
## `geom_smooth()` using formula 'y ~ x'
Although our previous correlation was strong, the correlation between price and unemployment percent is stronger yet. With a correlation coefficient of .81 this relationship is as strong as it gets.
correlation_equation_ind <- function(x, y) {
correlation_coefficient_ind <- round(cor(x, y), digits = 2)
paste("italic(r) == ", correlation_coefficient_ind)
}
ind_disp_label <- data.frame(x = 200000, y = 25, label = correlation_equation_ind(housing_cleaned$price, housing_cleaned$industry))
housing_new%>%
mutate (Price= housing_new$price * 100000) %>%
ggplot() +
aes(x = Price, y = industry) +
geom_point() +
geom_smooth(se = FALSE, method = "lm") +
geom_text(data = ind_disp_label, aes(x = x, y = y, label = label), parse = TRUE) +
labs(x = "Price of Home", y = "Proportion of non-retial Business/ Zip", title = "Correlation Between Price and Industry")
## `geom_smooth()` using formula 'y ~ x'
The correlation of price and industry is representative of the group of variables that hover around the .5 range. These variables while strong, are not as strong as the previous relationships we lookd at.
set.seed(1)
row.names(housing_cleaned)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
## [13] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24"
## [25] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
## [37] "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48"
## [49] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
## [73] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [97] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150" "151" "152" "153" "154" "155" "156"
## [157] "157" "158" "159" "160" "161" "162" "163" "164" "165" "166" "167" "168"
## [169] "169" "170" "171" "172" "173" "174" "175" "176" "177" "178" "179" "180"
## [181] "181" "182" "183" "184" "185" "186" "187" "188" "189" "190" "191" "192"
## [193] "193" "194" "195" "196" "197" "198" "199" "200" "201" "202" "203" "204"
## [205] "205" "206" "207" "208" "209" "210" "211" "212" "213" "214" "215" "216"
## [217] "217" "218" "219" "220" "221" "222" "223" "224" "225" "226" "227" "228"
## [229] "229" "230" "231" "232" "233" "234" "235" "236" "237" "238" "239" "240"
## [241] "241" "242" "243" "244" "245" "246" "247" "248" "249" "250" "251" "252"
## [253] "253" "254" "255" "256" "257" "258" "259" "260" "261" "262" "263" "264"
## [265] "265" "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276"
## [277] "277" "278" "279" "280" "281" "282" "283" "284" "285" "286" "287" "288"
## [289] "289" "290" "291" "292" "293" "294" "295" "296" "297" "298" "299" "300"
## [301] "301" "302" "303" "304" "305" "306" "307" "308" "309" "310" "311" "312"
## [313] "313" "314" "315" "316" "317" "318" "319" "320" "321" "322" "323" "324"
## [325] "325" "326" "327" "328" "329" "330" "331" "332" "333" "334" "335" "336"
## [337] "337" "338" "339" "340" "341" "342" "343" "344" "345" "346" "347" "348"
## [349] "349" "350" "351" "352" "353" "354" "355" "356" "357" "358" "359" "360"
## [361] "361" "362" "363" "364" "365" "366" "367" "368" "369" "370" "371" "372"
## [373] "373" "374" "375" "376" "377" "378" "379" "380" "381" "382" "383" "384"
## [385] "385" "386" "387" "388" "389" "390" "391" "392" "393" "394" "395" "396"
## [397] "397" "398" "399" "400" "401" "402" "403" "404" "405" "406" "407" "408"
## [409] "409" "410" "411" "412" "413" "414" "415" "416" "417" "418" "419" "420"
## [421] "421" "422" "423" "424" "425" "426" "427" "428" "429" "430" "431" "432"
## [433] "433" "434" "435" "436" "437" "438" "439" "440" "441" "442" "443" "444"
## [445] "445" "446" "447" "448" "449" "450" "451" "452" "453" "454" "455" "456"
## [457] "457" "458" "459" "460" "461" "462" "463" "464" "465" "466" "467" "468"
## [469] "469" "470" "471" "472" "473" "474" "475" "476" "477" "478" "479" "480"
## [481] "481" "482" "483" "484" "485" "486" "487" "488" "489" "490" "491" "492"
## [493] "493" "494" "495" "496" "497" "498" "499" "500" "501" "502" "503" "504"
## [505] "505" "506"
training_set_rows <- sample(rownames(housing_cleaned), nrow(housing_cleaned)*0.6)
validation_set_rows <- sample(setdiff(rownames(housing_cleaned), training_set_rows), nrow(housing_cleaned)*0.4)
training_data <- housing_cleaned[training_set_rows, ]
validation_data <- housing_cleaned[validation_set_rows, ]
training_lm_model1 <- lm(price ~ ., data = housing_cleaned, subset = training_set_rows)
training_residuals1 <- data.frame(training_data$price, training_lm_model1$fitted.values, training_lm_model1$residuals)
head(training_residuals1)
## training_data.price training_lm_model1.fitted.values
## 505 3.091042 3.252898
## 324 2.917771 3.030806
## 167 3.912023 3.605942
## 129 2.890372 2.837452
## 418 2.341806 2.197836
## 471 2.990720 2.880636
## training_lm_model1.residuals
## 505 -0.16185599
## 324 -0.11303572
## 167 0.30608073
## 129 0.05291957
## 418 0.14396952
## 471 0.11008340
options(scipen = 999)
summary(training_lm_model1)
##
## Call:
## lm(formula = price ~ ., data = housing_cleaned, subset = training_set_rows)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64868 -0.10814 -0.01137 0.11000 0.77503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2612376 0.2603465 16.368 < 0.0000000000000002 ***
## crime -0.0127322 0.0018184 -7.002 0.0000000000176 ***
## zone 0.0011645 0.0007534 1.546 0.123250
## industry 0.0029859 0.0031845 0.938 0.349216
## water 0.1150665 0.0435203 2.644 0.008640 **
## pollutant -0.6925676 0.2063594 -3.356 0.000896 ***
## room 0.1595912 0.0430401 3.708 0.000250 ***
## home_age_relative 0.0003102 0.0007027 0.441 0.659261
## distance_from_business -0.0467319 0.0108840 -4.294 0.0000239966316 ***
## highway_accessible 0.0143584 0.0033872 4.239 0.0000302083049 ***
## tax_rate -0.0006870 0.0001895 -3.626 0.000340 ***
## teacher_ratio -0.0353876 0.0070876 -4.993 0.0000010283601 ***
## unemployment_percentage -0.1259424 0.0109335 -11.519 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1953 on 290 degrees of freedom
## Multiple R-squared: 0.7876, Adjusted R-squared: 0.7788
## F-statistic: 89.59 on 12 and 290 DF, p-value: < 0.00000000000000022
ggplot(data = training_lm_model1) +
aes(x = training_lm_model1$residuals) +
geom_histogram(bins = 10)+
labs (x= "Residuals", y="Count")
The graph above confirms the outputs and shows that the distribution is symmetric. The majority of the error are close to zero avoiding to have over or under estimations.
Model1_price <- (-0.0127322*housing_cleaned$crime[1]) +
(0.0011645*housing_cleaned$zone[1]) +
(0.0029859*housing_cleaned$industry[1]) +
(0.1150665*housing_cleaned$water[1]) +
(-0.6925676*housing_cleaned$pollutant[1]) +
(0.1595912*housing_cleaned$room[1]) +
(0.0003102*housing_cleaned$home_age_relative[1]) +
(-0.0467319*housing_cleaned$distance_from_business[1]) +
(0.0143584*housing_cleaned$highway_accessible[1]) +
(-0.0006870*housing_cleaned$tax_rate[1]) +
(-0.0353876*housing_cleaned$teacher_ratio[1]) +
(-0.1259424*housing_cleaned$unemployment_percentage[1])+
4.2612376
Model1_price * 100000
## [1] 341685.3
prediction_off_by <- 0.1953/4.2612376
prediction_off_by
## [1] 0.04583176
The above result shows that predictions will be off by 4%. Another observation is that the R-square measure tells that the relationship between the predictors accounts for 79% of the variation.
prediction_ml_model1 <- predict(training_lm_model1, newdata = validation_data)
validation_residuals1 <- data.frame(validation_data$price, prediction_ml_model1, residuals = validation_data$price - prediction_ml_model1)
head(validation_residuals1)
## validation_data.price prediction_ml_model1 residuals
## 1 1.840550 2.370480 -0.529930497
## 2 3.117950 3.270353 -0.152403177
## 3 3.081910 3.026990 0.054919706
## 4 2.949688 2.681568 0.268119892
## 5 3.314186 2.788545 0.525640741
## 6 2.928524 2.920141 0.008382301
The table above shows the predicted values (fitted values), and the residuals (prediction errors). Based on these results, we are creating the following graph:
ggplot(data = validation_residuals1) +
aes(x = residuals) +
geom_histogram(bins = 10)+
labs(X="Residuals", y="Count", title = "Validation Residuals")
Comparing the training and validation data, there is less error with predictions when using the training data as opposed to using the validation data. There is a greater concentration of residuals of near 0 for the training data set as opposed to the validation data set. Despite that, they both perform rather similarly based on the histograms.
Some variables were not statistically significant such as zone, industry and home _age _relative. We are removing these variables to improve our model. We are creating a data frame that contains the home price from the new predictive model, along with the residuals. These residuals are showing the difference between the observations and predictions on a row by row basis.
training_lm_model2 <- lm(`price` ~ crime + water + pollutant + room + distance_from_business + highway_accessible + tax_rate + teacher_ratio + unemployment_percentage, data = housing_cleaned, subset = training_set_rows)
training_residuals2 <- data.frame(training_data$price, training_lm_model2$fitted.values, training_lm_model2$residuals)
head(training_residuals2)
## training_data.price training_lm_model2.fitted.values
## 505 3.091042 3.239371
## 324 2.917771 3.038557
## 167 3.912023 3.596462
## 129 2.890372 2.809753
## 418 2.341806 2.198668
## 471 2.990720 2.876940
## training_lm_model2.residuals
## 505 -0.14832853
## 324 -0.12078605
## 167 0.31556150
## 129 0.08061829
## 418 0.14313774
## 471 0.11377961
options(scipen = 999)
summary(training_lm_model2)
##
## Call:
## lm(formula = price ~ crime + water + pollutant + room + distance_from_business +
## highway_accessible + tax_rate + teacher_ratio + unemployment_percentage,
## data = housing_cleaned, subset = training_set_rows)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64484 -0.11011 -0.01171 0.10811 0.77612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2608330 0.2596419 16.410 < 0.0000000000000002 ***
## crime -0.0126165 0.0018073 -6.981 0.0000000000197 ***
## water 0.1211285 0.0433022 2.797 0.005495 **
## pollutant -0.6498922 0.1942717 -3.345 0.000929 ***
## room 0.1688652 0.0417699 4.043 0.0000675871400 ***
## distance_from_business -0.0437858 0.0086274 -5.075 0.0000006889514 ***
## highway_accessible 0.0131083 0.0032410 4.045 0.0000671058115 ***
## tax_rate -0.0005656 0.0001687 -3.353 0.000903 ***
## teacher_ratio -0.0378030 0.0066488 -5.686 0.0000000314966 ***
## unemployment_percentage -0.1228098 0.0101595 -12.088 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1953 on 293 degrees of freedom
## Multiple R-squared: 0.7853, Adjusted R-squared: 0.7787
## F-statistic: 119.1 on 9 and 293 DF, p-value: < 0.00000000000000022
Analyzing the table above, we see that the f-statistic has increased from 89.59 to 119.1. Aside from that, everything else remains mostly unchanged. This shows that the second model is better since it has a better relationship between the predictors and response variable than the first model. Based on this, the equation for the second model is as follows:
Model2_price <- (-0.0126165*housing_cleaned$crime[1]) +
(0.1211285*housing_cleaned$water[1]) +
(-0.6498922*housing_cleaned$pollutant[1]) +
( 0.1688652*housing_cleaned$room[1]) +
(-0.0437858*housing_cleaned$distance_from_business[1]) +
( 0.0131083*housing_cleaned$highway_accessible[1]) +
(-0.0005656*housing_cleaned$tax_rate[1]) +
(-0.0378030*housing_cleaned$teacher_ratio[1]) +
(-0.1228098*housing_cleaned$unemployment_percentage[1])+
4.2608330
Model2_price * 100000
## [1] 343746.2
training_lm_model11 <- lm(`pollutant` ~ ., data = housing_cleaned, subset = training_set_rows)
training_residuals11 <- data.frame(training_data$pollutant, training_lm_model11$fitted.values, training_lm_model11$residuals)
head(training_residuals11)
## training_data.pollutant training_lm_model11.fitted.values
## 505 0.573 0.5327212
## 324 0.493 0.5038849
## 167 0.605 0.6167978
## 129 0.624 0.6167520
## 418 0.679 0.6851179
## 471 0.580 0.6443924
## training_lm_model11.residuals
## 505 0.040278813
## 324 -0.010884885
## 167 -0.011797799
## 129 0.007247973
## 418 -0.006117908
## 471 -0.064392364
options(scipen = 999)
summary(training_lm_model11)
##
## Call:
## lm(formula = pollutant ~ ., data = housing_cleaned, subset = training_set_rows)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.114497 -0.031904 -0.008015 0.028600 0.200732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.90174808 0.08579063 10.511 < 0.0000000000000002
## crime -0.00104107 0.00054551 -1.908 0.057323
## zone -0.00005275 0.00021117 -0.250 0.802923
## industry 0.00380402 0.00086196 4.413 0.00001437994
## water 0.01630551 0.01225875 1.330 0.184527
## room -0.01216307 0.01227724 -0.991 0.322658
## home_age_relative 0.00074438 0.00019133 3.891 0.000124
## distance_from_business -0.01771545 0.00295611 -5.993 0.00000000611
## highway_accessible 0.00286541 0.00095990 2.985 0.003076
## tax_rate 0.00006897 0.00005393 1.279 0.201997
## teacher_ratio -0.01175169 0.00194320 -6.048 0.00000000452
## unemployment_percentage -0.00241808 0.00368255 -0.657 0.511938
## price -0.05398427 0.01608530 -3.356 0.000896
##
## (Intercept) ***
## crime .
## zone
## industry ***
## water
## room
## home_age_relative ***
## distance_from_business ***
## highway_accessible **
## tax_rate
## teacher_ratio ***
## unemployment_percentage
## price ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05452 on 290 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.7781
## F-statistic: 89.27 on 12 and 290 DF, p-value: < 0.00000000000000022
ggplot(data = training_lm_model11) +
aes(x = training_lm_model11$residuals) +
geom_histogram(bins = 10)+
labs (x= "Residuals", y="Count")
The graph above confirms the outputs and shows that the distribution is symmetric. The majority of the error are close to zero avoiding to have over or under estimations.
pollutant <- (-0.00104107*housing_cleaned$crime[1]) +
(-0.00005275*housing_cleaned$zone[1]) +
(0.00380402*housing_cleaned$industry[1]) +
(0.01630551*housing_cleaned$water[1]) +
(-0.01216307*housing_cleaned$room[1]) +
(0.00074438*housing_cleaned$home_age_relative[1]) + (-0.01771545*housing_cleaned$distance_from_business[1]) + (0.00286541*housing_cleaned$highway_accessible[1]) +
(0.00006897*housing_cleaned$tax_rate[1]) +
(0.00006897*housing_cleaned$tax_rate[1]) +
(-0.01175169*housing_cleaned$teacher_ratio[1]) + (-0.00241808*housing_cleaned$unemployment_percentage[1]) +
(-0.05398427*housing_cleaned$price[1]) + 0.90174808
pollutant
## [1] 0.5324053
prediction_off_by <- 0.05452/0.90174808
prediction_off_by
## [1] 0.06046034
The above result shows that predictions will be off by 6%. Another observation is that the R-square measure tells that the relationship between the predictors accounts for 79% of the variation.
prediction_ml_model11 <- predict(training_lm_model11, newdata = validation_data)
validation_residuals11 <- data.frame(validation_data$pollutant, prediction_ml_model11, residuals= validation_data$pollutant- prediction_ml_model11)
head(validation_residuals11)
## validation_data.pollutant prediction_ml_model11 residuals
## 1 0.693 0.7149598 -0.02195980
## 2 0.510 0.5493664 -0.03936637
## 3 0.585 0.5448343 0.04016573
## 4 0.580 0.6263445 -0.04634454
## 5 0.597 0.6479213 -0.05092131
## 6 0.547 0.5983617 -0.05136167
The table above shows the predicted values (fitted values), and the residuals (prediction errors). Based on these results, we are creating the following graph:
ggplot(data = validation_residuals11) +
aes(x = residuals) +
geom_histogram(bins = 10)+
labs( y="Count", x="Residuals", title = "Validation Residuals")
Comparing the training and validation data, there is less error with predictions when using the training data as opposed to using the validation data. There is a greater concentration of residuals of near 0 for the training data set as opposed to the validation data set. Despite that, they both perform rather similarly based on the histograms.
Some variables were not statistically significant such as crime, zone,water, room,tax_rate and unemployment_percentage. We are removing these variables to improve our model. We are creating a data frame that contains the pollutant values from the new predictive model, along with the residuals. These residuals are showing the difference between the observations and predictions on a row by row basis.
training_lm_model22 <- lm(`pollutant` ~ industry + home_age_relative + distance_from_business + highway_accessible + teacher_ratio + price, data = housing_cleaned, subset = training_set_rows)
training_residuals22 <- data.frame(training_data$pollutant, training_lm_model22$fitted.values, training_lm_model22$residuals)
head(training_residuals22)
## training_data.pollutant training_lm_model22.fitted.values
## 505 0.573 0.5390572
## 324 0.493 0.5026784
## 167 0.605 0.6323244
## 129 0.624 0.6186247
## 418 0.679 0.6948076
## 471 0.580 0.6393825
## training_lm_model22.residuals
## 505 0.033942784
## 324 -0.009678370
## 167 -0.027324422
## 129 0.005375342
## 418 -0.015807567
## 471 -0.059382527
options(scipen = 999)
summary(training_lm_model22)
##
## Call:
## lm(formula = pollutant ~ industry + home_age_relative + distance_from_business +
## highway_accessible + teacher_ratio + price, data = housing_cleaned,
## subset = training_set_rows)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.109313 -0.032184 -0.009453 0.029812 0.198024
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8200819 0.0599676 13.675 < 0.0000000000000002 ***
## industry 0.0046967 0.0007473 6.285 0.000000001171 ***
## home_age_relative 0.0007337 0.0001816 4.041 0.000067900727 ***
## distance_from_business -0.0169912 0.0026475 -6.418 0.000000000548 ***
## highway_accessible 0.0031753 0.0004837 6.564 0.000000000234 ***
## teacher_ratio -0.0110066 0.0018192 -6.050 0.000000004359 ***
## price -0.0433574 0.0100884 -4.298 0.000023440508 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05483 on 296 degrees of freedom
## Multiple R-squared: 0.7801, Adjusted R-squared: 0.7756
## F-statistic: 175 on 6 and 296 DF, p-value: < 0.00000000000000022
Analyzing the table above, we see that the f-statistic has increased from 89.27 to 175. Aside from that, everything else remains mostly unchanged. This shows that the second model is better since it has a better relationship between the predictors and response variable than the first model. Based on this, the equation for the second model is as follows:
Model22_pollutant <- (0.00380402*housing_cleaned$industry[1]) +
(0.00074438 *housing_cleaned$home_age_relative[1])+
(-0.01771545*housing_cleaned$distance_from_business[1]) +
(0.00286541*housing_cleaned$highway_accessible[1]) +
(-0.01175169*housing_cleaned$teacher_ratio[1]) +
(-0.05398427*housing_cleaned$price[1])+
0.90174808
Model22_pollutant
## [1] 0.5381124