Loading the libraries

library(tidyverse) #to do visuals and cleanup
library(e1071) #for naive bayes
library(caret) #for data partitioning and model evaluation
library(plotly) #to make interactive visuals
library(ggcorrplot) #to make our correlation matrix
library(dplyr) #for data manipulation (also attached with the tidyverse)
library(forcats) #for working with factors

Loading the dataset

housing_new <- read_csv("housing_new.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   id = col_double(),
##   crime = col_double(),
##   zone = col_double(),
##   industry = col_double(),
##   water = col_double(),
##   pollutant = col_double(),
##   room = col_double(),
##   home_age_relative = col_double(),
##   distance_from_business = col_double(),
##   highway_accessible = col_double(),
##   tax_rate = col_double(),
##   teacher_ratio = col_double(),
##   unemployment_percentage = col_double(),
##   price = col_double()
## )
#view(housing_new)

Cleaning the data

housing_cleaned = subset(housing_new, select = -c(X1, id))
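
As a side note, the same cleanup can be written with dplyr, which is already attached above; a minimal equivalent sketch:

housing_cleaned <- housing_new %>%
  select(-X1, -id) #drops the same two index columns as the subset() call above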

Creating the correlation matrix to visualize the relationships between variables

ggcorrplot(cor(housing_cleaned), p.mat = cor_pmat(housing_cleaned), hc.order = TRUE, lab = TRUE, type = "lower")

The unemployment percentage (-0.81) is the strongest predictor of price, followed by the number of rooms the house has (0.62). The negative correlation between unemployment and home price fits the intuitive idea that neighborhoods with expensive homes tend to have high-earning families and low unemployment rates.
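
For reference, the exact correlations with price can also be read directly from the correlation matrix rather than from the heat map; a one-line check:

sort(cor(housing_cleaned)[, "price"]) #correlation of every variable with price, in ascending order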

Analyzing the distribution of the number of rooms per house.

housing_cleaned %>%
  ggplot(aes(x= room))+
  geom_histogram(bins = 6)+
  labs(x= "Number of Rooms", y = "Number of Observations" , title = "Distribution of the Number of Rooms per House")

Values of 3 and 3.5 rooms are the most common observations for the number of rooms a house has. The histogram resembles a bell curve, indicating an approximately normal distribution.
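
A quick numeric summary gives the center and spread behind the histogram; for example:

summary(housing_cleaned$room) #minimum, quartiles, median, mean, and maximum of the number of rooms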

correlation_equation_roo <- function(x, y) {
  
  correlation_coefficient_roo <- round(cor(x, y), digits = 2)
  paste("italic(r) == ", correlation_coefficient_roo)
}

roo_disp_label <- data.frame(x = 375000, y = 3.25 , label = correlation_equation_roo(housing_cleaned$price, housing_cleaned$room))

housing_new %>%
  mutate(Price = price * 100000) %>%
  ggplot() +
  aes(x = Price, y = room) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  geom_text(data = roo_disp_label, aes(x = x, y = y, label = label), parse = TRUE) +
  labs(x = "Price of Home", y = "Number of Rooms", title = "Correlation Between Price and Number of Rooms")
## `geom_smooth()` using formula 'y ~ x'

There is a fairly strong correlation between the price of a home and the number of rooms it has; the correlation coefficient here is 0.62.

correlation_equation_une <- function(x, y) {
  
  correlation_coefficient_une <- round(cor(x, y), digits = 2)
  paste("italic(r) == ", correlation_coefficient_une)
}

une_disp_label <- data.frame(x = 200000, y = 10.5 , label = correlation_equation_une(housing_cleaned$price, housing_cleaned$unemployment_percentage))

housing_new %>%
  mutate(Price = price * 100000) %>%
  ggplot() +
  aes(x = Price, y = unemployment_percentage) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  geom_text(data = une_disp_label, aes(x = x, y = y, label = label), parse = TRUE) +
  labs(x = "Price of Home", y = "% of Zipcode Currently Unemployed", title = "Correlation Between Price and Unemployment Percent")
## `geom_smooth()` using formula 'y ~ x'

Although our previous correlation was strong, the correlation between price and unemployment percentage is stronger still. With a correlation coefficient of -0.81, this is the strongest relationship between price and any predictor in the dataset.

correlation_equation_ind <- function(x, y) {
  
  correlation_coefficient_ind <- round(cor(x, y), digits = 2)
  paste("italic(r) == ", correlation_coefficient_ind)
}

ind_disp_label <- data.frame(x = 200000, y = 25, label = correlation_equation_ind(housing_cleaned$price, housing_cleaned$industry))
                               
                               
housing_new %>%
  mutate(Price = price * 100000) %>%
  ggplot() +
  aes(x = Price, y = industry) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  geom_text(data = ind_disp_label, aes(x = x, y = y, label = label), parse = TRUE) +
  labs(x = "Price of Home", y = "Proportion of Non-Retail Business per Zipcode", title = "Correlation Between Price and Industry")
## `geom_smooth()` using formula 'y ~ x'

The correlation between price and industry is representative of the group of variables that hover around the 0.5 range. While these relationships are moderately strong, they are not as strong as the previous relationships we looked at.

Partitioning the data into a training and validation set

set.seed(1)
head(row.names(housing_cleaned))
## [1] "1" "2" "3" "4" "5" "6"
training_set_rows <- sample(rownames(housing_cleaned), nrow(housing_cleaned)*0.6)
validation_set_rows <- sample(setdiff(rownames(housing_cleaned), training_set_rows), nrow(housing_cleaned)*0.4)

training_data <- housing_cleaned[training_set_rows, ] 
validation_data <- housing_cleaned[validation_set_rows, ]
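
As an aside, the caret package loaded earlier offers a one-line alternative for this kind of split; a sketch of a comparable 60/40 partition (note that createDataPartition stratifies on the outcome, so the rows selected will differ from the simple random sample above):

train_index <- createDataPartition(housing_cleaned$price, p = 0.6, list = FALSE) #indices for 60% of the rows
training_data_alt <- housing_cleaned[train_index, ] #hypothetical alternative training set
validation_data_alt <- housing_cleaned[-train_index, ] #remaining 40% as the validation set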

Creating multiple linear regression models to predict price

First model using all of the predictor variables

training_lm_model1 <- lm(price ~ ., data = housing_cleaned, subset = training_set_rows)
training_residuals1 <- data.frame(training_data$price, training_lm_model1$fitted.values, training_lm_model1$residuals)

head(training_residuals1)
##     training_data.price training_lm_model1.fitted.values
## 505            3.091042                         3.252898
## 324            2.917771                         3.030806
## 167            3.912023                         3.605942
## 129            2.890372                         2.837452
## 418            2.341806                         2.197836
## 471            2.990720                         2.880636
##     training_lm_model1.residuals
## 505                  -0.16185599
## 324                  -0.11303572
## 167                   0.30608073
## 129                   0.05291957
## 418                   0.14396952
## 471                   0.11008340

Summary of the training data - Model 1

options(scipen = 999)
summary(training_lm_model1)
## 
## Call:
## lm(formula = price ~ ., data = housing_cleaned, subset = training_set_rows)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64868 -0.10814 -0.01137  0.11000  0.77503 
## 
## Coefficients:
##                           Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)              4.2612376  0.2603465  16.368 < 0.0000000000000002 ***
## crime                   -0.0127322  0.0018184  -7.002      0.0000000000176 ***
## zone                     0.0011645  0.0007534   1.546             0.123250    
## industry                 0.0029859  0.0031845   0.938             0.349216    
## water                    0.1150665  0.0435203   2.644             0.008640 ** 
## pollutant               -0.6925676  0.2063594  -3.356             0.000896 ***
## room                     0.1595912  0.0430401   3.708             0.000250 ***
## home_age_relative        0.0003102  0.0007027   0.441             0.659261    
## distance_from_business  -0.0467319  0.0108840  -4.294      0.0000239966316 ***
## highway_accessible       0.0143584  0.0033872   4.239      0.0000302083049 ***
## tax_rate                -0.0006870  0.0001895  -3.626             0.000340 ***
## teacher_ratio           -0.0353876  0.0070876  -4.993      0.0000010283601 ***
## unemployment_percentage -0.1259424  0.0109335 -11.519 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1953 on 290 degrees of freedom
## Multiple R-squared:  0.7876, Adjusted R-squared:  0.7788 
## F-statistic: 89.59 on 12 and 290 DF,  p-value: < 0.00000000000000022

Zone, industry, and home_age_relative are the only predictors for which we fail to reject the null hypothesis; their coefficients are not statistically significant.

Interpreting the residuals of the fitted values from the training data.

ggplot(data = training_lm_model1) +
  aes(x = training_lm_model1$residuals) + 
  geom_histogram(bins = 10) +
  labs(x = "Residuals", y = "Count")

The histogram above confirms the output and shows that the distribution of residuals is roughly symmetric. The majority of the errors are close to zero, so the model is not systematically over- or under-estimating prices.
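
As an additional check on that symmetry, a normal quantile-quantile plot of the same residuals can be drawn; a minimal sketch (plot not shown):

ggplot(data = training_residuals1) +
  aes(sample = training_lm_model1.residuals) +
  stat_qq() + #sample quantiles of the residuals against theoretical normal quantiles
  stat_qq_line() + #reference line for a perfectly normal distribution
  labs(x = "Theoretical Quantiles", y = "Residual Quantiles", title = "Q-Q Plot of Training Residuals")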

Applying the multiple linear regression formula to the first observation.

Model1_price <- (-0.0127322*housing_cleaned$crime[1]) +
(0.0011645*housing_cleaned$zone[1]) + 
(0.0029859*housing_cleaned$industry[1]) + 
(0.1150665*housing_cleaned$water[1]) + 
(-0.6925676*housing_cleaned$pollutant[1]) +
(0.1595912*housing_cleaned$room[1]) +
(0.0003102*housing_cleaned$home_age_relative[1]) +
(-0.0467319*housing_cleaned$distance_from_business[1]) + 
(0.0143584*housing_cleaned$highway_accessible[1]) + 
(-0.0006870*housing_cleaned$tax_rate[1]) + 
(-0.0353876*housing_cleaned$teacher_ratio[1]) + 
(-0.1259424*housing_cleaned$unemployment_percentage[1])+ 
4.2612376 

Model1_price * 100000
## [1] 341685.3
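
Typing the rounded coefficients by hand works, but the same prediction can be produced directly from the fitted model with predict(); a minimal sketch, which should match the hand calculation up to rounding:

predict(training_lm_model1, newdata = housing_cleaned[1, ]) * 100000 #predicted price of the first house, in dollars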

Calculating the percentage error for any of the predictions.

prediction_off_by <- 0.1953/4.2612376 
prediction_off_by
## [1] 0.04583176

The result above suggests that predictions will typically be off by about 4.6% (the residual standard error relative to the intercept). The R-squared measure also shows that the predictors together account for roughly 79% of the variation in price.

Interpreting the output values with the validation data - Model 1

prediction_ml_model1 <- predict(training_lm_model1, newdata = validation_data)

validation_residuals1 <- data.frame(validation_data$price, prediction_ml_model1, residuals = validation_data$price - prediction_ml_model1)

head(validation_residuals1)
##   validation_data.price prediction_ml_model1    residuals
## 1              1.840550             2.370480 -0.529930497
## 2              3.117950             3.270353 -0.152403177
## 3              3.081910             3.026990  0.054919706
## 4              2.949688             2.681568  0.268119892
## 5              3.314186             2.788545  0.525640741
## 6              2.928524             2.920141  0.008382301

The table above shows the predicted values (fitted values), and the residuals (prediction errors). Based on these results, we are creating the following graph:

ggplot(data = validation_residuals1) +
  aes(x = residuals) + 
  geom_histogram(bins = 10)+
  labs(x = "Residuals", y = "Count", title = "Validation Residuals")

Comparing the training and validation data, there is less prediction error on the training data than on the validation data: the residuals are more concentrated near 0 for the training set. Despite that, the two sets perform rather similarly based on the histograms.
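
To put a number on that comparison, the root mean squared error of the first model can be computed on each data set; a short sketch (values not shown here):

sqrt(mean(training_lm_model1$residuals^2)) #RMSE on the training data
sqrt(mean(validation_residuals1$residuals^2)) #RMSE on the validation data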

Second model, making a change to our predictor variables

Some variables were not statistically significant, namely zone, industry, and home_age_relative. We are removing these variables to improve our model. We are creating a data frame that contains the home price from the new predictive model, along with the residuals. These residuals show the difference between the observations and the predictions on a row-by-row basis.

training_lm_model2 <- lm(`price` ~ crime + water + pollutant + room + distance_from_business + highway_accessible + tax_rate + teacher_ratio + unemployment_percentage, data = housing_cleaned, subset = training_set_rows)  

training_residuals2 <- data.frame(training_data$price, training_lm_model2$fitted.values, training_lm_model2$residuals) 

head(training_residuals2)
##     training_data.price training_lm_model2.fitted.values
## 505            3.091042                         3.239371
## 324            2.917771                         3.038557
## 167            3.912023                         3.596462
## 129            2.890372                         2.809753
## 418            2.341806                         2.198668
## 471            2.990720                         2.876940
##     training_lm_model2.residuals
## 505                  -0.14832853
## 324                  -0.12078605
## 167                   0.31556150
## 129                   0.08061829
## 418                   0.14313774
## 471                   0.11377961

Summary of Model 2

options(scipen = 999)
summary(training_lm_model2)
## 
## Call:
## lm(formula = price ~ crime + water + pollutant + room + distance_from_business + 
##     highway_accessible + tax_rate + teacher_ratio + unemployment_percentage, 
##     data = housing_cleaned, subset = training_set_rows)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64484 -0.11011 -0.01171  0.10811  0.77612 
## 
## Coefficients:
##                           Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)              4.2608330  0.2596419  16.410 < 0.0000000000000002 ***
## crime                   -0.0126165  0.0018073  -6.981      0.0000000000197 ***
## water                    0.1211285  0.0433022   2.797             0.005495 ** 
## pollutant               -0.6498922  0.1942717  -3.345             0.000929 ***
## room                     0.1688652  0.0417699   4.043      0.0000675871400 ***
## distance_from_business  -0.0437858  0.0086274  -5.075      0.0000006889514 ***
## highway_accessible       0.0131083  0.0032410   4.045      0.0000671058115 ***
## tax_rate                -0.0005656  0.0001687  -3.353             0.000903 ***
## teacher_ratio           -0.0378030  0.0066488  -5.686      0.0000000314966 ***
## unemployment_percentage -0.1228098  0.0101595 -12.088 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1953 on 293 degrees of freedom
## Multiple R-squared:  0.7853, Adjusted R-squared:  0.7787 
## F-statistic: 119.1 on 9 and 293 DF,  p-value: < 0.00000000000000022

Analyzing the table above, we see that the F-statistic has increased from 89.59 to 119.1, while everything else remains mostly unchanged. This suggests the second model is better: it achieves essentially the same fit with fewer predictors, which is reflected in the larger F-statistic. Based on this, the equation for the second model is as follows:

Model2_price <- (-0.0126165*housing_cleaned$crime[1]) +
(0.1211285*housing_cleaned$water[1]) + 
(-0.6498922*housing_cleaned$pollutant[1]) +
( 0.1688652*housing_cleaned$room[1]) +
(-0.0437858*housing_cleaned$distance_from_business[1]) + 
( 0.0131083*housing_cleaned$highway_accessible[1]) + 
(-0.0005656*housing_cleaned$tax_rate[1]) + 
(-0.0378030*housing_cleaned$teacher_ratio[1]) + 
(-0.1228098*housing_cleaned$unemployment_percentage[1])+ 
 4.2608330

Model2_price * 100000
## [1] 343746.2
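
A complementary way to compare the two price models is their out-of-sample error on the validation set; a short sketch (values not shown here), reusing the model 1 predictions from earlier and introducing a prediction_ml_model2 object for model 2:

prediction_ml_model2 <- predict(training_lm_model2, newdata = validation_data)

sqrt(mean((validation_data$price - prediction_ml_model1)^2)) #validation RMSE, model 1
sqrt(mean((validation_data$price - prediction_ml_model2)^2)) #validation RMSE, model 2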

Creating multiple linear regression models to predict pollutant.

First model using all of the predictor variables.

training_lm_model11 <- lm(`pollutant` ~ ., data = housing_cleaned, subset = training_set_rows)

training_residuals11 <- data.frame(training_data$pollutant, training_lm_model11$fitted.values, training_lm_model11$residuals)


head(training_residuals11)
##     training_data.pollutant training_lm_model11.fitted.values
## 505                   0.573                         0.5327212
## 324                   0.493                         0.5038849
## 167                   0.605                         0.6167978
## 129                   0.624                         0.6167520
## 418                   0.679                         0.6851179
## 471                   0.580                         0.6443924
##     training_lm_model11.residuals
## 505                   0.040278813
## 324                  -0.010884885
## 167                  -0.011797799
## 129                   0.007247973
## 418                  -0.006117908
## 471                  -0.064392364
options(scipen = 999)
summary(training_lm_model11)
## 
## Call:
## lm(formula = pollutant ~ ., data = housing_cleaned, subset = training_set_rows)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.114497 -0.031904 -0.008015  0.028600  0.200732 
## 
## Coefficients:
##                            Estimate  Std. Error t value             Pr(>|t|)
## (Intercept)              0.90174808  0.08579063  10.511 < 0.0000000000000002
## crime                   -0.00104107  0.00054551  -1.908             0.057323
## zone                    -0.00005275  0.00021117  -0.250             0.802923
## industry                 0.00380402  0.00086196   4.413        0.00001437994
## water                    0.01630551  0.01225875   1.330             0.184527
## room                    -0.01216307  0.01227724  -0.991             0.322658
## home_age_relative        0.00074438  0.00019133   3.891             0.000124
## distance_from_business  -0.01771545  0.00295611  -5.993        0.00000000611
## highway_accessible       0.00286541  0.00095990   2.985             0.003076
## tax_rate                 0.00006897  0.00005393   1.279             0.201997
## teacher_ratio           -0.01175169  0.00194320  -6.048        0.00000000452
## unemployment_percentage -0.00241808  0.00368255  -0.657             0.511938
## price                   -0.05398427  0.01608530  -3.356             0.000896
##                            
## (Intercept)             ***
## crime                   .  
## zone                       
## industry                ***
## water                      
## room                       
## home_age_relative       ***
## distance_from_business  ***
## highway_accessible      ** 
## tax_rate                   
## teacher_ratio           ***
## unemployment_percentage    
## price                   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05452 on 290 degrees of freedom
## Multiple R-squared:  0.787,  Adjusted R-squared:  0.7781 
## F-statistic: 89.27 on 12 and 290 DF,  p-value: < 0.00000000000000022

Zone, water, room, tax_rate, and unemployment_percentage are predictors for which we fail to reject the null hypothesis (crime is also only marginally significant, with p = 0.057).

Interpreting the residuals of the fitted values from the training data.

ggplot(data = training_lm_model11) +
  aes(x = training_lm_model11$residuals) + 
  geom_histogram(bins = 10) +
  labs(x = "Residuals", y = "Count")

The histogram above confirms the output and shows that the distribution of residuals is roughly symmetric. The majority of the errors are close to zero, so the model is not systematically over- or under-estimating the pollutant level.

Applying the multiple linear regression formula to the first observation.

pollutant <- (-0.00104107*housing_cleaned$crime[1]) + 
  (-0.00005275*housing_cleaned$zone[1]) +
  (0.00380402*housing_cleaned$industry[1]) + 
  (0.01630551*housing_cleaned$water[1]) + 
  (-0.01216307*housing_cleaned$room[1]) +
  (0.00074438*housing_cleaned$home_age_relative[1]) +
  (-0.01771545*housing_cleaned$distance_from_business[1]) +
  (0.00286541*housing_cleaned$highway_accessible[1]) + 
  (0.00006897*housing_cleaned$tax_rate[1]) + 
  (-0.01175169*housing_cleaned$teacher_ratio[1]) +
  (-0.00241808*housing_cleaned$unemployment_percentage[1]) + 
  (-0.05398427*housing_cleaned$price[1]) + 0.90174808

pollutant
## [1] 0.5324053

Calculating percentage error for any of the predictions.

prediction_off_by <- 0.05452/0.90174808  
prediction_off_by
## [1] 0.06046034

The result above suggests that predictions will typically be off by about 6% (the residual standard error relative to the intercept). The R-squared measure also shows that the predictors together account for roughly 79% of the variation in the pollutant level.

Interpreting the output values with the validation data - Model 1

prediction_ml_model11 <- predict(training_lm_model11, newdata = validation_data) 

validation_residuals11 <- data.frame(validation_data$pollutant, prediction_ml_model11, residuals= validation_data$pollutant- prediction_ml_model11) 

head(validation_residuals11)
##   validation_data.pollutant prediction_ml_model11   residuals
## 1                     0.693             0.7149598 -0.02195980
## 2                     0.510             0.5493664 -0.03936637
## 3                     0.585             0.5448343  0.04016573
## 4                     0.580             0.6263445 -0.04634454
## 5                     0.597             0.6479213 -0.05092131
## 6                     0.547             0.5983617 -0.05136167

The table above shows the predicted values (fitted values), and the residuals (prediction errors). Based on these results, we are creating the following graph:

ggplot(data = validation_residuals11) +
  aes(x = residuals) + 
  geom_histogram(bins = 10)+
  labs( y="Count", x="Residuals", title = "Validation Residuals")

Comparing the training and validation data, there is less prediction error on the training data than on the validation data: the residuals are more concentrated near 0 for the training set. Despite that, the two sets perform rather similarly based on the histograms.

Second model, making a change to our predictor variables

Some variables were not statistically significant, namely crime, zone, water, room, tax_rate, and unemployment_percentage. We are removing these variables to improve our model. We are creating a data frame that contains the pollutant values from the new predictive model, along with the residuals. These residuals show the difference between the observations and the predictions on a row-by-row basis.

training_lm_model22 <- lm(`pollutant` ~ industry +  home_age_relative + distance_from_business + highway_accessible + teacher_ratio + price, data = housing_cleaned, subset = training_set_rows)  

training_residuals22 <- data.frame(training_data$pollutant, training_lm_model22$fitted.values, training_lm_model22$residuals) 

head(training_residuals22)
##     training_data.pollutant training_lm_model22.fitted.values
## 505                   0.573                         0.5390572
## 324                   0.493                         0.5026784
## 167                   0.605                         0.6323244
## 129                   0.624                         0.6186247
## 418                   0.679                         0.6948076
## 471                   0.580                         0.6393825
##     training_lm_model22.residuals
## 505                   0.033942784
## 324                  -0.009678370
## 167                  -0.027324422
## 129                   0.005375342
## 418                  -0.015807567
## 471                  -0.059382527

Summary of Model 2

options(scipen = 999)
summary(training_lm_model22)
## 
## Call:
## lm(formula = pollutant ~ industry + home_age_relative + distance_from_business + 
##     highway_accessible + teacher_ratio + price, data = housing_cleaned, 
##     subset = training_set_rows)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.109313 -0.032184 -0.009453  0.029812  0.198024 
## 
## Coefficients:
##                          Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)             0.8200819  0.0599676  13.675 < 0.0000000000000002 ***
## industry                0.0046967  0.0007473   6.285       0.000000001171 ***
## home_age_relative       0.0007337  0.0001816   4.041       0.000067900727 ***
## distance_from_business -0.0169912  0.0026475  -6.418       0.000000000548 ***
## highway_accessible      0.0031753  0.0004837   6.564       0.000000000234 ***
## teacher_ratio          -0.0110066  0.0018192  -6.050       0.000000004359 ***
## price                  -0.0433574  0.0100884  -4.298       0.000023440508 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05483 on 296 degrees of freedom
## Multiple R-squared:  0.7801, Adjusted R-squared:  0.7756 
## F-statistic:   175 on 6 and 296 DF,  p-value: < 0.00000000000000022

Analyzing the table above, we see that the F-statistic has increased from 89.27 to 175, while everything else remains mostly unchanged. This suggests the second model is better: it achieves essentially the same fit with fewer predictors, which is reflected in the larger F-statistic. Based on this, the equation for the second model is as follows:

Model22_pollutant <- (0.0046967*housing_cleaned$industry[1]) +
(0.0007337*housing_cleaned$home_age_relative[1]) +
(-0.0169912*housing_cleaned$distance_from_business[1]) + 
(0.0031753*housing_cleaned$highway_accessible[1]) + 
(-0.0110066*housing_cleaned$teacher_ratio[1]) + 
(-0.0433574*housing_cleaned$price[1]) + 
0.8200819

Model22_pollutant
## [1] 0.5381124