R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Clear the workspace
rm(list = ls())  # Clear environment
gc()             # Clear unused memory / Take out the trash
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  692981 37.1    1327144 70.9  1327144 70.9
## Vcells 1246182  9.6    8388608 64.0  1996585 15.3
cat("\f")        # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device 
##           1
library(readxl)

df <- read_excel("DataDF.xlsx")
## New names:
## • `2023 Pop Wrk Trav Time 60 Min` -> `2023 Pop Wrk Trav Time 60 Min...45`
## • `2023 Pop Wrk Trav Time 60 Min` -> `2023 Pop Wrk Trav Time 60 Min...46`
df <- as.data.frame(df)

glimpse(df)
## Rows: 1,137
## Columns: 46
## $ `Number Of Units`                        <dbl> 25, 8, 18, 17, 13, 125, 236, …
## $ `Year Built`                             <dbl> 1984, 1950, 1990, 2007, 1977,…
## $ Vacancy                                  <dbl> 7.04, 10.26, 6.31, 4.61, 6.29…
## $ `Land Area`                              <dbl> 108900, 277477, 67965, 174240…
## $ `Last Sale Price`                        <dbl> 3100000, 800000, 1000000, 600…
## $ `House Maint And Repair 2023 Cons Spdng` <dbl> 2315630, 178754, 7470769, 180…
## $ `Household Operations 2023 Cons Spdng`   <dbl> 2875112, 197441, 8760263, 158…
## $ `HH Grwth 2010 2023`                     <dbl> 1.17, 46.67, 2.66, 3.29, 0.00…
## $ `HH Grwth 2023 2028`                     <dbl> 9.88, 7.91, -1.30, -0.64, 0.0…
## $ `2023 Households`                        <dbl> 2571, 139, 8107, 156, 21, 832…
## $ `2023 Med HH Size`                       <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ `2023 Avg HH Size`                       <dbl> 2.3, 3.0, 2.0, 2.1, 2.2, 2.1,…
## $ `HU Grwth 2010 2023`                     <dbl> 8.03, 51.09, 5.03, 10.64, -25…
## $ `2023 Avg HU Size`                       <dbl> 4, 2, 9, 1, 1, 7, 9, 2, 17, 3…
## $ `2023 Avg HU Value`                      <dbl> 215342, 225340, 240448, 15709…
## $ `2023 Avg Yr Built`                      <dbl> 1971, 2003, 1974, 1984, 1984,…
## $ `2023 Group Quarters`                    <dbl> 491, 0, 130, 0, 0, 62, 209, 1…
## $ `2023 Home Blt 1940 1949`                <dbl> 849, 1, 763, 7, 4, 583, 363, …
## $ `2023 Home Blt 1950 1959`                <dbl> 381, 2, 1500, 4, 1, 2772, 217…
## $ `2023 Home Blt 1960 1969`                <dbl> 277, 2, 1854, 7, 3, 1745, 178…
## $ `2023 Home Blt 1970 1979`                <dbl> 247, 8, 3048, 83, 5, 1907, 26…
## $ `2023 Home Blt 1980 1989`                <dbl> 310, 8, 1360, 52, 6, 786, 104…
## $ `2023 Home Blt 1990 1999`                <dbl> 284, 12, 268, 40, 8, 414, 245…
## $ `2023 Home Blt 2000 2010`                <dbl> 372, 72, 917, 21, 10, 56, 135…
## $ `2023 Home Blt 2010`                     <dbl> 145, 49, 283, 8, 0, 1220, 0, …
## $ `2023 HU 1 Unit`                         <dbl> 1985, 120, 3791, 77, 16, 6281…
## $ `2023 HU 20 Units`                       <dbl> 88, 3, 1353, 0, 0, 1658, 1695…
## $ `2023 HU 2 4 Units`                      <dbl> 127, 0, 926, 0, 0, 458, 435, …
## $ `2023 HU 5 19 Units`                     <dbl> 432, 5, 1522, 0, 0, 546, 948,…
## $ `2023 Med Yr Built`                      <dbl> 1966, 2006, 1972, 1981, 1988,…
## $ `2023 Owner Occd Housing`                <dbl> 1343, 103, 4289, 124, 18, 631…
## $ `2023 Renter Occd Housing`               <dbl> 1229, 36, 3818, 32, 3, 2012, …
## $ `2023 Home Value 1000000`                <dbl> 3, 0, 106, 0, 4, 15, 117, 0, …
## $ `2023 Home Value 100000 200000`          <dbl> 514, 28, 919, 29, 4, 1958, 12…
## $ `2023 Home Value 200000 300000`          <dbl> 341, 56, 671, 3, 2, 1352, 133…
## $ `2023 Home Value 300000 400000`          <dbl> 106, 12, 579, 2, 2, 730, 543,…
## $ `2023 Home Value 400000 500000`          <dbl> 105, 1, 80, 0, 0, 355, 159, 1…
## $ `2023 Home Value 500000 1000000`         <dbl> 33, 0, 361, 12, 0, 324, 624, …
## $ `2023 Home Value 100000`                 <dbl> 242, 6, 1573, 78, 5, 1575, 65…
## $ `2023 Median Home Value`                 <dbl> 183657, 231250, 162187, 79486…
## $ `Pop Grwth 2010 2023`                    <dbl> 9.44, 51.61, 4.49, 10.56, -26…
## $ `Pop Grwth 2023 2028`                    <dbl> 10.34, 8.04, -1.39, 0.00, -1.…
## $ `2023 Population`                        <dbl> 6479, 423, 16666, 335, 51, 17…
## $ `2023 Pop Wrk Trav Time 30 Min`          <dbl> 1882, 104, 5237, 79, 11, 6442…
## $ `2023 Pop Wrk Trav Time 60 Min...45`     <dbl> 476, 93, 2031, 49, 2, 1838, 1…
## $ `2023 Pop Wrk Trav Time 60 Min...46`     <dbl> 202, 21, 377, 4, 2, 234, 274,…
library(dplyr)
library(ggcorrplot)
## Loading required package: ggplot2
?ggcorrplot()
?cor


?cor_pmat # cor_pmat(): Compute a correlation matrix p-values.

mycorr<- cor(x = df, use = "pairwise.complete.obs",method = c("spearman"))

p.mat <- ggcorrplot::cor_pmat(x = df)

# head(p.mat)
library(ggcorrplot)

myplot<-ggcorrplot(corr     = mycorr,   # correlation matrix to visualize
                   method   = "square", # character, the visualization method of correlation matrix to be used. Allowed values are "square" (default), "circle"
                   type     = "lower",  # character, "full" (default), "lower" or "upper" display
                   title    = "Correlation Plot",  # character, title of the graph
                   colors   = c("red", "white","green"), #    vector of 3 colors for low, mid and high correlation values.
                   lab      = TRUE,   # If TRUE, add corr coeff on the plot.
                   lab_size = 2,      # labels. used when lab = TRUE.
                   p.mat    = p.mat,  # matrix of p-value. If NULL, arguments sig.level, insig, pch, pch.col, pch.cex is invalid.  # Barring the no significant coefficient
                   insig    = "pch",  # character, specialized insignificant correlation coefficients, "pch" (default), "blank". If "blank", wipe away the corresponding glyphs; if "pch", add characters (see pch for details) on corresponding glyphs.
                   pch      = 4, # add character on the glyphs of insignificant correlation coefficients (only valid when insig is "pch"). Default value is 4.
                   hc.order = TRUE, # If TRUE, correlation matrix will be hc.ordered using hclust function.
                   tl.cex   = 8, # the size, the color and the string rotation of text label
                   tl.col   = "black", 
                   digits = 2
)
myplot

# Load necessary library
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(stargazer,quietly = T)
stargazer(df,
          type = "text",
          title = "Summary Statistics",
          notes = "Data Source: CoStar, n= 46",
          digits = 2,
          omit.summary.stat = "n",  
          out = "my_table.tex")
## 
## Summary Statistics
## ====================================================================================
## Statistic                                  Mean       St. Dev.     Min       Max    
## ------------------------------------------------------------------------------------
## Number Of Units                           119.37       158.94       1       1,519   
## Year Built                               1,970.41       18.41     1,900     2,022   
## Vacancy                                    6.20         5.56       0.06    100.00   
## Land Area                               869,519.80  1,862,579.00    1    25,501,431 
## Last Sale Price                        6,121,384.00 15,591,909.00 2,600  363,125,000
## House Maint And Repair 2023 Cons Spdng 3,201,500.00 2,408,647.00    0    12,607,149 
## Household Operations 2023 Cons Spdng   3,457,632.00 2,645,364.00    0    18,560,824 
## HH Grwth 2010 2023                        12.45         17.91     -24.46   157.93   
## HH Grwth 2023 2028                         2.78         3.86      -10.79    17.24   
## 2023 Households                          3,034.50     2,401.24      0      12,942   
## 2023 Med HH Size                           2.10         0.33        0         4     
## 2023 Avg HH Size                           2.41         0.36       0.00     4.10    
## HU Grwth 2010 2023                        17.45         23.32     -49.33   188.33   
## 2023 Avg HU Size                           4.59         3.58        0        19     
## 2023 Avg HU Value                       227,786.60    96,678.88     0      764,965  
## 2023 Avg Yr Built                        1,980.33       83.62       0       2,008   
## 2023 Group Quarters                       104.62       233.94       0       3,287   
## 2023 Home Blt 1940 1949                   169.37       309.81       0       4,433   
## 2023 Home Blt 1950 1959                   345.57       512.42       0       3,297   
## 2023 Home Blt 1960 1969                   489.47       599.10       0       4,157   
## 2023 Home Blt 1970 1979                   770.10       826.32       0       5,872   
## 2023 Home Blt 1980 1989                   688.46       619.67       0       4,214   
## 2023 Home Blt 1990 1999                   391.28       359.37       0       3,347   
## 2023 Home Blt 2000 2010                   390.95       409.31       0       3,539   
## 2023 Home Blt 2010                        335.47       407.24       0       2,570   
## 2023 HU 1 Unit                           1,732.25     1,280.85      0       8,026   
## 2023 HU 20 Units                          387.00       724.56       0       7,773   
## 2023 HU 2 4 Units                         293.79       391.61       0       2,743   
## 2023 HU 5 19 Units                        420.39       621.17       0       4,326   
## 2023 Med Yr Built                        1,979.39       83.81       0       2,010   
## 2023 Owner Occd Housing                  1,860.25     1,393.00      0       7,443   
## 2023 Renter Occd Housing                 1,174.25     1,233.33      0       8,801   
## 2023 Home Value 1000000                   24.79         60.63       0        551    
## 2023 Home Value 100000 200000             531.79       500.64       0       3,138   
## 2023 Home Value 200000 300000             409.99       379.43       0       2,179   
## 2023 Home Value 300000 400000             198.22       237.68       0       1,649   
## 2023 Home Value 400000 500000             75.17        119.11       0       1,179   
## 2023 Home Value 500000 1000000            100.51       144.80       0       1,829   
## 2023 Home Value 100000                    519.80       585.55       0       4,566   
## 2023 Median Home Value                  188,513.00    92,171.90     0      767,973  
## Pop Grwth 2010 2023                       17.37         22.50     -48.46   189.24   
## Pop Grwth 2023 2028                        2.80         3.81      -8.96     17.13   
## 2023 Population                          7,401.47     5,884.95      0      32,505   
## 2023 Pop Wrk Trav Time 30 Min            2,057.95     1,764.77      0       8,511   
## 2023 Pop Wrk Trav Time 60 Min...45        988.64      1,018.90      0       7,965   
## 2023 Pop Wrk Trav Time 60 Min...46        245.87       287.87       0       2,940   
## ------------------------------------------------------------------------------------
## Data Source: CoStar, n= 46
# Extract numeric values from the dataframe
X <- as.matrix(df[sapply(df, is.numeric)])

# Before standardization
colMeans(X)  # Mean of each column
##                        Number Of Units                             Year Built 
##                           1.193685e+02                           1.970414e+03 
##                                Vacancy                              Land Area 
##                                     NA                           8.695198e+05 
##                        Last Sale Price House Maint And Repair 2023 Cons Spdng 
##                           6.121384e+06                           3.201500e+06 
##   Household Operations 2023 Cons Spdng                     HH Grwth 2010 2023 
##                           3.457632e+06                           1.244945e+01 
##                     HH Grwth 2023 2028                        2023 Households 
##                           2.775743e+00                           3.034502e+03 
##                       2023 Med HH Size                       2023 Avg HH Size 
##                           2.098505e+00                           2.413632e+00 
##                     HU Grwth 2010 2023                       2023 Avg HU Size 
##                           1.744588e+01                           4.585752e+00 
##                      2023 Avg HU Value                      2023 Avg Yr Built 
##                           2.277866e+05                           1.980332e+03 
##                    2023 Group Quarters                2023 Home Blt 1940 1949 
##                           1.046218e+02                           1.693685e+02 
##                2023 Home Blt 1950 1959                2023 Home Blt 1960 1969 
##                           3.455734e+02                           4.894670e+02 
##                2023 Home Blt 1970 1979                2023 Home Blt 1980 1989 
##                           7.701047e+02                           6.884617e+02 
##                2023 Home Blt 1990 1999                2023 Home Blt 2000 2010 
##                           3.912832e+02                           3.909525e+02 
##                     2023 Home Blt 2010                         2023 HU 1 Unit 
##                           3.354697e+02                           1.732252e+03 
##                       2023 HU 20 Units                      2023 HU 2 4 Units 
##                           3.869956e+02                           2.937863e+02 
##                     2023 HU 5 19 Units                      2023 Med Yr Built 
##                           4.203896e+02                           1.979392e+03 
##                2023 Owner Occd Housing               2023 Renter Occd Housing 
##                           1.860248e+03                           1.174253e+03 
##                2023 Home Value 1000000          2023 Home Value 100000 200000 
##                           2.479332e+01                           5.317854e+02 
##          2023 Home Value 200000 300000          2023 Home Value 300000 400000 
##                           4.099894e+02                           1.982172e+02 
##          2023 Home Value 400000 500000         2023 Home Value 500000 1000000 
##                           7.517326e+01                           1.005084e+02 
##                 2023 Home Value 100000                 2023 Median Home Value 
##                           5.197977e+02                           1.885130e+05 
##                    Pop Grwth 2010 2023                    Pop Grwth 2023 2028 
##                           1.737237e+01                           2.804186e+00 
##                        2023 Population          2023 Pop Wrk Trav Time 30 Min 
##                           7.401467e+03                           2.057947e+03 
##     2023 Pop Wrk Trav Time 60 Min...45     2023 Pop Wrk Trav Time 60 Min...46 
##                           9.886376e+02                           2.458698e+02
apply(X = X,
          MARGIN = 2,
          FUN = sd
          )  # standard deviation
##                        Number Of Units                             Year Built 
##                           1.589372e+02                           1.841338e+01 
##                                Vacancy                              Land Area 
##                                     NA                           1.862579e+06 
##                        Last Sale Price House Maint And Repair 2023 Cons Spdng 
##                           1.559191e+07                           2.408647e+06 
##   Household Operations 2023 Cons Spdng                     HH Grwth 2010 2023 
##                           2.645364e+06                           1.791372e+01 
##                     HH Grwth 2023 2028                        2023 Households 
##                           3.855609e+00                           2.401237e+03 
##                       2023 Med HH Size                       2023 Avg HH Size 
##                           3.263211e-01                           3.619425e-01 
##                     HU Grwth 2010 2023                       2023 Avg HU Size 
##                           2.331966e+01                           3.582514e+00 
##                      2023 Avg HU Value                      2023 Avg Yr Built 
##                           9.667888e+04                           8.362326e+01 
##                    2023 Group Quarters                2023 Home Blt 1940 1949 
##                           2.339396e+02                           3.098076e+02 
##                2023 Home Blt 1950 1959                2023 Home Blt 1960 1969 
##                           5.124212e+02                           5.991007e+02 
##                2023 Home Blt 1970 1979                2023 Home Blt 1980 1989 
##                           8.263159e+02                           6.196659e+02 
##                2023 Home Blt 1990 1999                2023 Home Blt 2000 2010 
##                           3.593700e+02                           4.093120e+02 
##                     2023 Home Blt 2010                         2023 HU 1 Unit 
##                           4.072429e+02                           1.280847e+03 
##                       2023 HU 20 Units                      2023 HU 2 4 Units 
##                           7.245635e+02                           3.916130e+02 
##                     2023 HU 5 19 Units                      2023 Med Yr Built 
##                           6.211725e+02                           8.381302e+01 
##                2023 Owner Occd Housing               2023 Renter Occd Housing 
##                           1.393002e+03                           1.233330e+03 
##                2023 Home Value 1000000          2023 Home Value 100000 200000 
##                           6.062964e+01                           5.006420e+02 
##          2023 Home Value 200000 300000          2023 Home Value 300000 400000 
##                           3.794259e+02                           2.376790e+02 
##          2023 Home Value 400000 500000         2023 Home Value 500000 1000000 
##                           1.191127e+02                           1.447951e+02 
##                 2023 Home Value 100000                 2023 Median Home Value 
##                           5.855536e+02                           9.217190e+04 
##                    Pop Grwth 2010 2023                    Pop Grwth 2023 2028 
##                           2.250016e+01                           3.809244e+00 
##                        2023 Population          2023 Pop Wrk Trav Time 30 Min 
##                           5.884947e+03                           1.764772e+03 
##     2023 Pop Wrk Trav Time 60 Min...45     2023 Pop Wrk Trav Time 60 Min...46 
##                           1.018898e+03                           2.878651e+02
# # scale : mean = 0, std=1
# ?scale    
#     X = scale(x = X)
#  
# # after standardization
#     colMeans(x = X)    # mean ~ 0
# 
apply(X = X,
          MARGIN = 2,
          FUN = sd
          )  # standard deviation = 1
##                        Number Of Units                             Year Built 
##                           1.589372e+02                           1.841338e+01 
##                                Vacancy                              Land Area 
##                                     NA                           1.862579e+06 
##                        Last Sale Price House Maint And Repair 2023 Cons Spdng 
##                           1.559191e+07                           2.408647e+06 
##   Household Operations 2023 Cons Spdng                     HH Grwth 2010 2023 
##                           2.645364e+06                           1.791372e+01 
##                     HH Grwth 2023 2028                        2023 Households 
##                           3.855609e+00                           2.401237e+03 
##                       2023 Med HH Size                       2023 Avg HH Size 
##                           3.263211e-01                           3.619425e-01 
##                     HU Grwth 2010 2023                       2023 Avg HU Size 
##                           2.331966e+01                           3.582514e+00 
##                      2023 Avg HU Value                      2023 Avg Yr Built 
##                           9.667888e+04                           8.362326e+01 
##                    2023 Group Quarters                2023 Home Blt 1940 1949 
##                           2.339396e+02                           3.098076e+02 
##                2023 Home Blt 1950 1959                2023 Home Blt 1960 1969 
##                           5.124212e+02                           5.991007e+02 
##                2023 Home Blt 1970 1979                2023 Home Blt 1980 1989 
##                           8.263159e+02                           6.196659e+02 
##                2023 Home Blt 1990 1999                2023 Home Blt 2000 2010 
##                           3.593700e+02                           4.093120e+02 
##                     2023 Home Blt 2010                         2023 HU 1 Unit 
##                           4.072429e+02                           1.280847e+03 
##                       2023 HU 20 Units                      2023 HU 2 4 Units 
##                           7.245635e+02                           3.916130e+02 
##                     2023 HU 5 19 Units                      2023 Med Yr Built 
##                           6.211725e+02                           8.381302e+01 
##                2023 Owner Occd Housing               2023 Renter Occd Housing 
##                           1.393002e+03                           1.233330e+03 
##                2023 Home Value 1000000          2023 Home Value 100000 200000 
##                           6.062964e+01                           5.006420e+02 
##          2023 Home Value 200000 300000          2023 Home Value 300000 400000 
##                           3.794259e+02                           2.376790e+02 
##          2023 Home Value 400000 500000         2023 Home Value 500000 1000000 
##                           1.191127e+02                           1.447951e+02 
##                 2023 Home Value 100000                 2023 Median Home Value 
##                           5.855536e+02                           9.217190e+04 
##                    Pop Grwth 2010 2023                    Pop Grwth 2023 2028 
##                           2.250016e+01                           3.809244e+00 
##                        2023 Population          2023 Pop Wrk Trav Time 30 Min 
##                           5.884947e+03                           1.764772e+03 
##     2023 Pop Wrk Trav Time 60 Min...45     2023 Pop Wrk Trav Time 60 Min...46 
##                           1.018898e+03                           2.878651e+02
library(tidyr)

# Replace missing values with 1
df_cleaned <- df %>% replace(is.na(.), 1)

# Verify the cleaned dataframe
head(df_cleaned)
##   Number Of Units Year Built Vacancy Land Area Last Sale Price
## 1              25       1984    7.04    108900         3100000
## 2               8       1950   10.26    277477          800000
## 3              18       1990    6.31     67965         1000000
## 4              17       2007    4.61    174240          600000
## 5              13       1977    6.29    348480          750000
## 6             125       1936    6.04    337154         9500000
##   House Maint And Repair 2023 Cons Spdng Household Operations 2023 Cons Spdng
## 1                                2315630                              2875112
## 2                                 178754                               197441
## 3                                7470769                              8760263
## 4                                 180919                               158527
## 5                                  26145                                21370
## 6                               11054731                             11175881
##   HH Grwth 2010 2023 HH Grwth 2023 2028 2023 Households 2023 Med HH Size
## 1               1.17               9.88            2571                2
## 2              46.67               7.91             139                3
## 3               2.66              -1.30            8107                2
## 4               3.29              -0.64             156                2
## 5               0.00               0.00              21                2
## 6              13.91               0.40            8322                2
##   2023 Avg HH Size HU Grwth 2010 2023 2023 Avg HU Size 2023 Avg HU Value
## 1              2.3               8.03                4            215342
## 2              3.0              51.09                2            225340
## 3              2.0               5.03                9            240448
## 4              2.1              10.64                1            157097
## 5              2.2             -25.00                1            394118
## 6              2.1              15.95                7            222175
##   2023 Avg Yr Built 2023 Group Quarters 2023 Home Blt 1940 1949
## 1              1971                 491                     849
## 2              2003                   0                       1
## 3              1974                 130                     763
## 4              1984                   0                       7
## 5              1984                   0                       4
## 6              1972                  62                     583
##   2023 Home Blt 1950 1959 2023 Home Blt 1960 1969 2023 Home Blt 1970 1979
## 1                     381                     277                     247
## 2                       2                       2                       8
## 3                    1500                    1854                    3048
## 4                       4                       7                      83
## 5                       1                       3                       5
## 6                    2772                    1745                    1907
##   2023 Home Blt 1980 1989 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 1                     310                     284                     372
## 2                       8                      12                      72
## 3                    1360                     268                     917
## 4                      52                      40                      21
## 5                       6                       8                      10
## 6                     786                     414                      56
##   2023 Home Blt 2010 2023 HU 1 Unit 2023 HU 20 Units 2023 HU 2 4 Units
## 1                145           1985               88               127
## 2                 49            120                3                 0
## 3                283           3791             1353               926
## 4                  8             77                0                 0
## 5                  0             16                0                 0
## 6               1220           6281             1658               458
##   2023 HU 5 19 Units 2023 Med Yr Built 2023 Owner Occd Housing
## 1                432              1966                    1343
## 2                  5              2006                     103
## 3               1522              1972                    4289
## 4                  0              1981                     124
## 5                  0              1988                      18
## 6                546              1967                    6310
##   2023 Renter Occd Housing 2023 Home Value 1000000
## 1                     1229                       3
## 2                       36                       0
## 3                     3818                     106
## 4                       32                       0
## 5                        3                       4
## 6                     2012                      15
##   2023 Home Value 100000 200000 2023 Home Value 200000 300000
## 1                           514                           341
## 2                            28                            56
## 3                           919                           671
## 4                            29                             3
## 5                             4                             2
## 6                          1958                          1352
##   2023 Home Value 300000 400000 2023 Home Value 400000 500000
## 1                           106                           105
## 2                            12                             1
## 3                           579                            80
## 4                             2                             0
## 5                             2                             0
## 6                           730                           355
##   2023 Home Value 500000 1000000 2023 Home Value 100000 2023 Median Home Value
## 1                             33                    242                 183657
## 2                              0                      6                 231250
## 3                            361                   1573                 162187
## 4                             12                     78                  79486
## 5                              0                      5                 187499
## 6                            324                   1575                 180668
##   Pop Grwth 2010 2023 Pop Grwth 2023 2028 2023 Population
## 1                9.44               10.34            6479
## 2               51.61                8.04             423
## 3                4.49               -1.39           16666
## 4               10.56                0.00             335
## 5              -26.09               -1.96              51
## 6               14.49                0.17           17448
##   2023 Pop Wrk Trav Time 30 Min 2023 Pop Wrk Trav Time 60 Min...45
## 1                          1882                                476
## 2                           104                                 93
## 3                          5237                               2031
## 4                            79                                 49
## 5                            11                                  2
## 6                          6442                               1838
##   2023 Pop Wrk Trav Time 60 Min...46
## 1                                202
## 2                                 21
## 3                                377
## 4                                  4
## 5                                  2
## 6                                234
# Load necessary library
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
# Assuming df contains your predictors and 'last_sale_price' is your target variable

# Extract the target variable y (Last Sale Price)
y <- df_cleaned$`Last Sale Price`

# Prepare the predictor variables (excluding the target variable)
X <- df_cleaned[, !colnames(df_cleaned) %in% c("Last Sale Price")]

# Standardize the predictor variables
X_scaled <- scale(X)
# Fit a Lasso model
lasso_model <- cv.glmnet(X_scaled, y, alpha = 1)  # alpha = 1 for Lasso

# Get the selected lambda (regularization strength)
best_lambda <- lasso_model$lambda.min

# Extract the coefficients for the selected lambda
lasso_coef <- coef(lasso_model, s = best_lambda)

# Print the coefficients
print(lasso_coef)
## 46 x 1 sparse Matrix of class "dgCMatrix"
##                                                 s1
## (Intercept)                             6121384.17
## Number Of Units                         6896306.68
## Year Built                                    .   
## Vacancy                                       .   
## Land Area                               4033290.13
## House Maint And Repair 2023 Cons Spdng        .   
## Household Operations 2023 Cons Spdng          .   
## HH Grwth 2010 2023                            .   
## HH Grwth 2023 2028                            .   
## 2023 Households                               .   
## 2023 Med HH Size                         900178.95
## 2023 Avg HH Size                       -1039301.39
## HU Grwth 2010 2023                      -991622.69
## 2023 Avg HU Size                         883102.82
## 2023 Avg HU Value                             .   
## 2023 Avg Yr Built                         18053.87
## 2023 Group Quarters                           .   
## 2023 Home Blt 1940 1949                       .   
## 2023 Home Blt 1950 1959                       .   
## 2023 Home Blt 1960 1969                 -124236.76
## 2023 Home Blt 1970 1979                 -283008.45
## 2023 Home Blt 1980 1989                  610197.19
## 2023 Home Blt 1990 1999                 -129183.67
## 2023 Home Blt 2000 2010                       .   
## 2023 Home Blt 2010                       876842.56
## 2023 HU 1 Unit                                .   
## 2023 HU 20 Units                        -241903.95
## 2023 HU 2 4 Units                             .   
## 2023 HU 5 19 Units                     -1097736.71
## 2023 Med Yr Built                             .   
## 2023 Owner Occd Housing                       .   
## 2023 Renter Occd Housing                      .   
## 2023 Home Value 1000000                       .   
## 2023 Home Value 100000 200000           -207854.07
## 2023 Home Value 200000 300000           -136592.67
## 2023 Home Value 300000 400000            425753.53
## 2023 Home Value 400000 500000                 .   
## 2023 Home Value 500000 1000000           988541.39
## 2023 Home Value 100000                        .   
## 2023 Median Home Value                  -336579.02
## Pop Grwth 2010 2023                           .   
## Pop Grwth 2023 2028                      651974.89
## 2023 Population                               .   
## 2023 Pop Wrk Trav Time 30 Min                 .   
## 2023 Pop Wrk Trav Time 60 Min...45       -86112.85
## 2023 Pop Wrk Trav Time 60 Min...46      -572852.03
library(glmnet)

# Fit a Ridge model
ridge_model <- cv.glmnet(X_scaled, y, alpha = 0)  # alpha = 0 for Ridge

# Get the selected lambda (regularization strength)
best_lambda <- ridge_model$lambda.min

# Extract the coefficients for the selected lambda
ridge_coef <- coef(ridge_model, s = best_lambda)

# Print the coefficients
print(ridge_coef)
## 46 x 1 sparse Matrix of class "dgCMatrix"
##                                                 s1
## (Intercept)                             6121384.17
## Number Of Units                         6122851.88
## Year Built                               115686.96
## Vacancy                                 -112623.76
## Land Area                               4082979.12
## House Maint And Repair 2023 Cons Spdng    72562.61
## Household Operations 2023 Cons Spdng     258208.54
## HH Grwth 2010 2023                       215037.74
## HH Grwth 2023 2028                       212661.70
## 2023 Households                          232800.16
## 2023 Med HH Size                         857457.45
## 2023 Avg HH Size                       -1067990.89
## HU Grwth 2010 2023                      -904010.06
## 2023 Avg HU Size                        1020802.93
## 2023 Avg HU Value                        204002.68
## 2023 Avg Yr Built                        138318.79
## 2023 Group Quarters                      -58098.78
## 2023 Home Blt 1940 1949                 -261420.59
## 2023 Home Blt 1950 1959                 -195418.15
## 2023 Home Blt 1960 1969                 -601532.40
## 2023 Home Blt 1970 1979                 -793365.62
## 2023 Home Blt 1980 1989                  685487.71
## 2023 Home Blt 1990 1999                 -530614.52
## 2023 Home Blt 2000 2010                  -54513.94
## 2023 Home Blt 2010                       773975.32
## 2023 HU 1 Unit                           383416.79
## 2023 HU 20 Units                        -430992.14
## 2023 HU 2 4 Units                        126537.03
## 2023 HU 5 19 Units                     -1040881.48
## 2023 Med Yr Built                          5774.95
## 2023 Owner Occd Housing                   68479.78
## 2023 Renter Occd Housing                 432183.72
## 2023 Home Value 1000000                 -131385.87
## 2023 Home Value 100000 200000           -547013.51
## 2023 Home Value 200000 300000           -237505.02
## 2023 Home Value 300000 400000            612758.29
## 2023 Home Value 400000 500000            209384.50
## 2023 Home Value 500000 1000000           938463.42
## 2023 Home Value 100000                   239574.39
## 2023 Median Home Value                  -705520.18
## Pop Grwth 2010 2023                     -276963.77
## Pop Grwth 2023 2028                      496516.36
## 2023 Population                          366932.60
## 2023 Pop Wrk Trav Time 30 Min            107498.06
## 2023 Pop Wrk Trav Time 60 Min...45      -439942.54
## 2023 Pop Wrk Trav Time 60 Min...46      -598047.27
# Extract non-zero coefficients from Lasso model
#lasso_coefs <- coef(lasso_model, s = "lambda.min")[-1]  # Exclude intercept
#keep_X <- which(lasso_coefs != 0)

# Subset predictor variables based on non-zero coefficients from Lasso
#X_H <- X_scaled[, keep_X]
 
# Perform Linear regression
#li.eq <- summary(lm(y ~ X_H))
# 
# Perform Ridge regression using lambda from Lasso
#ri.eq <- glmnet(x = X_scaled, y = y, lambda = lasso_model$lambda.min, family = "gaussian", intercept = FALSE, alpha = 0)
 
# Ensure all coefficients have the same length
#num_features <- max(length(beta), ncol(X_H), length(lasso_coefs), ncol(coef(ri.eq)))

# Create vectors with the same length
#beta <- c(beta, rep(NA, num_features - length(beta)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(coef(ri.eq)[, 1], rep(NA, num_features - ncol(coef(ri.eq))))

# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), ncol(coef(ri.eq)))

# Create vectors with the same length and fill missing values with NA
#beta <- c(beta, rep(NA, num_features - length(beta)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))

# Extract Ridge coefficients and fill missing values with NA
#ridge_coefs <- predict(ri.eq, s = lasso_model$lambda.min, type = "coefficients")[, 1]
#ridge_coefs <- as.vector(ridge_coefs)

# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), length(ridge_coefs))

# Create vectors with the same length and fill missing values with NA
#beta <- c(beta, rep(NA, num_features - length(beta)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
#li_eq_coefs <- c(li.eq$coefficients, rep(NA, num_features - length(li.eq$coefficients)))

# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), length(ridge_coefs))

# Pad coefficient vectors with NA if needed
#beta <- c(beta, rep(NA, num_features - length(beta)))
#li_eq_coefs <- c(li.eq$coefficients, rep(NA, num_features - length(li.eq$coefficients)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))

# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li_eq_coefs), length(lasso_coefs), length(ridge_coefs))
# 
# Create vectors with the same length and fill missing values with NA
#beta <- c(beta, rep(NA, num_features - length(beta)))
#li_eq_coefs <- c(li_eq_coefs, rep(NA, num_features - length(li_eq_coefs)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))

# Combine coefficient vectors into a matrix
#coef_matrix <- cbind(beta, li_eq_coefs, lasso_coefs, ridge_coefs)

# Convert the matrix to a dataframe
#df.comp <- as.data.frame(coef_matrix)
# 
# Rename the columns
#colnames(df.comp) <- c("beta", "Linear", "Lasso", "Ridge")
# 
# Print the coefficients dataframe
#print(df.comp)
lasso_coefs <- lasso_coef[1:46]
ridge_coefs <- ridge_coef[1:46]
 # Fit a multivariate linear regression model
lm_model <- lm(`Last Sale Price` ~ ., data = df)

# Print the summary of the regression model
summary(lm_model)
## 
## Call:
## lm(formula = `Last Sale Price` ~ ., data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -82623712  -2597135    -14414   2321687 268637835 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              -1.484e+07  4.080e+07  -0.364  0.71611
## `Number Of Units`                         4.520e+04  3.139e+03  14.398  < 2e-16
## `Year Built`                              5.871e+03  2.023e+04   0.290  0.77166
## Vacancy                                  -1.154e+04  6.431e+04  -0.179  0.85763
## `Land Area`                               2.100e+00  2.577e-01   8.148 1.01e-15
## `House Maint And Repair 2023 Cons Spdng` -6.503e+00  9.739e+00  -0.668  0.50443
## `Household Operations 2023 Cons Spdng`    8.233e-01  1.991e+00   0.414  0.67925
## `HH Grwth 2010 2023`                      1.207e+05  5.516e+04   2.188  0.02886
## `HH Grwth 2023 2028`                     -7.993e+04  1.190e+06  -0.067  0.94648
## `2023 Households`                        -7.495e+05  7.146e+05  -1.049  0.29449
## `2023 Med HH Size`                        3.548e+06  1.563e+06   2.270  0.02337
## `2023 Avg HH Size`                       -3.660e+06  2.037e+06  -1.797  0.07260
## `HU Grwth 2010 2023`                     -4.389e+05  1.830e+05  -2.399  0.01663
## `2023 Avg HU Size`                        4.376e+05  2.130e+05   2.054  0.04019
## `2023 Avg HU Value`                       1.332e+01  1.505e+01   0.885  0.37623
## `2023 Avg Yr Built`                       2.682e+05  1.713e+05   1.566  0.11763
## `2023 Group Quarters`                    -9.404e+02  1.828e+03  -0.514  0.60711
## `2023 Home Blt 1940 1949`                -4.573e+03  2.466e+03  -1.854  0.06398
## `2023 Home Blt 1950 1959`                -3.108e+03  2.461e+03  -1.263  0.20686
## `2023 Home Blt 1960 1969`                -4.044e+03  1.966e+03  -2.057  0.03992
## `2023 Home Blt 1970 1979`                -4.507e+03  1.898e+03  -2.375  0.01772
## `2023 Home Blt 1980 1989`                -5.808e+02  2.000e+03  -0.290  0.77155
## `2023 Home Blt 1990 1999`                -5.509e+03  2.130e+03  -2.586  0.00984
## `2023 Home Blt 2000 2010`                -2.127e+03  1.665e+03  -1.278  0.20165
## `2023 Home Blt 2010`                      6.067e+02  1.958e+03   0.310  0.75674
## `2023 HU 1 Unit`                          1.111e+03  1.082e+03   1.027  0.30478
## `2023 HU 20 Units`                       -2.562e+02  1.211e+03  -0.212  0.83247
## `2023 HU 2 4 Units`                       1.658e+03  1.922e+03   0.863  0.38836
## `2023 HU 5 19 Units`                     -2.797e+03  1.487e+03  -1.881  0.06020
## `2023 Med Yr Built`                      -2.670e+05  1.715e+05  -1.557  0.11969
## `2023 Owner Occd Housing`                 7.712e+05  7.641e+05   1.009  0.31310
## `2023 Renter Occd Housing`                7.529e+05  7.145e+05   1.054  0.29222
## `2023 Home Value 1000000`                -1.350e+04  4.783e+05  -0.028  0.97749
## `2023 Home Value 100000 200000`          -1.049e+04  4.780e+05  -0.022  0.98249
## `2023 Home Value 200000 300000`          -1.011e+04  4.780e+05  -0.021  0.98313
## `2023 Home Value 300000 400000`          -4.351e+03  4.780e+05  -0.009  0.99274
## `2023 Home Value 400000 500000`          -7.447e+03  4.781e+05  -0.016  0.98757
## `2023 Home Value 500000 1000000`         -1.165e+02  4.780e+05   0.000  0.99981
## `2023 Home Value 100000`                 -8.129e+03  4.781e+05  -0.017  0.98644
## `2023 Median Home Value`                 -1.987e+01  1.460e+01  -1.361  0.17369
## `Pop Grwth 2010 2023`                     3.095e+05  1.816e+05   1.704  0.08870
## `Pop Grwth 2023 2028`                     2.842e+05  1.209e+06   0.235  0.81412
## `2023 Population`                         4.303e+02  8.436e+02   0.510  0.61012
## `2023 Pop Wrk Trav Time 30 Min`          -9.546e+02  1.083e+03  -0.882  0.37823
## `2023 Pop Wrk Trav Time 60 Min...45`     -1.370e+03  1.355e+03  -1.011  0.31228
## `2023 Pop Wrk Trav Time 60 Min...46`     -4.700e+03  2.859e+03  -1.644  0.10055
##                                             
## (Intercept)                                 
## `Number Of Units`                        ***
## `Year Built`                                
## Vacancy                                     
## `Land Area`                              ***
## `House Maint And Repair 2023 Cons Spdng`    
## `Household Operations 2023 Cons Spdng`      
## `HH Grwth 2010 2023`                     *  
## `HH Grwth 2023 2028`                        
## `2023 Households`                           
## `2023 Med HH Size`                       *  
## `2023 Avg HH Size`                       .  
## `HU Grwth 2010 2023`                     *  
## `2023 Avg HU Size`                       *  
## `2023 Avg HU Value`                         
## `2023 Avg Yr Built`                         
## `2023 Group Quarters`                       
## `2023 Home Blt 1940 1949`                .  
## `2023 Home Blt 1950 1959`                   
## `2023 Home Blt 1960 1969`                *  
## `2023 Home Blt 1970 1979`                *  
## `2023 Home Blt 1980 1989`                   
## `2023 Home Blt 1990 1999`                ** 
## `2023 Home Blt 2000 2010`                   
## `2023 Home Blt 2010`                        
## `2023 HU 1 Unit`                            
## `2023 HU 20 Units`                          
## `2023 HU 2 4 Units`                         
## `2023 HU 5 19 Units`                     .  
## `2023 Med Yr Built`                         
## `2023 Owner Occd Housing`                   
## `2023 Renter Occd Housing`                  
## `2023 Home Value 1000000`                   
## `2023 Home Value 100000 200000`             
## `2023 Home Value 200000 300000`             
## `2023 Home Value 300000 400000`             
## `2023 Home Value 400000 500000`             
## `2023 Home Value 500000 1000000`            
## `2023 Home Value 100000`                    
## `2023 Median Home Value`                    
## `Pop Grwth 2010 2023`                    .  
## `Pop Grwth 2023 2028`                       
## `2023 Population`                           
## `2023 Pop Wrk Trav Time 30 Min`             
## `2023 Pop Wrk Trav Time 60 Min...45`        
## `2023 Pop Wrk Trav Time 60 Min...46`        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11560000 on 1089 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.4728, Adjusted R-squared:  0.451 
## F-statistic:  21.7 on 45 and 1089 DF,  p-value: < 2.2e-16
# Extract coefficients from the multivariate linear regression model
lm_coefs <- coef(lm_model)
# Convert lasso_coef to a matrix with one row
lasso_coef_matrix <- as.matrix(lasso_coef)

# Convert lasso_coef_matrix to a regular matrix if it's not already in matrix form
lasso_coef_matrix <- as.matrix(lasso_coef_matrix)


# Convert lasso_coef_matrix to a regular matrix
lasso_coef_matrix <- as.matrix(lasso_coef_matrix)

# Combine all coefficient vectors into a data frame
df.comp <- data.frame(
  Linear = as.vector(lm_coefs),    # Coefficients from the multivariate linear regression model
  Lasso = as.vector(lasso_coef_matrix),  # Coefficients from the lasso regression model (converted to matrix)
  Ridge = as.vector(ridge_coef)   # Coefficients from the ridge regression model
)

# Print the coefficients dataframe
print(df.comp)
##           Linear       Lasso       Ridge
## 1  -1.483973e+07  6121384.17  6121384.17
## 2   4.520057e+04  6896306.68  6122851.88
## 3   5.871427e+03        0.00   115686.96
## 4  -1.153896e+04        0.00  -112623.76
## 5   2.100077e+00  4033290.13  4082979.12
## 6  -6.503166e+00        0.00    72562.61
## 7   8.233457e-01        0.00   258208.54
## 8   1.206935e+05        0.00   215037.74
## 9  -7.992534e+04        0.00   212661.70
## 10 -7.494807e+05        0.00   232800.16
## 11  3.548246e+06   900178.95   857457.45
## 12 -3.660241e+06 -1039301.39 -1067990.89
## 13 -4.388886e+05  -991622.69  -904010.06
## 14  4.376257e+05   883102.82  1020802.93
## 15  1.331867e+01        0.00   204002.68
## 16  2.682260e+05    18053.87   138318.79
## 17 -9.403814e+02        0.00   -58098.78
## 18 -4.573036e+03        0.00  -261420.59
## 19 -3.108226e+03        0.00  -195418.15
## 20 -4.043533e+03  -124236.76  -601532.40
## 21 -4.506785e+03  -283008.45  -793365.62
## 22 -5.807567e+02   610197.19   685487.71
## 23 -5.509304e+03  -129183.67  -530614.52
## 24 -2.126723e+03        0.00   -54513.94
## 25  6.067036e+02   876842.56   773975.32
## 26  1.110805e+03        0.00   383416.79
## 27 -2.562441e+02  -241903.95  -430992.14
## 28  1.658187e+03        0.00   126537.03
## 29 -2.796718e+03 -1097736.71 -1040881.48
## 30 -2.670147e+05        0.00     5774.95
## 31  7.711626e+05        0.00    68479.78
## 32  7.528854e+05        0.00   432183.72
## 33 -1.349811e+04        0.00  -131385.87
## 34 -1.049087e+04  -207854.07  -547013.51
## 35 -1.010864e+04  -136592.67  -237505.02
## 36 -4.350944e+03   425753.53   612758.29
## 37 -7.447363e+03        0.00   209384.50
## 38 -1.165058e+02   988541.39   938463.42
## 39 -8.128898e+03        0.00   239574.39
## 40 -1.987150e+01  -336579.02  -705520.18
## 41  3.094792e+05        0.00  -276963.77
## 42  2.842377e+05   651974.89   496516.36
## 43  4.302822e+02        0.00   366932.60
## 44 -9.546439e+02        0.00   107498.06
## 45 -1.370296e+03   -86112.85  -439942.54
## 46 -4.699641e+03  -572852.03  -598047.27
# Plotting coefficients
barplot(t(df.comp), beside = TRUE, col = c("blue", "green", "red"), 
        main = "Coefficients from Different Models", 
        xlab = "Variables", ylab = "Coefficients",
        legend.text = TRUE)
legend("topright", inset = 0.05, legend = colnames(df.comp), fill = c("blue", "green", "red"))

# Calculate the number of missing coefficients to fill
#missing_coef_count <- num_features - length(ridge_coefs)

# Create a vector with missing coefficients filled with NA
#if (missing_coef_count > 0) {
 # ridge_coefs <- c(ridge_coefs, rep(NA, missing_coef_count))
#}
# Split the data into training and testing sets (80% train, 20% test)
set.seed(42) # for reproducibility
train_index <- sample(1:nrow(df), 0.8 * nrow(df))
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Define the formula for the regression model
formula <- as.formula("`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `2023 Med HH Size` + `2023 Avg HH Size` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`")

# Fit the linear regression model
model <- lm(formula, data = train_data)

# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)

# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.43557
# Print the coefficients
coefficients <- coef(model)
print(coefficients)
##                     (Intercept)               `Number Of Units` 
##                    2.653295e+06                    4.843101e+04 
##                     `Land Area`               `2023 Households` 
##                    1.864141e+00                    2.549349e+02 
##              `2023 Med HH Size`              `2023 Avg HH Size` 
##                    4.127027e+06                   -5.124950e+06 
##            `HU Grwth 2010 2023`       `2023 Home Value 1000000` 
##                   -3.276234e+05                    8.484266e+03 
## `2023 Home Value 100000 200000` `2023 Home Value 200000 300000` 
##                   -7.919315e+02                   -2.016957e+03 
##           `Pop Grwth 2010 2023` 
##                    3.343593e+05
# Summary of the linear regression model
summary(model)
## 
## Call:
## lm(formula = formula, data = train_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -86271983  -2159783    160925   1964770 273147482 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.653e+06  3.232e+06   0.821  0.41186    
## `Number Of Units`                4.843e+04  3.524e+03  13.742  < 2e-16 ***
## `Land Area`                      1.864e+00  2.749e-01   6.782 2.15e-11 ***
## `2023 Households`                2.549e+02  3.582e+02   0.712  0.47687    
## `2023 Med HH Size`               4.127e+06  1.750e+06   2.358  0.01857 *  
## `2023 Avg HH Size`              -5.125e+06  1.600e+06  -3.203  0.00141 ** 
## `HU Grwth 2010 2023`            -3.276e+05  1.539e+05  -2.129  0.03353 *  
## `2023 Home Value 1000000`        8.484e+03  7.432e+03   1.142  0.25395    
## `2023 Home Value 100000 200000` -7.919e+02  1.402e+03  -0.565  0.57244    
## `2023 Home Value 200000 300000` -2.017e+03  1.581e+03  -1.275  0.20252    
## `Pop Grwth 2010 2023`            3.344e+05  1.596e+05   2.095  0.03649 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared:  0.4356, Adjusted R-squared:  0.4293 
## F-statistic:  69.3 on 10 and 898 DF,  p-value: < 2.2e-16
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)

# Calculate residuals
residuals <- residuals(model)

# Ensure both predictions and residuals have the same length
n <- min(length(predictions), length(residuals))
predictions <- predictions[1:n]
residuals <- residuals[1:n]

# Plotting residuals vs. fitted values
plot(predictions, residuals)
abline(h = 0, col = "red")  # Add a horizontal line at residual = 0

# Load necessary packages
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
ridge_coefs <- ridge_coefs[1:46]

# Combine all coefficient vectors into a dataframe
df.comp <- data.frame(
  Linear = lm_coefs,    # Coefficients from the multivariate linear regression model
  Lasso = lasso_coefs,  # Coefficients from the lasso regression model
  Ridge = ridge_coefs   # Coefficients from the ridge regression model
)

# Calculate VIF for each variable
vif_values <- sapply(df.comp, function(x) {
  if (is.numeric(x)) {
    vif(lm(x ~ ., data = df.comp))
  } else {
    NA
  }
})
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable

## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable

## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
# Combine VIF values into a dataframe
vif_df <- data.frame(VIF = vif_values)

# Print or use VIF dataframe as needed
print(vif_df)
##        VIF.Linear VIF.Lasso VIF.Ridge
## Linear   1.344512  1.344512  1.344512
## Lasso   36.387550 36.387550 36.387550
## Ridge   37.155174 37.155174 37.155174
summary(model)
## 
## Call:
## lm(formula = formula, data = train_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -86271983  -2159783    160925   1964770 273147482 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.653e+06  3.232e+06   0.821  0.41186    
## `Number Of Units`                4.843e+04  3.524e+03  13.742  < 2e-16 ***
## `Land Area`                      1.864e+00  2.749e-01   6.782 2.15e-11 ***
## `2023 Households`                2.549e+02  3.582e+02   0.712  0.47687    
## `2023 Med HH Size`               4.127e+06  1.750e+06   2.358  0.01857 *  
## `2023 Avg HH Size`              -5.125e+06  1.600e+06  -3.203  0.00141 ** 
## `HU Grwth 2010 2023`            -3.276e+05  1.539e+05  -2.129  0.03353 *  
## `2023 Home Value 1000000`        8.484e+03  7.432e+03   1.142  0.25395    
## `2023 Home Value 100000 200000` -7.919e+02  1.402e+03  -0.565  0.57244    
## `2023 Home Value 200000 300000` -2.017e+03  1.581e+03  -1.275  0.20252    
## `Pop Grwth 2010 2023`            3.344e+05  1.596e+05   2.095  0.03649 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared:  0.4356, Adjusted R-squared:  0.4293 
## F-statistic:  69.3 on 10 and 898 DF,  p-value: < 2.2e-16
# Fit the linear regression model
model <- lm(formula, data = train_data)

# Check the significance and direction of coefficients
summary_model <- summary(model)

# Identify statistically insignificant variables with high p-values
insignificant_vars <- names(summary_model$coefficients[,"Pr(>|t|)"][summary_model$coefficients[,"Pr(>|t|)"] > 0.05])

# Drop statistically insignificant variables from both training and testing datasets
train_data <- train_data[, !names(train_data) %in% insignificant_vars]
test_data <- test_data[, !names(test_data) %in% insignificant_vars]

# Refit the model with updated variables
model <- lm(formula, data = train_data)

# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)

# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.43557
# Print the coefficients
coefficients <- coef(model)
print(coefficients)
##                     (Intercept)               `Number Of Units` 
##                    2.653295e+06                    4.843101e+04 
##                     `Land Area`               `2023 Households` 
##                    1.864141e+00                    2.549349e+02 
##              `2023 Med HH Size`              `2023 Avg HH Size` 
##                    4.127027e+06                   -5.124950e+06 
##            `HU Grwth 2010 2023`       `2023 Home Value 1000000` 
##                   -3.276234e+05                    8.484266e+03 
## `2023 Home Value 100000 200000` `2023 Home Value 200000 300000` 
##                   -7.919315e+02                   -2.016957e+03 
##           `Pop Grwth 2010 2023` 
##                    3.343593e+05
# Plot the coefficients
plot(model)

# Print the summary
summary(model)
## 
## Call:
## lm(formula = formula, data = train_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -86271983  -2159783    160925   1964770 273147482 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.653e+06  3.232e+06   0.821  0.41186    
## `Number Of Units`                4.843e+04  3.524e+03  13.742  < 2e-16 ***
## `Land Area`                      1.864e+00  2.749e-01   6.782 2.15e-11 ***
## `2023 Households`                2.549e+02  3.582e+02   0.712  0.47687    
## `2023 Med HH Size`               4.127e+06  1.750e+06   2.358  0.01857 *  
## `2023 Avg HH Size`              -5.125e+06  1.600e+06  -3.203  0.00141 ** 
## `HU Grwth 2010 2023`            -3.276e+05  1.539e+05  -2.129  0.03353 *  
## `2023 Home Value 1000000`        8.484e+03  7.432e+03   1.142  0.25395    
## `2023 Home Value 100000 200000` -7.919e+02  1.402e+03  -0.565  0.57244    
## `2023 Home Value 200000 300000` -2.017e+03  1.581e+03  -1.275  0.20252    
## `Pop Grwth 2010 2023`            3.344e+05  1.596e+05   2.095  0.03649 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared:  0.4356, Adjusted R-squared:  0.4293 
## F-statistic:  69.3 on 10 and 898 DF,  p-value: < 2.2e-16
# Fit the linear regression model after removing '2023 Med HH Size' and '2023 Avg HH Size'
model <- lm(`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`, data = train_data)

# Check the significance and direction of coefficients
summary_model <- summary(model)

# Identify statistically insignificant variables with high p-values
insignificant_vars <- names(summary_model$coefficients[,"Pr(>|t|)"][summary_model$coefficients[,"Pr(>|t|)"] > 0.05])

# Drop statistically insignificant variables from both training and testing datasets
train_data <- train_data[, !names(train_data) %in% insignificant_vars]
test_data <- test_data[, !names(test_data) %in% insignificant_vars]

# Refit the model with updated variables
model <- lm(`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`, data = train_data)

# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)

# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.4290564
# Plot the coefficients
plot(model)

# Print the summary
summary(model)
## 
## Call:
## lm(formula = `Last Sale Price` ~ `Number Of Units` + `Land Area` + 
##     `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + 
##     `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + 
##     `Pop Grwth 2010 2023`, data = train_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -88569221  -1889597    275478   1586777 274383145 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -1.336e+06  7.885e+05  -1.695   0.0904 .  
## `Number Of Units`                4.842e+04  3.535e+03  13.696  < 2e-16 ***
## `Land Area`                      1.974e+00  2.740e-01   7.207 1.21e-12 ***
## `2023 Households`                3.508e+02  3.582e+02   0.979   0.3277    
## `HU Grwth 2010 2023`            -3.306e+05  1.546e+05  -2.139   0.0327 *  
## `2023 Home Value 1000000`        9.325e+03  7.442e+03   1.253   0.2105    
## `2023 Home Value 100000 200000` -6.595e+02  1.404e+03  -0.470   0.6387    
## `2023 Home Value 200000 300000` -2.428e+03  1.553e+03  -1.564   0.1182    
## `Pop Grwth 2010 2023`            3.339e+05  1.604e+05   2.082   0.0376 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12310000 on 900 degrees of freedom
## Multiple R-squared:  0.4291, Adjusted R-squared:  0.424 
## F-statistic: 84.54 on 8 and 900 DF,  p-value: < 2.2e-16

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.