This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory / Take out the trash
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 692981 37.1 1327144 70.9 1327144 70.9
## Vcells 1246182 9.6 8388608 64.0 1996585 15.3
cat("\f") # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device
## 1
library(readxl)
df <- read_excel("DataDF.xlsx")
## New names:
## • `2023 Pop Wrk Trav Time 60 Min` -> `2023 Pop Wrk Trav Time 60 Min...45`
## • `2023 Pop Wrk Trav Time 60 Min` -> `2023 Pop Wrk Trav Time 60 Min...46`
df <- as.data.frame(df)
glimpse(df)
## Rows: 1,137
## Columns: 46
## $ `Number Of Units` <dbl> 25, 8, 18, 17, 13, 125, 236, …
## $ `Year Built` <dbl> 1984, 1950, 1990, 2007, 1977,…
## $ Vacancy <dbl> 7.04, 10.26, 6.31, 4.61, 6.29…
## $ `Land Area` <dbl> 108900, 277477, 67965, 174240…
## $ `Last Sale Price` <dbl> 3100000, 800000, 1000000, 600…
## $ `House Maint And Repair 2023 Cons Spdng` <dbl> 2315630, 178754, 7470769, 180…
## $ `Household Operations 2023 Cons Spdng` <dbl> 2875112, 197441, 8760263, 158…
## $ `HH Grwth 2010 2023` <dbl> 1.17, 46.67, 2.66, 3.29, 0.00…
## $ `HH Grwth 2023 2028` <dbl> 9.88, 7.91, -1.30, -0.64, 0.0…
## $ `2023 Households` <dbl> 2571, 139, 8107, 156, 21, 832…
## $ `2023 Med HH Size` <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ `2023 Avg HH Size` <dbl> 2.3, 3.0, 2.0, 2.1, 2.2, 2.1,…
## $ `HU Grwth 2010 2023` <dbl> 8.03, 51.09, 5.03, 10.64, -25…
## $ `2023 Avg HU Size` <dbl> 4, 2, 9, 1, 1, 7, 9, 2, 17, 3…
## $ `2023 Avg HU Value` <dbl> 215342, 225340, 240448, 15709…
## $ `2023 Avg Yr Built` <dbl> 1971, 2003, 1974, 1984, 1984,…
## $ `2023 Group Quarters` <dbl> 491, 0, 130, 0, 0, 62, 209, 1…
## $ `2023 Home Blt 1940 1949` <dbl> 849, 1, 763, 7, 4, 583, 363, …
## $ `2023 Home Blt 1950 1959` <dbl> 381, 2, 1500, 4, 1, 2772, 217…
## $ `2023 Home Blt 1960 1969` <dbl> 277, 2, 1854, 7, 3, 1745, 178…
## $ `2023 Home Blt 1970 1979` <dbl> 247, 8, 3048, 83, 5, 1907, 26…
## $ `2023 Home Blt 1980 1989` <dbl> 310, 8, 1360, 52, 6, 786, 104…
## $ `2023 Home Blt 1990 1999` <dbl> 284, 12, 268, 40, 8, 414, 245…
## $ `2023 Home Blt 2000 2010` <dbl> 372, 72, 917, 21, 10, 56, 135…
## $ `2023 Home Blt 2010` <dbl> 145, 49, 283, 8, 0, 1220, 0, …
## $ `2023 HU 1 Unit` <dbl> 1985, 120, 3791, 77, 16, 6281…
## $ `2023 HU 20 Units` <dbl> 88, 3, 1353, 0, 0, 1658, 1695…
## $ `2023 HU 2 4 Units` <dbl> 127, 0, 926, 0, 0, 458, 435, …
## $ `2023 HU 5 19 Units` <dbl> 432, 5, 1522, 0, 0, 546, 948,…
## $ `2023 Med Yr Built` <dbl> 1966, 2006, 1972, 1981, 1988,…
## $ `2023 Owner Occd Housing` <dbl> 1343, 103, 4289, 124, 18, 631…
## $ `2023 Renter Occd Housing` <dbl> 1229, 36, 3818, 32, 3, 2012, …
## $ `2023 Home Value 1000000` <dbl> 3, 0, 106, 0, 4, 15, 117, 0, …
## $ `2023 Home Value 100000 200000` <dbl> 514, 28, 919, 29, 4, 1958, 12…
## $ `2023 Home Value 200000 300000` <dbl> 341, 56, 671, 3, 2, 1352, 133…
## $ `2023 Home Value 300000 400000` <dbl> 106, 12, 579, 2, 2, 730, 543,…
## $ `2023 Home Value 400000 500000` <dbl> 105, 1, 80, 0, 0, 355, 159, 1…
## $ `2023 Home Value 500000 1000000` <dbl> 33, 0, 361, 12, 0, 324, 624, …
## $ `2023 Home Value 100000` <dbl> 242, 6, 1573, 78, 5, 1575, 65…
## $ `2023 Median Home Value` <dbl> 183657, 231250, 162187, 79486…
## $ `Pop Grwth 2010 2023` <dbl> 9.44, 51.61, 4.49, 10.56, -26…
## $ `Pop Grwth 2023 2028` <dbl> 10.34, 8.04, -1.39, 0.00, -1.…
## $ `2023 Population` <dbl> 6479, 423, 16666, 335, 51, 17…
## $ `2023 Pop Wrk Trav Time 30 Min` <dbl> 1882, 104, 5237, 79, 11, 6442…
## $ `2023 Pop Wrk Trav Time 60 Min...45` <dbl> 476, 93, 2031, 49, 2, 1838, 1…
## $ `2023 Pop Wrk Trav Time 60 Min...46` <dbl> 202, 21, 377, 4, 2, 234, 274,…
library(dplyr)
library(ggcorrplot)
## Loading required package: ggplot2
?ggcorrplot()
?cor
?cor_pmat # cor_pmat(): Compute a correlation matrix p-values.
mycorr<- cor(x = df, use = "pairwise.complete.obs",method = c("spearman"))
p.mat <- ggcorrplot::cor_pmat(x = df)
# head(p.mat)
library(ggcorrplot)
myplot<-ggcorrplot(corr = mycorr, # correlation matrix to visualize
method = "square", # character, the visualization method of correlation matrix to be used. Allowed values are "square" (default), "circle"
type = "lower", # character, "full" (default), "lower" or "upper" display
title = "Correlation Plot", # character, title of the graph
colors = c("red", "white","green"), # vector of 3 colors for low, mid and high correlation values.
lab = TRUE, # If TRUE, add corr coeff on the plot.
lab_size = 2, # labels. used when lab = TRUE.
p.mat = p.mat, # matrix of p-value. If NULL, arguments sig.level, insig, pch, pch.col, pch.cex is invalid. # Barring the no significant coefficient
insig = "pch", # character, specialized insignificant correlation coefficients, "pch" (default), "blank". If "blank", wipe away the corresponding glyphs; if "pch", add characters (see pch for details) on corresponding glyphs.
pch = 4, # add character on the glyphs of insignificant correlation coefficients (only valid when insig is "pch"). Default value is 4.
hc.order = TRUE, # If TRUE, correlation matrix will be hc.ordered using hclust function.
tl.cex = 8, # the size, the color and the string rotation of text label
tl.col = "black",
digits = 2
)
myplot
# Load necessary library
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(stargazer,quietly = T)
stargazer(df,
type = "text",
title = "Summary Statistics",
notes = "Data Source: CoStar, n= 46",
digits = 2,
omit.summary.stat = "n",
out = "my_table.tex")
##
## Summary Statistics
## ====================================================================================
## Statistic Mean St. Dev. Min Max
## ------------------------------------------------------------------------------------
## Number Of Units 119.37 158.94 1 1,519
## Year Built 1,970.41 18.41 1,900 2,022
## Vacancy 6.20 5.56 0.06 100.00
## Land Area 869,519.80 1,862,579.00 1 25,501,431
## Last Sale Price 6,121,384.00 15,591,909.00 2,600 363,125,000
## House Maint And Repair 2023 Cons Spdng 3,201,500.00 2,408,647.00 0 12,607,149
## Household Operations 2023 Cons Spdng 3,457,632.00 2,645,364.00 0 18,560,824
## HH Grwth 2010 2023 12.45 17.91 -24.46 157.93
## HH Grwth 2023 2028 2.78 3.86 -10.79 17.24
## 2023 Households 3,034.50 2,401.24 0 12,942
## 2023 Med HH Size 2.10 0.33 0 4
## 2023 Avg HH Size 2.41 0.36 0.00 4.10
## HU Grwth 2010 2023 17.45 23.32 -49.33 188.33
## 2023 Avg HU Size 4.59 3.58 0 19
## 2023 Avg HU Value 227,786.60 96,678.88 0 764,965
## 2023 Avg Yr Built 1,980.33 83.62 0 2,008
## 2023 Group Quarters 104.62 233.94 0 3,287
## 2023 Home Blt 1940 1949 169.37 309.81 0 4,433
## 2023 Home Blt 1950 1959 345.57 512.42 0 3,297
## 2023 Home Blt 1960 1969 489.47 599.10 0 4,157
## 2023 Home Blt 1970 1979 770.10 826.32 0 5,872
## 2023 Home Blt 1980 1989 688.46 619.67 0 4,214
## 2023 Home Blt 1990 1999 391.28 359.37 0 3,347
## 2023 Home Blt 2000 2010 390.95 409.31 0 3,539
## 2023 Home Blt 2010 335.47 407.24 0 2,570
## 2023 HU 1 Unit 1,732.25 1,280.85 0 8,026
## 2023 HU 20 Units 387.00 724.56 0 7,773
## 2023 HU 2 4 Units 293.79 391.61 0 2,743
## 2023 HU 5 19 Units 420.39 621.17 0 4,326
## 2023 Med Yr Built 1,979.39 83.81 0 2,010
## 2023 Owner Occd Housing 1,860.25 1,393.00 0 7,443
## 2023 Renter Occd Housing 1,174.25 1,233.33 0 8,801
## 2023 Home Value 1000000 24.79 60.63 0 551
## 2023 Home Value 100000 200000 531.79 500.64 0 3,138
## 2023 Home Value 200000 300000 409.99 379.43 0 2,179
## 2023 Home Value 300000 400000 198.22 237.68 0 1,649
## 2023 Home Value 400000 500000 75.17 119.11 0 1,179
## 2023 Home Value 500000 1000000 100.51 144.80 0 1,829
## 2023 Home Value 100000 519.80 585.55 0 4,566
## 2023 Median Home Value 188,513.00 92,171.90 0 767,973
## Pop Grwth 2010 2023 17.37 22.50 -48.46 189.24
## Pop Grwth 2023 2028 2.80 3.81 -8.96 17.13
## 2023 Population 7,401.47 5,884.95 0 32,505
## 2023 Pop Wrk Trav Time 30 Min 2,057.95 1,764.77 0 8,511
## 2023 Pop Wrk Trav Time 60 Min...45 988.64 1,018.90 0 7,965
## 2023 Pop Wrk Trav Time 60 Min...46 245.87 287.87 0 2,940
## ------------------------------------------------------------------------------------
## Data Source: CoStar, n= 46
# Extract numeric values from the dataframe
X <- as.matrix(df[sapply(df, is.numeric)])
# Before standardization
colMeans(X) # Mean of each column
## Number Of Units Year Built
## 1.193685e+02 1.970414e+03
## Vacancy Land Area
## NA 8.695198e+05
## Last Sale Price House Maint And Repair 2023 Cons Spdng
## 6.121384e+06 3.201500e+06
## Household Operations 2023 Cons Spdng HH Grwth 2010 2023
## 3.457632e+06 1.244945e+01
## HH Grwth 2023 2028 2023 Households
## 2.775743e+00 3.034502e+03
## 2023 Med HH Size 2023 Avg HH Size
## 2.098505e+00 2.413632e+00
## HU Grwth 2010 2023 2023 Avg HU Size
## 1.744588e+01 4.585752e+00
## 2023 Avg HU Value 2023 Avg Yr Built
## 2.277866e+05 1.980332e+03
## 2023 Group Quarters 2023 Home Blt 1940 1949
## 1.046218e+02 1.693685e+02
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969
## 3.455734e+02 4.894670e+02
## 2023 Home Blt 1970 1979 2023 Home Blt 1980 1989
## 7.701047e+02 6.884617e+02
## 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 3.912832e+02 3.909525e+02
## 2023 Home Blt 2010 2023 HU 1 Unit
## 3.354697e+02 1.732252e+03
## 2023 HU 20 Units 2023 HU 2 4 Units
## 3.869956e+02 2.937863e+02
## 2023 HU 5 19 Units 2023 Med Yr Built
## 4.203896e+02 1.979392e+03
## 2023 Owner Occd Housing 2023 Renter Occd Housing
## 1.860248e+03 1.174253e+03
## 2023 Home Value 1000000 2023 Home Value 100000 200000
## 2.479332e+01 5.317854e+02
## 2023 Home Value 200000 300000 2023 Home Value 300000 400000
## 4.099894e+02 1.982172e+02
## 2023 Home Value 400000 500000 2023 Home Value 500000 1000000
## 7.517326e+01 1.005084e+02
## 2023 Home Value 100000 2023 Median Home Value
## 5.197977e+02 1.885130e+05
## Pop Grwth 2010 2023 Pop Grwth 2023 2028
## 1.737237e+01 2.804186e+00
## 2023 Population 2023 Pop Wrk Trav Time 30 Min
## 7.401467e+03 2.057947e+03
## 2023 Pop Wrk Trav Time 60 Min...45 2023 Pop Wrk Trav Time 60 Min...46
## 9.886376e+02 2.458698e+02
apply(X = X,
MARGIN = 2,
FUN = sd
) # standard deviation
## Number Of Units Year Built
## 1.589372e+02 1.841338e+01
## Vacancy Land Area
## NA 1.862579e+06
## Last Sale Price House Maint And Repair 2023 Cons Spdng
## 1.559191e+07 2.408647e+06
## Household Operations 2023 Cons Spdng HH Grwth 2010 2023
## 2.645364e+06 1.791372e+01
## HH Grwth 2023 2028 2023 Households
## 3.855609e+00 2.401237e+03
## 2023 Med HH Size 2023 Avg HH Size
## 3.263211e-01 3.619425e-01
## HU Grwth 2010 2023 2023 Avg HU Size
## 2.331966e+01 3.582514e+00
## 2023 Avg HU Value 2023 Avg Yr Built
## 9.667888e+04 8.362326e+01
## 2023 Group Quarters 2023 Home Blt 1940 1949
## 2.339396e+02 3.098076e+02
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969
## 5.124212e+02 5.991007e+02
## 2023 Home Blt 1970 1979 2023 Home Blt 1980 1989
## 8.263159e+02 6.196659e+02
## 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 3.593700e+02 4.093120e+02
## 2023 Home Blt 2010 2023 HU 1 Unit
## 4.072429e+02 1.280847e+03
## 2023 HU 20 Units 2023 HU 2 4 Units
## 7.245635e+02 3.916130e+02
## 2023 HU 5 19 Units 2023 Med Yr Built
## 6.211725e+02 8.381302e+01
## 2023 Owner Occd Housing 2023 Renter Occd Housing
## 1.393002e+03 1.233330e+03
## 2023 Home Value 1000000 2023 Home Value 100000 200000
## 6.062964e+01 5.006420e+02
## 2023 Home Value 200000 300000 2023 Home Value 300000 400000
## 3.794259e+02 2.376790e+02
## 2023 Home Value 400000 500000 2023 Home Value 500000 1000000
## 1.191127e+02 1.447951e+02
## 2023 Home Value 100000 2023 Median Home Value
## 5.855536e+02 9.217190e+04
## Pop Grwth 2010 2023 Pop Grwth 2023 2028
## 2.250016e+01 3.809244e+00
## 2023 Population 2023 Pop Wrk Trav Time 30 Min
## 5.884947e+03 1.764772e+03
## 2023 Pop Wrk Trav Time 60 Min...45 2023 Pop Wrk Trav Time 60 Min...46
## 1.018898e+03 2.878651e+02
# # scale : mean = 0, std=1
# ?scale
# X = scale(x = X)
#
# # after standardization
# colMeans(x = X) # mean ~ 0
#
apply(X = X,
MARGIN = 2,
FUN = sd
) # standard deviation = 1
## Number Of Units Year Built
## 1.589372e+02 1.841338e+01
## Vacancy Land Area
## NA 1.862579e+06
## Last Sale Price House Maint And Repair 2023 Cons Spdng
## 1.559191e+07 2.408647e+06
## Household Operations 2023 Cons Spdng HH Grwth 2010 2023
## 2.645364e+06 1.791372e+01
## HH Grwth 2023 2028 2023 Households
## 3.855609e+00 2.401237e+03
## 2023 Med HH Size 2023 Avg HH Size
## 3.263211e-01 3.619425e-01
## HU Grwth 2010 2023 2023 Avg HU Size
## 2.331966e+01 3.582514e+00
## 2023 Avg HU Value 2023 Avg Yr Built
## 9.667888e+04 8.362326e+01
## 2023 Group Quarters 2023 Home Blt 1940 1949
## 2.339396e+02 3.098076e+02
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969
## 5.124212e+02 5.991007e+02
## 2023 Home Blt 1970 1979 2023 Home Blt 1980 1989
## 8.263159e+02 6.196659e+02
## 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 3.593700e+02 4.093120e+02
## 2023 Home Blt 2010 2023 HU 1 Unit
## 4.072429e+02 1.280847e+03
## 2023 HU 20 Units 2023 HU 2 4 Units
## 7.245635e+02 3.916130e+02
## 2023 HU 5 19 Units 2023 Med Yr Built
## 6.211725e+02 8.381302e+01
## 2023 Owner Occd Housing 2023 Renter Occd Housing
## 1.393002e+03 1.233330e+03
## 2023 Home Value 1000000 2023 Home Value 100000 200000
## 6.062964e+01 5.006420e+02
## 2023 Home Value 200000 300000 2023 Home Value 300000 400000
## 3.794259e+02 2.376790e+02
## 2023 Home Value 400000 500000 2023 Home Value 500000 1000000
## 1.191127e+02 1.447951e+02
## 2023 Home Value 100000 2023 Median Home Value
## 5.855536e+02 9.217190e+04
## Pop Grwth 2010 2023 Pop Grwth 2023 2028
## 2.250016e+01 3.809244e+00
## 2023 Population 2023 Pop Wrk Trav Time 30 Min
## 5.884947e+03 1.764772e+03
## 2023 Pop Wrk Trav Time 60 Min...45 2023 Pop Wrk Trav Time 60 Min...46
## 1.018898e+03 2.878651e+02
library(tidyr)
# Replace missing values with 1
df_cleaned <- df %>% replace(is.na(.), 1)
# Verify the cleaned dataframe
head(df_cleaned)
## Number Of Units Year Built Vacancy Land Area Last Sale Price
## 1 25 1984 7.04 108900 3100000
## 2 8 1950 10.26 277477 800000
## 3 18 1990 6.31 67965 1000000
## 4 17 2007 4.61 174240 600000
## 5 13 1977 6.29 348480 750000
## 6 125 1936 6.04 337154 9500000
## House Maint And Repair 2023 Cons Spdng Household Operations 2023 Cons Spdng
## 1 2315630 2875112
## 2 178754 197441
## 3 7470769 8760263
## 4 180919 158527
## 5 26145 21370
## 6 11054731 11175881
## HH Grwth 2010 2023 HH Grwth 2023 2028 2023 Households 2023 Med HH Size
## 1 1.17 9.88 2571 2
## 2 46.67 7.91 139 3
## 3 2.66 -1.30 8107 2
## 4 3.29 -0.64 156 2
## 5 0.00 0.00 21 2
## 6 13.91 0.40 8322 2
## 2023 Avg HH Size HU Grwth 2010 2023 2023 Avg HU Size 2023 Avg HU Value
## 1 2.3 8.03 4 215342
## 2 3.0 51.09 2 225340
## 3 2.0 5.03 9 240448
## 4 2.1 10.64 1 157097
## 5 2.2 -25.00 1 394118
## 6 2.1 15.95 7 222175
## 2023 Avg Yr Built 2023 Group Quarters 2023 Home Blt 1940 1949
## 1 1971 491 849
## 2 2003 0 1
## 3 1974 130 763
## 4 1984 0 7
## 5 1984 0 4
## 6 1972 62 583
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969 2023 Home Blt 1970 1979
## 1 381 277 247
## 2 2 2 8
## 3 1500 1854 3048
## 4 4 7 83
## 5 1 3 5
## 6 2772 1745 1907
## 2023 Home Blt 1980 1989 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 1 310 284 372
## 2 8 12 72
## 3 1360 268 917
## 4 52 40 21
## 5 6 8 10
## 6 786 414 56
## 2023 Home Blt 2010 2023 HU 1 Unit 2023 HU 20 Units 2023 HU 2 4 Units
## 1 145 1985 88 127
## 2 49 120 3 0
## 3 283 3791 1353 926
## 4 8 77 0 0
## 5 0 16 0 0
## 6 1220 6281 1658 458
## 2023 HU 5 19 Units 2023 Med Yr Built 2023 Owner Occd Housing
## 1 432 1966 1343
## 2 5 2006 103
## 3 1522 1972 4289
## 4 0 1981 124
## 5 0 1988 18
## 6 546 1967 6310
## 2023 Renter Occd Housing 2023 Home Value 1000000
## 1 1229 3
## 2 36 0
## 3 3818 106
## 4 32 0
## 5 3 4
## 6 2012 15
## 2023 Home Value 100000 200000 2023 Home Value 200000 300000
## 1 514 341
## 2 28 56
## 3 919 671
## 4 29 3
## 5 4 2
## 6 1958 1352
## 2023 Home Value 300000 400000 2023 Home Value 400000 500000
## 1 106 105
## 2 12 1
## 3 579 80
## 4 2 0
## 5 2 0
## 6 730 355
## 2023 Home Value 500000 1000000 2023 Home Value 100000 2023 Median Home Value
## 1 33 242 183657
## 2 0 6 231250
## 3 361 1573 162187
## 4 12 78 79486
## 5 0 5 187499
## 6 324 1575 180668
## Pop Grwth 2010 2023 Pop Grwth 2023 2028 2023 Population
## 1 9.44 10.34 6479
## 2 51.61 8.04 423
## 3 4.49 -1.39 16666
## 4 10.56 0.00 335
## 5 -26.09 -1.96 51
## 6 14.49 0.17 17448
## 2023 Pop Wrk Trav Time 30 Min 2023 Pop Wrk Trav Time 60 Min...45
## 1 1882 476
## 2 104 93
## 3 5237 2031
## 4 79 49
## 5 11 2
## 6 6442 1838
## 2023 Pop Wrk Trav Time 60 Min...46
## 1 202
## 2 21
## 3 377
## 4 4
## 5 2
## 6 234
# Load necessary library
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
# Assuming df contains your predictors and 'last_sale_price' is your target variable
# Extract the target variable y (Last Sale Price)
y <- df_cleaned$`Last Sale Price`
# Prepare the predictor variables (excluding the target variable)
X <- df_cleaned[, !colnames(df_cleaned) %in% c("Last Sale Price")]
# Standardize the predictor variables
X_scaled <- scale(X)
# Fit a Lasso model
lasso_model <- cv.glmnet(X_scaled, y, alpha = 1) # alpha = 1 for Lasso
# Get the selected lambda (regularization strength)
best_lambda <- lasso_model$lambda.min
# Extract the coefficients for the selected lambda
lasso_coef <- coef(lasso_model, s = best_lambda)
# Print the coefficients
print(lasso_coef)
## 46 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 6121384.17
## Number Of Units 6896306.68
## Year Built .
## Vacancy .
## Land Area 4033290.13
## House Maint And Repair 2023 Cons Spdng .
## Household Operations 2023 Cons Spdng .
## HH Grwth 2010 2023 .
## HH Grwth 2023 2028 .
## 2023 Households .
## 2023 Med HH Size 900178.95
## 2023 Avg HH Size -1039301.39
## HU Grwth 2010 2023 -991622.69
## 2023 Avg HU Size 883102.82
## 2023 Avg HU Value .
## 2023 Avg Yr Built 18053.87
## 2023 Group Quarters .
## 2023 Home Blt 1940 1949 .
## 2023 Home Blt 1950 1959 .
## 2023 Home Blt 1960 1969 -124236.76
## 2023 Home Blt 1970 1979 -283008.45
## 2023 Home Blt 1980 1989 610197.19
## 2023 Home Blt 1990 1999 -129183.67
## 2023 Home Blt 2000 2010 .
## 2023 Home Blt 2010 876842.56
## 2023 HU 1 Unit .
## 2023 HU 20 Units -241903.95
## 2023 HU 2 4 Units .
## 2023 HU 5 19 Units -1097736.71
## 2023 Med Yr Built .
## 2023 Owner Occd Housing .
## 2023 Renter Occd Housing .
## 2023 Home Value 1000000 .
## 2023 Home Value 100000 200000 -207854.07
## 2023 Home Value 200000 300000 -136592.67
## 2023 Home Value 300000 400000 425753.53
## 2023 Home Value 400000 500000 .
## 2023 Home Value 500000 1000000 988541.39
## 2023 Home Value 100000 .
## 2023 Median Home Value -336579.02
## Pop Grwth 2010 2023 .
## Pop Grwth 2023 2028 651974.89
## 2023 Population .
## 2023 Pop Wrk Trav Time 30 Min .
## 2023 Pop Wrk Trav Time 60 Min...45 -86112.85
## 2023 Pop Wrk Trav Time 60 Min...46 -572852.03
library(glmnet)
# Fit a Ridge model
ridge_model <- cv.glmnet(X_scaled, y, alpha = 0) # alpha = 0 for Ridge
# Get the selected lambda (regularization strength)
best_lambda <- ridge_model$lambda.min
# Extract the coefficients for the selected lambda
ridge_coef <- coef(ridge_model, s = best_lambda)
# Print the coefficients
print(ridge_coef)
## 46 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 6121384.17
## Number Of Units 6122851.88
## Year Built 115686.96
## Vacancy -112623.76
## Land Area 4082979.12
## House Maint And Repair 2023 Cons Spdng 72562.61
## Household Operations 2023 Cons Spdng 258208.54
## HH Grwth 2010 2023 215037.74
## HH Grwth 2023 2028 212661.70
## 2023 Households 232800.16
## 2023 Med HH Size 857457.45
## 2023 Avg HH Size -1067990.89
## HU Grwth 2010 2023 -904010.06
## 2023 Avg HU Size 1020802.93
## 2023 Avg HU Value 204002.68
## 2023 Avg Yr Built 138318.79
## 2023 Group Quarters -58098.78
## 2023 Home Blt 1940 1949 -261420.59
## 2023 Home Blt 1950 1959 -195418.15
## 2023 Home Blt 1960 1969 -601532.40
## 2023 Home Blt 1970 1979 -793365.62
## 2023 Home Blt 1980 1989 685487.71
## 2023 Home Blt 1990 1999 -530614.52
## 2023 Home Blt 2000 2010 -54513.94
## 2023 Home Blt 2010 773975.32
## 2023 HU 1 Unit 383416.79
## 2023 HU 20 Units -430992.14
## 2023 HU 2 4 Units 126537.03
## 2023 HU 5 19 Units -1040881.48
## 2023 Med Yr Built 5774.95
## 2023 Owner Occd Housing 68479.78
## 2023 Renter Occd Housing 432183.72
## 2023 Home Value 1000000 -131385.87
## 2023 Home Value 100000 200000 -547013.51
## 2023 Home Value 200000 300000 -237505.02
## 2023 Home Value 300000 400000 612758.29
## 2023 Home Value 400000 500000 209384.50
## 2023 Home Value 500000 1000000 938463.42
## 2023 Home Value 100000 239574.39
## 2023 Median Home Value -705520.18
## Pop Grwth 2010 2023 -276963.77
## Pop Grwth 2023 2028 496516.36
## 2023 Population 366932.60
## 2023 Pop Wrk Trav Time 30 Min 107498.06
## 2023 Pop Wrk Trav Time 60 Min...45 -439942.54
## 2023 Pop Wrk Trav Time 60 Min...46 -598047.27
# Extract non-zero coefficients from Lasso model
#lasso_coefs <- coef(lasso_model, s = "lambda.min")[-1] # Exclude intercept
#keep_X <- which(lasso_coefs != 0)
# Subset predictor variables based on non-zero coefficients from Lasso
#X_H <- X_scaled[, keep_X]
# Perform Linear regression
#li.eq <- summary(lm(y ~ X_H))
#
# Perform Ridge regression using lambda from Lasso
#ri.eq <- glmnet(x = X_scaled, y = y, lambda = lasso_model$lambda.min, family = "gaussian", intercept = FALSE, alpha = 0)
# Ensure all coefficients have the same length
#num_features <- max(length(beta), ncol(X_H), length(lasso_coefs), ncol(coef(ri.eq)))
# Create vectors with the same length
#beta <- c(beta, rep(NA, num_features - length(beta)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(coef(ri.eq)[, 1], rep(NA, num_features - ncol(coef(ri.eq))))
# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), ncol(coef(ri.eq)))
# Create vectors with the same length and fill missing values with NA
#beta <- c(beta, rep(NA, num_features - length(beta)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
# Extract Ridge coefficients and fill missing values with NA
#ridge_coefs <- predict(ri.eq, s = lasso_model$lambda.min, type = "coefficients")[, 1]
#ridge_coefs <- as.vector(ridge_coefs)
# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), length(ridge_coefs))
# Create vectors with the same length and fill missing values with NA
#beta <- c(beta, rep(NA, num_features - length(beta)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
#li_eq_coefs <- c(li.eq$coefficients, rep(NA, num_features - length(li.eq$coefficients)))
# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), length(ridge_coefs))
# Pad coefficient vectors with NA if needed
#beta <- c(beta, rep(NA, num_features - length(beta)))
#li_eq_coefs <- c(li.eq$coefficients, rep(NA, num_features - length(li.eq$coefficients)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
# Ensure all coefficients have the same length
#num_features <- max(length(beta), length(li_eq_coefs), length(lasso_coefs), length(ridge_coefs))
#
# Create vectors with the same length and fill missing values with NA
#beta <- c(beta, rep(NA, num_features - length(beta)))
#li_eq_coefs <- c(li_eq_coefs, rep(NA, num_features - length(li_eq_coefs)))
#lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
#ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
# Combine coefficient vectors into a matrix
#coef_matrix <- cbind(beta, li_eq_coefs, lasso_coefs, ridge_coefs)
# Convert the matrix to a dataframe
#df.comp <- as.data.frame(coef_matrix)
#
# Rename the columns
#colnames(df.comp) <- c("beta", "Linear", "Lasso", "Ridge")
#
# Print the coefficients dataframe
#print(df.comp)
lasso_coefs <- lasso_coef[1:46]
ridge_coefs <- ridge_coef[1:46]
# Fit a multivariate linear regression model
lm_model <- lm(`Last Sale Price` ~ ., data = df)
# Print the summary of the regression model
summary(lm_model)
##
## Call:
## lm(formula = `Last Sale Price` ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82623712 -2597135 -14414 2321687 268637835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.484e+07 4.080e+07 -0.364 0.71611
## `Number Of Units` 4.520e+04 3.139e+03 14.398 < 2e-16
## `Year Built` 5.871e+03 2.023e+04 0.290 0.77166
## Vacancy -1.154e+04 6.431e+04 -0.179 0.85763
## `Land Area` 2.100e+00 2.577e-01 8.148 1.01e-15
## `House Maint And Repair 2023 Cons Spdng` -6.503e+00 9.739e+00 -0.668 0.50443
## `Household Operations 2023 Cons Spdng` 8.233e-01 1.991e+00 0.414 0.67925
## `HH Grwth 2010 2023` 1.207e+05 5.516e+04 2.188 0.02886
## `HH Grwth 2023 2028` -7.993e+04 1.190e+06 -0.067 0.94648
## `2023 Households` -7.495e+05 7.146e+05 -1.049 0.29449
## `2023 Med HH Size` 3.548e+06 1.563e+06 2.270 0.02337
## `2023 Avg HH Size` -3.660e+06 2.037e+06 -1.797 0.07260
## `HU Grwth 2010 2023` -4.389e+05 1.830e+05 -2.399 0.01663
## `2023 Avg HU Size` 4.376e+05 2.130e+05 2.054 0.04019
## `2023 Avg HU Value` 1.332e+01 1.505e+01 0.885 0.37623
## `2023 Avg Yr Built` 2.682e+05 1.713e+05 1.566 0.11763
## `2023 Group Quarters` -9.404e+02 1.828e+03 -0.514 0.60711
## `2023 Home Blt 1940 1949` -4.573e+03 2.466e+03 -1.854 0.06398
## `2023 Home Blt 1950 1959` -3.108e+03 2.461e+03 -1.263 0.20686
## `2023 Home Blt 1960 1969` -4.044e+03 1.966e+03 -2.057 0.03992
## `2023 Home Blt 1970 1979` -4.507e+03 1.898e+03 -2.375 0.01772
## `2023 Home Blt 1980 1989` -5.808e+02 2.000e+03 -0.290 0.77155
## `2023 Home Blt 1990 1999` -5.509e+03 2.130e+03 -2.586 0.00984
## `2023 Home Blt 2000 2010` -2.127e+03 1.665e+03 -1.278 0.20165
## `2023 Home Blt 2010` 6.067e+02 1.958e+03 0.310 0.75674
## `2023 HU 1 Unit` 1.111e+03 1.082e+03 1.027 0.30478
## `2023 HU 20 Units` -2.562e+02 1.211e+03 -0.212 0.83247
## `2023 HU 2 4 Units` 1.658e+03 1.922e+03 0.863 0.38836
## `2023 HU 5 19 Units` -2.797e+03 1.487e+03 -1.881 0.06020
## `2023 Med Yr Built` -2.670e+05 1.715e+05 -1.557 0.11969
## `2023 Owner Occd Housing` 7.712e+05 7.641e+05 1.009 0.31310
## `2023 Renter Occd Housing` 7.529e+05 7.145e+05 1.054 0.29222
## `2023 Home Value 1000000` -1.350e+04 4.783e+05 -0.028 0.97749
## `2023 Home Value 100000 200000` -1.049e+04 4.780e+05 -0.022 0.98249
## `2023 Home Value 200000 300000` -1.011e+04 4.780e+05 -0.021 0.98313
## `2023 Home Value 300000 400000` -4.351e+03 4.780e+05 -0.009 0.99274
## `2023 Home Value 400000 500000` -7.447e+03 4.781e+05 -0.016 0.98757
## `2023 Home Value 500000 1000000` -1.165e+02 4.780e+05 0.000 0.99981
## `2023 Home Value 100000` -8.129e+03 4.781e+05 -0.017 0.98644
## `2023 Median Home Value` -1.987e+01 1.460e+01 -1.361 0.17369
## `Pop Grwth 2010 2023` 3.095e+05 1.816e+05 1.704 0.08870
## `Pop Grwth 2023 2028` 2.842e+05 1.209e+06 0.235 0.81412
## `2023 Population` 4.303e+02 8.436e+02 0.510 0.61012
## `2023 Pop Wrk Trav Time 30 Min` -9.546e+02 1.083e+03 -0.882 0.37823
## `2023 Pop Wrk Trav Time 60 Min...45` -1.370e+03 1.355e+03 -1.011 0.31228
## `2023 Pop Wrk Trav Time 60 Min...46` -4.700e+03 2.859e+03 -1.644 0.10055
##
## (Intercept)
## `Number Of Units` ***
## `Year Built`
## Vacancy
## `Land Area` ***
## `House Maint And Repair 2023 Cons Spdng`
## `Household Operations 2023 Cons Spdng`
## `HH Grwth 2010 2023` *
## `HH Grwth 2023 2028`
## `2023 Households`
## `2023 Med HH Size` *
## `2023 Avg HH Size` .
## `HU Grwth 2010 2023` *
## `2023 Avg HU Size` *
## `2023 Avg HU Value`
## `2023 Avg Yr Built`
## `2023 Group Quarters`
## `2023 Home Blt 1940 1949` .
## `2023 Home Blt 1950 1959`
## `2023 Home Blt 1960 1969` *
## `2023 Home Blt 1970 1979` *
## `2023 Home Blt 1980 1989`
## `2023 Home Blt 1990 1999` **
## `2023 Home Blt 2000 2010`
## `2023 Home Blt 2010`
## `2023 HU 1 Unit`
## `2023 HU 20 Units`
## `2023 HU 2 4 Units`
## `2023 HU 5 19 Units` .
## `2023 Med Yr Built`
## `2023 Owner Occd Housing`
## `2023 Renter Occd Housing`
## `2023 Home Value 1000000`
## `2023 Home Value 100000 200000`
## `2023 Home Value 200000 300000`
## `2023 Home Value 300000 400000`
## `2023 Home Value 400000 500000`
## `2023 Home Value 500000 1000000`
## `2023 Home Value 100000`
## `2023 Median Home Value`
## `Pop Grwth 2010 2023` .
## `Pop Grwth 2023 2028`
## `2023 Population`
## `2023 Pop Wrk Trav Time 30 Min`
## `2023 Pop Wrk Trav Time 60 Min...45`
## `2023 Pop Wrk Trav Time 60 Min...46`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560000 on 1089 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.4728, Adjusted R-squared: 0.451
## F-statistic: 21.7 on 45 and 1089 DF, p-value: < 2.2e-16
# Extract coefficients from the multivariate linear regression model
lm_coefs <- coef(lm_model)
# Convert lasso_coef to a matrix with one row
lasso_coef_matrix <- as.matrix(lasso_coef)
# Convert lasso_coef_matrix to a regular matrix if it's not already in matrix form
lasso_coef_matrix <- as.matrix(lasso_coef_matrix)
# Convert lasso_coef_matrix to a regular matrix
lasso_coef_matrix <- as.matrix(lasso_coef_matrix)
# Combine all coefficient vectors into a data frame
df.comp <- data.frame(
Linear = as.vector(lm_coefs), # Coefficients from the multivariate linear regression model
Lasso = as.vector(lasso_coef_matrix), # Coefficients from the lasso regression model (converted to matrix)
Ridge = as.vector(ridge_coef) # Coefficients from the ridge regression model
)
# Print the coefficients dataframe
print(df.comp)
## Linear Lasso Ridge
## 1 -1.483973e+07 6121384.17 6121384.17
## 2 4.520057e+04 6896306.68 6122851.88
## 3 5.871427e+03 0.00 115686.96
## 4 -1.153896e+04 0.00 -112623.76
## 5 2.100077e+00 4033290.13 4082979.12
## 6 -6.503166e+00 0.00 72562.61
## 7 8.233457e-01 0.00 258208.54
## 8 1.206935e+05 0.00 215037.74
## 9 -7.992534e+04 0.00 212661.70
## 10 -7.494807e+05 0.00 232800.16
## 11 3.548246e+06 900178.95 857457.45
## 12 -3.660241e+06 -1039301.39 -1067990.89
## 13 -4.388886e+05 -991622.69 -904010.06
## 14 4.376257e+05 883102.82 1020802.93
## 15 1.331867e+01 0.00 204002.68
## 16 2.682260e+05 18053.87 138318.79
## 17 -9.403814e+02 0.00 -58098.78
## 18 -4.573036e+03 0.00 -261420.59
## 19 -3.108226e+03 0.00 -195418.15
## 20 -4.043533e+03 -124236.76 -601532.40
## 21 -4.506785e+03 -283008.45 -793365.62
## 22 -5.807567e+02 610197.19 685487.71
## 23 -5.509304e+03 -129183.67 -530614.52
## 24 -2.126723e+03 0.00 -54513.94
## 25 6.067036e+02 876842.56 773975.32
## 26 1.110805e+03 0.00 383416.79
## 27 -2.562441e+02 -241903.95 -430992.14
## 28 1.658187e+03 0.00 126537.03
## 29 -2.796718e+03 -1097736.71 -1040881.48
## 30 -2.670147e+05 0.00 5774.95
## 31 7.711626e+05 0.00 68479.78
## 32 7.528854e+05 0.00 432183.72
## 33 -1.349811e+04 0.00 -131385.87
## 34 -1.049087e+04 -207854.07 -547013.51
## 35 -1.010864e+04 -136592.67 -237505.02
## 36 -4.350944e+03 425753.53 612758.29
## 37 -7.447363e+03 0.00 209384.50
## 38 -1.165058e+02 988541.39 938463.42
## 39 -8.128898e+03 0.00 239574.39
## 40 -1.987150e+01 -336579.02 -705520.18
## 41 3.094792e+05 0.00 -276963.77
## 42 2.842377e+05 651974.89 496516.36
## 43 4.302822e+02 0.00 366932.60
## 44 -9.546439e+02 0.00 107498.06
## 45 -1.370296e+03 -86112.85 -439942.54
## 46 -4.699641e+03 -572852.03 -598047.27
# Plotting coefficients
barplot(t(df.comp), beside = TRUE, col = c("blue", "green", "red"),
main = "Coefficients from Different Models",
xlab = "Variables", ylab = "Coefficients",
legend.text = TRUE)
legend("topright", inset = 0.05, legend = colnames(df.comp), fill = c("blue", "green", "red"))
# Calculate the number of missing coefficients to fill
#missing_coef_count <- num_features - length(ridge_coefs)
# Create a vector with missing coefficients filled with NA
#if (missing_coef_count > 0) {
# ridge_coefs <- c(ridge_coefs, rep(NA, missing_coef_count))
#}
# Split the data into training and testing sets (80% train, 20% test)
set.seed(42) # for reproducibility
train_index <- sample(1:nrow(df), 0.8 * nrow(df))
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
# Define the formula for the regression model
formula <- as.formula("`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `2023 Med HH Size` + `2023 Avg HH Size` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`")
# Fit the linear regression model
model <- lm(formula, data = train_data)
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)
# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.43557
# Print the coefficients
coefficients <- coef(model)
print(coefficients)
## (Intercept) `Number Of Units`
## 2.653295e+06 4.843101e+04
## `Land Area` `2023 Households`
## 1.864141e+00 2.549349e+02
## `2023 Med HH Size` `2023 Avg HH Size`
## 4.127027e+06 -5.124950e+06
## `HU Grwth 2010 2023` `2023 Home Value 1000000`
## -3.276234e+05 8.484266e+03
## `2023 Home Value 100000 200000` `2023 Home Value 200000 300000`
## -7.919315e+02 -2.016957e+03
## `Pop Grwth 2010 2023`
## 3.343593e+05
# Summary of the linear regression model
summary(model)
##
## Call:
## lm(formula = formula, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86271983 -2159783 160925 1964770 273147482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.653e+06 3.232e+06 0.821 0.41186
## `Number Of Units` 4.843e+04 3.524e+03 13.742 < 2e-16 ***
## `Land Area` 1.864e+00 2.749e-01 6.782 2.15e-11 ***
## `2023 Households` 2.549e+02 3.582e+02 0.712 0.47687
## `2023 Med HH Size` 4.127e+06 1.750e+06 2.358 0.01857 *
## `2023 Avg HH Size` -5.125e+06 1.600e+06 -3.203 0.00141 **
## `HU Grwth 2010 2023` -3.276e+05 1.539e+05 -2.129 0.03353 *
## `2023 Home Value 1000000` 8.484e+03 7.432e+03 1.142 0.25395
## `2023 Home Value 100000 200000` -7.919e+02 1.402e+03 -0.565 0.57244
## `2023 Home Value 200000 300000` -2.017e+03 1.581e+03 -1.275 0.20252
## `Pop Grwth 2010 2023` 3.344e+05 1.596e+05 2.095 0.03649 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared: 0.4356, Adjusted R-squared: 0.4293
## F-statistic: 69.3 on 10 and 898 DF, p-value: < 2.2e-16
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)
# Calculate residuals
residuals <- residuals(model)
# Ensure both predictions and residuals have the same length
n <- min(length(predictions), length(residuals))
predictions <- predictions[1:n]
residuals <- residuals[1:n]
# Plotting residuals vs. fitted values
plot(predictions, residuals)
abline(h = 0, col = "red") # Add a horizontal line at residual = 0
# Load necessary packages
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
ridge_coefs <- ridge_coefs[1:46]
# Combine all coefficient vectors into a dataframe
df.comp <- data.frame(
Linear = lm_coefs, # Coefficients from the multivariate linear regression model
Lasso = lasso_coefs, # Coefficients from the lasso regression model
Ridge = ridge_coefs # Coefficients from the ridge regression model
)
# Calculate VIF for each variable
vif_values <- sapply(df.comp, function(x) {
if (is.numeric(x)) {
vif(lm(x ~ ., data = df.comp))
} else {
NA
}
})
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
# Combine VIF values into a dataframe
vif_df <- data.frame(VIF = vif_values)
# Print or use VIF dataframe as needed
print(vif_df)
## VIF.Linear VIF.Lasso VIF.Ridge
## Linear 1.344512 1.344512 1.344512
## Lasso 36.387550 36.387550 36.387550
## Ridge 37.155174 37.155174 37.155174
summary(model)
##
## Call:
## lm(formula = formula, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86271983 -2159783 160925 1964770 273147482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.653e+06 3.232e+06 0.821 0.41186
## `Number Of Units` 4.843e+04 3.524e+03 13.742 < 2e-16 ***
## `Land Area` 1.864e+00 2.749e-01 6.782 2.15e-11 ***
## `2023 Households` 2.549e+02 3.582e+02 0.712 0.47687
## `2023 Med HH Size` 4.127e+06 1.750e+06 2.358 0.01857 *
## `2023 Avg HH Size` -5.125e+06 1.600e+06 -3.203 0.00141 **
## `HU Grwth 2010 2023` -3.276e+05 1.539e+05 -2.129 0.03353 *
## `2023 Home Value 1000000` 8.484e+03 7.432e+03 1.142 0.25395
## `2023 Home Value 100000 200000` -7.919e+02 1.402e+03 -0.565 0.57244
## `2023 Home Value 200000 300000` -2.017e+03 1.581e+03 -1.275 0.20252
## `Pop Grwth 2010 2023` 3.344e+05 1.596e+05 2.095 0.03649 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared: 0.4356, Adjusted R-squared: 0.4293
## F-statistic: 69.3 on 10 and 898 DF, p-value: < 2.2e-16
# Fit the linear regression model
model <- lm(formula, data = train_data)
# Check the significance and direction of coefficients
summary_model <- summary(model)
# Identify statistically insignificant variables with high p-values
insignificant_vars <- names(summary_model$coefficients[,"Pr(>|t|)"][summary_model$coefficients[,"Pr(>|t|)"] > 0.05])
# Drop statistically insignificant variables from both training and testing datasets
train_data <- train_data[, !names(train_data) %in% insignificant_vars]
test_data <- test_data[, !names(test_data) %in% insignificant_vars]
# Refit the model with updated variables
model <- lm(formula, data = train_data)
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)
# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.43557
# Print the coefficients
coefficients <- coef(model)
print(coefficients)
## (Intercept) `Number Of Units`
## 2.653295e+06 4.843101e+04
## `Land Area` `2023 Households`
## 1.864141e+00 2.549349e+02
## `2023 Med HH Size` `2023 Avg HH Size`
## 4.127027e+06 -5.124950e+06
## `HU Grwth 2010 2023` `2023 Home Value 1000000`
## -3.276234e+05 8.484266e+03
## `2023 Home Value 100000 200000` `2023 Home Value 200000 300000`
## -7.919315e+02 -2.016957e+03
## `Pop Grwth 2010 2023`
## 3.343593e+05
# Plot the coefficients
plot(model)
# Print the summary
summary(model)
##
## Call:
## lm(formula = formula, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86271983 -2159783 160925 1964770 273147482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.653e+06 3.232e+06 0.821 0.41186
## `Number Of Units` 4.843e+04 3.524e+03 13.742 < 2e-16 ***
## `Land Area` 1.864e+00 2.749e-01 6.782 2.15e-11 ***
## `2023 Households` 2.549e+02 3.582e+02 0.712 0.47687
## `2023 Med HH Size` 4.127e+06 1.750e+06 2.358 0.01857 *
## `2023 Avg HH Size` -5.125e+06 1.600e+06 -3.203 0.00141 **
## `HU Grwth 2010 2023` -3.276e+05 1.539e+05 -2.129 0.03353 *
## `2023 Home Value 1000000` 8.484e+03 7.432e+03 1.142 0.25395
## `2023 Home Value 100000 200000` -7.919e+02 1.402e+03 -0.565 0.57244
## `2023 Home Value 200000 300000` -2.017e+03 1.581e+03 -1.275 0.20252
## `Pop Grwth 2010 2023` 3.344e+05 1.596e+05 2.095 0.03649 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared: 0.4356, Adjusted R-squared: 0.4293
## F-statistic: 69.3 on 10 and 898 DF, p-value: < 2.2e-16
# Fit the linear regression model after removing '2023 Med HH Size' and '2023 Avg HH Size'
model <- lm(`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`, data = train_data)
# Check the significance and direction of coefficients
summary_model <- summary(model)
# Identify statistically insignificant variables with high p-values
insignificant_vars <- names(summary_model$coefficients[,"Pr(>|t|)"][summary_model$coefficients[,"Pr(>|t|)"] > 0.05])
# Drop statistically insignificant variables from both training and testing datasets
train_data <- train_data[, !names(train_data) %in% insignificant_vars]
test_data <- test_data[, !names(test_data) %in% insignificant_vars]
# Refit the model with updated variables
model <- lm(`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`, data = train_data)
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)
# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.4290564
# Plot the coefficients
plot(model)
# Print the summary
summary(model)
##
## Call:
## lm(formula = `Last Sale Price` ~ `Number Of Units` + `Land Area` +
## `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` +
## `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` +
## `Pop Grwth 2010 2023`, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88569221 -1889597 275478 1586777 274383145
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.336e+06 7.885e+05 -1.695 0.0904 .
## `Number Of Units` 4.842e+04 3.535e+03 13.696 < 2e-16 ***
## `Land Area` 1.974e+00 2.740e-01 7.207 1.21e-12 ***
## `2023 Households` 3.508e+02 3.582e+02 0.979 0.3277
## `HU Grwth 2010 2023` -3.306e+05 1.546e+05 -2.139 0.0327 *
## `2023 Home Value 1000000` 9.325e+03 7.442e+03 1.253 0.2105
## `2023 Home Value 100000 200000` -6.595e+02 1.404e+03 -0.470 0.6387
## `2023 Home Value 200000 300000` -2.428e+03 1.553e+03 -1.564 0.1182
## `Pop Grwth 2010 2023` 3.339e+05 1.604e+05 2.082 0.0376 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12310000 on 900 degrees of freedom
## Multiple R-squared: 0.4291, Adjusted R-squared: 0.424
## F-statistic: 84.54 on 8 and 900 DF, p-value: < 2.2e-16
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.