This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory / Take out the trash
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 692962 37.1 1326972 70.9 1326972 70.9
## Vcells 1246123 9.6 8388608 64.0 2000868 15.3
cat("\f") # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device
## 1
library(readxl)
df <- read_excel("DataDF.xlsx")
## New names:
## • `2023 Pop Wrk Trav Time 60 Min` -> `2023 Pop Wrk Trav Time 60 Min...45`
## • `2023 Pop Wrk Trav Time 60 Min` -> `2023 Pop Wrk Trav Time 60 Min...46`
df <- as.data.frame(df)
glimpse(df)
## Rows: 1,137
## Columns: 46
## $ `Number Of Units` <dbl> 25, 8, 18, 17, 13, 125, 236, …
## $ `Year Built` <dbl> 1984, 1950, 1990, 2007, 1977,…
## $ Vacancy <dbl> 7.04, 10.26, 6.31, 4.61, 6.29…
## $ `Land Area` <dbl> 108900, 277477, 67965, 174240…
## $ `Last Sale Price` <dbl> 3100000, 800000, 1000000, 600…
## $ `House Maint And Repair 2023 Cons Spdng` <dbl> 2315630, 178754, 7470769, 180…
## $ `Household Operations 2023 Cons Spdng` <dbl> 2875112, 197441, 8760263, 158…
## $ `HH Grwth 2010 2023` <dbl> 1.17, 46.67, 2.66, 3.29, 0.00…
## $ `HH Grwth 2023 2028` <dbl> 9.88, 7.91, -1.30, -0.64, 0.0…
## $ `2023 Households` <dbl> 2571, 139, 8107, 156, 21, 832…
## $ `2023 Med HH Size` <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ `2023 Avg HH Size` <dbl> 2.3, 3.0, 2.0, 2.1, 2.2, 2.1,…
## $ `HU Grwth 2010 2023` <dbl> 8.03, 51.09, 5.03, 10.64, -25…
## $ `2023 Avg HU Size` <dbl> 4, 2, 9, 1, 1, 7, 9, 2, 17, 3…
## $ `2023 Avg HU Value` <dbl> 215342, 225340, 240448, 15709…
## $ `2023 Avg Yr Built` <dbl> 1971, 2003, 1974, 1984, 1984,…
## $ `2023 Group Quarters` <dbl> 491, 0, 130, 0, 0, 62, 209, 1…
## $ `2023 Home Blt 1940 1949` <dbl> 849, 1, 763, 7, 4, 583, 363, …
## $ `2023 Home Blt 1950 1959` <dbl> 381, 2, 1500, 4, 1, 2772, 217…
## $ `2023 Home Blt 1960 1969` <dbl> 277, 2, 1854, 7, 3, 1745, 178…
## $ `2023 Home Blt 1970 1979` <dbl> 247, 8, 3048, 83, 5, 1907, 26…
## $ `2023 Home Blt 1980 1989` <dbl> 310, 8, 1360, 52, 6, 786, 104…
## $ `2023 Home Blt 1990 1999` <dbl> 284, 12, 268, 40, 8, 414, 245…
## $ `2023 Home Blt 2000 2010` <dbl> 372, 72, 917, 21, 10, 56, 135…
## $ `2023 Home Blt 2010` <dbl> 145, 49, 283, 8, 0, 1220, 0, …
## $ `2023 HU 1 Unit` <dbl> 1985, 120, 3791, 77, 16, 6281…
## $ `2023 HU 20 Units` <dbl> 88, 3, 1353, 0, 0, 1658, 1695…
## $ `2023 HU 2 4 Units` <dbl> 127, 0, 926, 0, 0, 458, 435, …
## $ `2023 HU 5 19 Units` <dbl> 432, 5, 1522, 0, 0, 546, 948,…
## $ `2023 Med Yr Built` <dbl> 1966, 2006, 1972, 1981, 1988,…
## $ `2023 Owner Occd Housing` <dbl> 1343, 103, 4289, 124, 18, 631…
## $ `2023 Renter Occd Housing` <dbl> 1229, 36, 3818, 32, 3, 2012, …
## $ `2023 Home Value 1000000` <dbl> 3, 0, 106, 0, 4, 15, 117, 0, …
## $ `2023 Home Value 100000 200000` <dbl> 514, 28, 919, 29, 4, 1958, 12…
## $ `2023 Home Value 200000 300000` <dbl> 341, 56, 671, 3, 2, 1352, 133…
## $ `2023 Home Value 300000 400000` <dbl> 106, 12, 579, 2, 2, 730, 543,…
## $ `2023 Home Value 400000 500000` <dbl> 105, 1, 80, 0, 0, 355, 159, 1…
## $ `2023 Home Value 500000 1000000` <dbl> 33, 0, 361, 12, 0, 324, 624, …
## $ `2023 Home Value 100000` <dbl> 242, 6, 1573, 78, 5, 1575, 65…
## $ `2023 Median Home Value` <dbl> 183657, 231250, 162187, 79486…
## $ `Pop Grwth 2010 2023` <dbl> 9.44, 51.61, 4.49, 10.56, -26…
## $ `Pop Grwth 2023 2028` <dbl> 10.34, 8.04, -1.39, 0.00, -1.…
## $ `2023 Population` <dbl> 6479, 423, 16666, 335, 51, 17…
## $ `2023 Pop Wrk Trav Time 30 Min` <dbl> 1882, 104, 5237, 79, 11, 6442…
## $ `2023 Pop Wrk Trav Time 60 Min...45` <dbl> 476, 93, 2031, 49, 2, 1838, 1…
## $ `2023 Pop Wrk Trav Time 60 Min...46` <dbl> 202, 21, 377, 4, 2, 234, 274,…
library(dplyr)
library(ggcorrplot)
## Loading required package: ggplot2
?ggcorrplot()
?cor
?cor_pmat # cor_pmat(): Compute a correlation matrix p-values.
mycorr<- cor(x = df, use = "pairwise.complete.obs",method = c("spearman"))
p.mat <- ggcorrplot::cor_pmat(x = df)
# head(p.mat)
library(ggcorrplot)
myplot<-ggcorrplot(corr = mycorr, # correlation matrix to visualize
method = "square", # character, the visualization method of correlation matrix to be used. Allowed values are "square" (default), "circle"
type = "lower", # character, "full" (default), "lower" or "upper" display
title = "Correlation Plot", # character, title of the graph
colors = c("red", "white","green"), # vector of 3 colors for low, mid and high correlation values.
lab = TRUE, # If TRUE, add corr coeff on the plot.
lab_size = 2, # labels. used when lab = TRUE.
p.mat = p.mat, # matrix of p-value. If NULL, arguments sig.level, insig, pch, pch.col, pch.cex is invalid. # Barring the no significant coefficient
insig = "pch", # character, specialized insignificant correlation coefficients, "pch" (default), "blank". If "blank", wipe away the corresponding glyphs; if "pch", add characters (see pch for details) on corresponding glyphs.
pch = 4, # add character on the glyphs of insignificant correlation coefficients (only valid when insig is "pch"). Default value is 4.
hc.order = TRUE, # If TRUE, correlation matrix will be hc.ordered using hclust function.
tl.cex = 8, # the size, the color and the string rotation of text label
tl.col = "black",
digits = 2
)
myplot
# Load necessary library
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(stargazer,quietly = T)
stargazer(df,
type = "text",
title = "Summary Statistics",
notes = "Data Source: CoStar, n= 46",
digits = 2,
omit.summary.stat = "n",
out = "my_table.tex")
##
## Summary Statistics
## ====================================================================================
## Statistic Mean St. Dev. Min Max
## ------------------------------------------------------------------------------------
## Number Of Units 119.37 158.94 1 1,519
## Year Built 1,970.41 18.41 1,900 2,022
## Vacancy 6.20 5.56 0.06 100.00
## Land Area 869,519.80 1,862,579.00 1 25,501,431
## Last Sale Price 6,121,384.00 15,591,909.00 2,600 363,125,000
## House Maint And Repair 2023 Cons Spdng 3,201,500.00 2,408,647.00 0 12,607,149
## Household Operations 2023 Cons Spdng 3,457,632.00 2,645,364.00 0 18,560,824
## HH Grwth 2010 2023 12.45 17.91 -24.46 157.93
## HH Grwth 2023 2028 2.78 3.86 -10.79 17.24
## 2023 Households 3,034.50 2,401.24 0 12,942
## 2023 Med HH Size 2.10 0.33 0 4
## 2023 Avg HH Size 2.41 0.36 0.00 4.10
## HU Grwth 2010 2023 17.45 23.32 -49.33 188.33
## 2023 Avg HU Size 4.59 3.58 0 19
## 2023 Avg HU Value 227,786.60 96,678.88 0 764,965
## 2023 Avg Yr Built 1,980.33 83.62 0 2,008
## 2023 Group Quarters 104.62 233.94 0 3,287
## 2023 Home Blt 1940 1949 169.37 309.81 0 4,433
## 2023 Home Blt 1950 1959 345.57 512.42 0 3,297
## 2023 Home Blt 1960 1969 489.47 599.10 0 4,157
## 2023 Home Blt 1970 1979 770.10 826.32 0 5,872
## 2023 Home Blt 1980 1989 688.46 619.67 0 4,214
## 2023 Home Blt 1990 1999 391.28 359.37 0 3,347
## 2023 Home Blt 2000 2010 390.95 409.31 0 3,539
## 2023 Home Blt 2010 335.47 407.24 0 2,570
## 2023 HU 1 Unit 1,732.25 1,280.85 0 8,026
## 2023 HU 20 Units 387.00 724.56 0 7,773
## 2023 HU 2 4 Units 293.79 391.61 0 2,743
## 2023 HU 5 19 Units 420.39 621.17 0 4,326
## 2023 Med Yr Built 1,979.39 83.81 0 2,010
## 2023 Owner Occd Housing 1,860.25 1,393.00 0 7,443
## 2023 Renter Occd Housing 1,174.25 1,233.33 0 8,801
## 2023 Home Value 1000000 24.79 60.63 0 551
## 2023 Home Value 100000 200000 531.79 500.64 0 3,138
## 2023 Home Value 200000 300000 409.99 379.43 0 2,179
## 2023 Home Value 300000 400000 198.22 237.68 0 1,649
## 2023 Home Value 400000 500000 75.17 119.11 0 1,179
## 2023 Home Value 500000 1000000 100.51 144.80 0 1,829
## 2023 Home Value 100000 519.80 585.55 0 4,566
## 2023 Median Home Value 188,513.00 92,171.90 0 767,973
## Pop Grwth 2010 2023 17.37 22.50 -48.46 189.24
## Pop Grwth 2023 2028 2.80 3.81 -8.96 17.13
## 2023 Population 7,401.47 5,884.95 0 32,505
## 2023 Pop Wrk Trav Time 30 Min 2,057.95 1,764.77 0 8,511
## 2023 Pop Wrk Trav Time 60 Min...45 988.64 1,018.90 0 7,965
## 2023 Pop Wrk Trav Time 60 Min...46 245.87 287.87 0 2,940
## ------------------------------------------------------------------------------------
## Data Source: CoStar, n= 46
# Extract numeric values from the dataframe
X <- as.matrix(df[sapply(df, is.numeric)])
# Before standardization
colMeans(X) # Mean of each column
## Number Of Units Year Built
## 1.193685e+02 1.970414e+03
## Vacancy Land Area
## NA 8.695198e+05
## Last Sale Price House Maint And Repair 2023 Cons Spdng
## 6.121384e+06 3.201500e+06
## Household Operations 2023 Cons Spdng HH Grwth 2010 2023
## 3.457632e+06 1.244945e+01
## HH Grwth 2023 2028 2023 Households
## 2.775743e+00 3.034502e+03
## 2023 Med HH Size 2023 Avg HH Size
## 2.098505e+00 2.413632e+00
## HU Grwth 2010 2023 2023 Avg HU Size
## 1.744588e+01 4.585752e+00
## 2023 Avg HU Value 2023 Avg Yr Built
## 2.277866e+05 1.980332e+03
## 2023 Group Quarters 2023 Home Blt 1940 1949
## 1.046218e+02 1.693685e+02
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969
## 3.455734e+02 4.894670e+02
## 2023 Home Blt 1970 1979 2023 Home Blt 1980 1989
## 7.701047e+02 6.884617e+02
## 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 3.912832e+02 3.909525e+02
## 2023 Home Blt 2010 2023 HU 1 Unit
## 3.354697e+02 1.732252e+03
## 2023 HU 20 Units 2023 HU 2 4 Units
## 3.869956e+02 2.937863e+02
## 2023 HU 5 19 Units 2023 Med Yr Built
## 4.203896e+02 1.979392e+03
## 2023 Owner Occd Housing 2023 Renter Occd Housing
## 1.860248e+03 1.174253e+03
## 2023 Home Value 1000000 2023 Home Value 100000 200000
## 2.479332e+01 5.317854e+02
## 2023 Home Value 200000 300000 2023 Home Value 300000 400000
## 4.099894e+02 1.982172e+02
## 2023 Home Value 400000 500000 2023 Home Value 500000 1000000
## 7.517326e+01 1.005084e+02
## 2023 Home Value 100000 2023 Median Home Value
## 5.197977e+02 1.885130e+05
## Pop Grwth 2010 2023 Pop Grwth 2023 2028
## 1.737237e+01 2.804186e+00
## 2023 Population 2023 Pop Wrk Trav Time 30 Min
## 7.401467e+03 2.057947e+03
## 2023 Pop Wrk Trav Time 60 Min...45 2023 Pop Wrk Trav Time 60 Min...46
## 9.886376e+02 2.458698e+02
apply(X = X,
MARGIN = 2,
FUN = sd
) # standard deviation
## Number Of Units Year Built
## 1.589372e+02 1.841338e+01
## Vacancy Land Area
## NA 1.862579e+06
## Last Sale Price House Maint And Repair 2023 Cons Spdng
## 1.559191e+07 2.408647e+06
## Household Operations 2023 Cons Spdng HH Grwth 2010 2023
## 2.645364e+06 1.791372e+01
## HH Grwth 2023 2028 2023 Households
## 3.855609e+00 2.401237e+03
## 2023 Med HH Size 2023 Avg HH Size
## 3.263211e-01 3.619425e-01
## HU Grwth 2010 2023 2023 Avg HU Size
## 2.331966e+01 3.582514e+00
## 2023 Avg HU Value 2023 Avg Yr Built
## 9.667888e+04 8.362326e+01
## 2023 Group Quarters 2023 Home Blt 1940 1949
## 2.339396e+02 3.098076e+02
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969
## 5.124212e+02 5.991007e+02
## 2023 Home Blt 1970 1979 2023 Home Blt 1980 1989
## 8.263159e+02 6.196659e+02
## 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 3.593700e+02 4.093120e+02
## 2023 Home Blt 2010 2023 HU 1 Unit
## 4.072429e+02 1.280847e+03
## 2023 HU 20 Units 2023 HU 2 4 Units
## 7.245635e+02 3.916130e+02
## 2023 HU 5 19 Units 2023 Med Yr Built
## 6.211725e+02 8.381302e+01
## 2023 Owner Occd Housing 2023 Renter Occd Housing
## 1.393002e+03 1.233330e+03
## 2023 Home Value 1000000 2023 Home Value 100000 200000
## 6.062964e+01 5.006420e+02
## 2023 Home Value 200000 300000 2023 Home Value 300000 400000
## 3.794259e+02 2.376790e+02
## 2023 Home Value 400000 500000 2023 Home Value 500000 1000000
## 1.191127e+02 1.447951e+02
## 2023 Home Value 100000 2023 Median Home Value
## 5.855536e+02 9.217190e+04
## Pop Grwth 2010 2023 Pop Grwth 2023 2028
## 2.250016e+01 3.809244e+00
## 2023 Population 2023 Pop Wrk Trav Time 30 Min
## 5.884947e+03 1.764772e+03
## 2023 Pop Wrk Trav Time 60 Min...45 2023 Pop Wrk Trav Time 60 Min...46
## 1.018898e+03 2.878651e+02
# # scale : mean = 0, std=1
# ?scale
# X = scale(x = X)
#
# # after standardization
# colMeans(x = X) # mean ~ 0
#
apply(X = X,
MARGIN = 2,
FUN = sd
) # standard deviation = 1
## Number Of Units Year Built
## 1.589372e+02 1.841338e+01
## Vacancy Land Area
## NA 1.862579e+06
## Last Sale Price House Maint And Repair 2023 Cons Spdng
## 1.559191e+07 2.408647e+06
## Household Operations 2023 Cons Spdng HH Grwth 2010 2023
## 2.645364e+06 1.791372e+01
## HH Grwth 2023 2028 2023 Households
## 3.855609e+00 2.401237e+03
## 2023 Med HH Size 2023 Avg HH Size
## 3.263211e-01 3.619425e-01
## HU Grwth 2010 2023 2023 Avg HU Size
## 2.331966e+01 3.582514e+00
## 2023 Avg HU Value 2023 Avg Yr Built
## 9.667888e+04 8.362326e+01
## 2023 Group Quarters 2023 Home Blt 1940 1949
## 2.339396e+02 3.098076e+02
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969
## 5.124212e+02 5.991007e+02
## 2023 Home Blt 1970 1979 2023 Home Blt 1980 1989
## 8.263159e+02 6.196659e+02
## 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 3.593700e+02 4.093120e+02
## 2023 Home Blt 2010 2023 HU 1 Unit
## 4.072429e+02 1.280847e+03
## 2023 HU 20 Units 2023 HU 2 4 Units
## 7.245635e+02 3.916130e+02
## 2023 HU 5 19 Units 2023 Med Yr Built
## 6.211725e+02 8.381302e+01
## 2023 Owner Occd Housing 2023 Renter Occd Housing
## 1.393002e+03 1.233330e+03
## 2023 Home Value 1000000 2023 Home Value 100000 200000
## 6.062964e+01 5.006420e+02
## 2023 Home Value 200000 300000 2023 Home Value 300000 400000
## 3.794259e+02 2.376790e+02
## 2023 Home Value 400000 500000 2023 Home Value 500000 1000000
## 1.191127e+02 1.447951e+02
## 2023 Home Value 100000 2023 Median Home Value
## 5.855536e+02 9.217190e+04
## Pop Grwth 2010 2023 Pop Grwth 2023 2028
## 2.250016e+01 3.809244e+00
## 2023 Population 2023 Pop Wrk Trav Time 30 Min
## 5.884947e+03 1.764772e+03
## 2023 Pop Wrk Trav Time 60 Min...45 2023 Pop Wrk Trav Time 60 Min...46
## 1.018898e+03 2.878651e+02
library(tidyr)
# Replace missing values with 1
df_cleaned <- df %>% replace(is.na(.), 1)
# Verify the cleaned dataframe
head(df_cleaned)
## Number Of Units Year Built Vacancy Land Area Last Sale Price
## 1 25 1984 7.04 108900 3100000
## 2 8 1950 10.26 277477 800000
## 3 18 1990 6.31 67965 1000000
## 4 17 2007 4.61 174240 600000
## 5 13 1977 6.29 348480 750000
## 6 125 1936 6.04 337154 9500000
## House Maint And Repair 2023 Cons Spdng Household Operations 2023 Cons Spdng
## 1 2315630 2875112
## 2 178754 197441
## 3 7470769 8760263
## 4 180919 158527
## 5 26145 21370
## 6 11054731 11175881
## HH Grwth 2010 2023 HH Grwth 2023 2028 2023 Households 2023 Med HH Size
## 1 1.17 9.88 2571 2
## 2 46.67 7.91 139 3
## 3 2.66 -1.30 8107 2
## 4 3.29 -0.64 156 2
## 5 0.00 0.00 21 2
## 6 13.91 0.40 8322 2
## 2023 Avg HH Size HU Grwth 2010 2023 2023 Avg HU Size 2023 Avg HU Value
## 1 2.3 8.03 4 215342
## 2 3.0 51.09 2 225340
## 3 2.0 5.03 9 240448
## 4 2.1 10.64 1 157097
## 5 2.2 -25.00 1 394118
## 6 2.1 15.95 7 222175
## 2023 Avg Yr Built 2023 Group Quarters 2023 Home Blt 1940 1949
## 1 1971 491 849
## 2 2003 0 1
## 3 1974 130 763
## 4 1984 0 7
## 5 1984 0 4
## 6 1972 62 583
## 2023 Home Blt 1950 1959 2023 Home Blt 1960 1969 2023 Home Blt 1970 1979
## 1 381 277 247
## 2 2 2 8
## 3 1500 1854 3048
## 4 4 7 83
## 5 1 3 5
## 6 2772 1745 1907
## 2023 Home Blt 1980 1989 2023 Home Blt 1990 1999 2023 Home Blt 2000 2010
## 1 310 284 372
## 2 8 12 72
## 3 1360 268 917
## 4 52 40 21
## 5 6 8 10
## 6 786 414 56
## 2023 Home Blt 2010 2023 HU 1 Unit 2023 HU 20 Units 2023 HU 2 4 Units
## 1 145 1985 88 127
## 2 49 120 3 0
## 3 283 3791 1353 926
## 4 8 77 0 0
## 5 0 16 0 0
## 6 1220 6281 1658 458
## 2023 HU 5 19 Units 2023 Med Yr Built 2023 Owner Occd Housing
## 1 432 1966 1343
## 2 5 2006 103
## 3 1522 1972 4289
## 4 0 1981 124
## 5 0 1988 18
## 6 546 1967 6310
## 2023 Renter Occd Housing 2023 Home Value 1000000
## 1 1229 3
## 2 36 0
## 3 3818 106
## 4 32 0
## 5 3 4
## 6 2012 15
## 2023 Home Value 100000 200000 2023 Home Value 200000 300000
## 1 514 341
## 2 28 56
## 3 919 671
## 4 29 3
## 5 4 2
## 6 1958 1352
## 2023 Home Value 300000 400000 2023 Home Value 400000 500000
## 1 106 105
## 2 12 1
## 3 579 80
## 4 2 0
## 5 2 0
## 6 730 355
## 2023 Home Value 500000 1000000 2023 Home Value 100000 2023 Median Home Value
## 1 33 242 183657
## 2 0 6 231250
## 3 361 1573 162187
## 4 12 78 79486
## 5 0 5 187499
## 6 324 1575 180668
## Pop Grwth 2010 2023 Pop Grwth 2023 2028 2023 Population
## 1 9.44 10.34 6479
## 2 51.61 8.04 423
## 3 4.49 -1.39 16666
## 4 10.56 0.00 335
## 5 -26.09 -1.96 51
## 6 14.49 0.17 17448
## 2023 Pop Wrk Trav Time 30 Min 2023 Pop Wrk Trav Time 60 Min...45
## 1 1882 476
## 2 104 93
## 3 5237 2031
## 4 79 49
## 5 11 2
## 6 6442 1838
## 2023 Pop Wrk Trav Time 60 Min...46
## 1 202
## 2 21
## 3 377
## 4 4
## 5 2
## 6 234
# Load necessary library
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
# Assuming df contains your predictors and 'last_sale_price' is your target variable
# Extract the target variable y (Last Sale Price)
y <- df_cleaned$`Last Sale Price`
# Prepare the predictor variables (excluding the target variable)
X <- df_cleaned[, !colnames(df_cleaned) %in% c("Last Sale Price")]
# Standardize the predictor variables
X_scaled <- scale(X)
# Fit a Lasso model
lasso_model <- cv.glmnet(X_scaled, y, alpha = 1) # alpha = 1 for Lasso
# Get the selected lambda (regularization strength)
best_lambda <- lasso_model$lambda.min
# Extract the coefficients for the selected lambda
lasso_coef <- coef(lasso_model, s = best_lambda)
# Print the coefficients
print(lasso_coef)
## 46 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 6121384.17
## Number Of Units 6896306.68
## Year Built .
## Vacancy .
## Land Area 4033290.13
## House Maint And Repair 2023 Cons Spdng .
## Household Operations 2023 Cons Spdng .
## HH Grwth 2010 2023 .
## HH Grwth 2023 2028 .
## 2023 Households .
## 2023 Med HH Size 900178.95
## 2023 Avg HH Size -1039301.39
## HU Grwth 2010 2023 -991622.69
## 2023 Avg HU Size 883102.82
## 2023 Avg HU Value .
## 2023 Avg Yr Built 18053.87
## 2023 Group Quarters .
## 2023 Home Blt 1940 1949 .
## 2023 Home Blt 1950 1959 .
## 2023 Home Blt 1960 1969 -124236.76
## 2023 Home Blt 1970 1979 -283008.45
## 2023 Home Blt 1980 1989 610197.19
## 2023 Home Blt 1990 1999 -129183.67
## 2023 Home Blt 2000 2010 .
## 2023 Home Blt 2010 876842.56
## 2023 HU 1 Unit .
## 2023 HU 20 Units -241903.95
## 2023 HU 2 4 Units .
## 2023 HU 5 19 Units -1097736.71
## 2023 Med Yr Built .
## 2023 Owner Occd Housing .
## 2023 Renter Occd Housing .
## 2023 Home Value 1000000 .
## 2023 Home Value 100000 200000 -207854.07
## 2023 Home Value 200000 300000 -136592.67
## 2023 Home Value 300000 400000 425753.53
## 2023 Home Value 400000 500000 .
## 2023 Home Value 500000 1000000 988541.39
## 2023 Home Value 100000 .
## 2023 Median Home Value -336579.02
## Pop Grwth 2010 2023 .
## Pop Grwth 2023 2028 651974.89
## 2023 Population .
## 2023 Pop Wrk Trav Time 30 Min .
## 2023 Pop Wrk Trav Time 60 Min...45 -86112.85
## 2023 Pop Wrk Trav Time 60 Min...46 -572852.03
library(glmnet)
# Fit a Ridge model
ridge_model <- cv.glmnet(X_scaled, y, alpha = 0) # alpha = 0 for Ridge
# Get the selected lambda (regularization strength)
best_lambda <- ridge_model$lambda.min
# Extract the coefficients for the selected lambda
ridge_coef <- coef(ridge_model, s = best_lambda)
# Print the coefficients
print(ridge_coef)
## 46 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 6121384.17
## Number Of Units 5978932.49
## Year Built 128968.60
## Vacancy -115049.00
## Land Area 4067986.38
## House Maint And Repair 2023 Cons Spdng 88080.37
## Household Operations 2023 Cons Spdng 240308.96
## HH Grwth 2010 2023 176947.55
## HH Grwth 2023 2028 215427.47
## 2023 Households 200163.18
## 2023 Med HH Size 803462.67
## 2023 Avg HH Size -1028610.93
## HU Grwth 2010 2023 -798863.29
## 2023 Avg HU Size 953183.13
## 2023 Avg HU Value 174202.06
## 2023 Avg Yr Built 129776.30
## 2023 Group Quarters -51317.46
## 2023 Home Blt 1940 1949 -231552.97
## 2023 Home Blt 1950 1959 -176413.81
## 2023 Home Blt 1960 1969 -554580.71
## 2023 Home Blt 1970 1979 -708541.10
## 2023 Home Blt 1980 1989 648588.65
## 2023 Home Blt 1990 1999 -452663.56
## 2023 Home Blt 2000 2010 -26031.79
## 2023 Home Blt 2010 716038.34
## 2023 HU 1 Unit 318707.17
## 2023 HU 20 Units -404483.32
## 2023 HU 2 4 Units 102965.24
## 2023 HU 5 19 Units -954785.15
## 2023 Med Yr Built 17642.05
## 2023 Owner Occd Housing 78144.12
## 2023 Renter Occd Housing 343688.01
## 2023 Home Value 1000000 -112549.98
## 2023 Home Value 100000 200000 -517287.90
## 2023 Home Value 200000 300000 -212070.04
## 2023 Home Value 300000 400000 576890.46
## 2023 Home Value 400000 500000 217328.41
## 2023 Home Value 500000 1000000 905602.99
## 2023 Home Value 100000 254445.99
## 2023 Median Home Value -650055.85
## Pop Grwth 2010 2023 -277128.42
## Pop Grwth 2023 2028 454016.13
## 2023 Population 312998.08
## 2023 Pop Wrk Trav Time 30 Min 111186.92
## 2023 Pop Wrk Trav Time 60 Min...45 -416298.98
## 2023 Pop Wrk Trav Time 60 Min...46 -551725.94
# Extract non-zero coefficients from Lasso model
lasso_coefs <- coef(lasso_model, s = "lambda.min")[-1] # Exclude intercept
keep_X <- which(lasso_coefs != 0)
# Subset predictor variables based on non-zero coefficients from Lasso
X_H <- X_scaled[, keep_X]
# Perform Linear regression
li.eq <- summary(lm(y ~ X_H))
#
# Perform Ridge regression using lambda from Lasso
ri.eq <- glmnet(x = X_scaled, y = y, lambda = lasso_model$lambda.min, family = "gaussian", intercept = FALSE, alpha = 0)
# Ensure all coefficients have the same length
num_features <- max(length(beta), ncol(X_H), length(lasso_coefs), ncol(coef(ri.eq)))
# Create vectors with the same length
beta <- c(beta, rep(NA, num_features - length(beta)))
lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
ridge_coefs <- c(coef(ri.eq)[, 1], rep(NA, num_features - ncol(coef(ri.eq))))
# Ensure all coefficients have the same length
num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), ncol(coef(ri.eq)))
# Create vectors with the same length and fill missing values with NA
beta <- c(beta, rep(NA, num_features - length(beta)))
lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
# Extract Ridge coefficients and fill missing values with NA
ridge_coefs <- predict(ri.eq, s = lasso_model$lambda.min, type = "coefficients")[, 1]
ridge_coefs <- as.vector(ridge_coefs)
# Ensure all coefficients have the same length
num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), length(ridge_coefs))
# Create vectors with the same length and fill missing values with NA
beta <- c(beta, rep(NA, num_features - length(beta)))
lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
li_eq_coefs <- c(li.eq$coefficients, rep(NA, num_features - length(li.eq$coefficients)))
# Ensure all coefficients have the same length
num_features <- max(length(beta), length(li.eq$coefficients), length(lasso_coefs), length(ridge_coefs))
# Pad coefficient vectors with NA if needed
beta <- c(beta, rep(NA, num_features - length(beta)))
li_eq_coefs <- c(li.eq$coefficients, rep(NA, num_features - length(li.eq$coefficients)))
lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
# Ensure all coefficients have the same length
num_features <- max(length(beta), length(li_eq_coefs), length(lasso_coefs), length(ridge_coefs))
#
# Create vectors with the same length and fill missing values with NA
beta <- c(beta, rep(NA, num_features - length(beta)))
li_eq_coefs <- c(li_eq_coefs, rep(NA, num_features - length(li_eq_coefs)))
lasso_coefs <- c(lasso_coefs, rep(NA, num_features - length(lasso_coefs)))
ridge_coefs <- c(ridge_coefs, rep(NA, num_features - length(ridge_coefs)))
# Combine coefficient vectors into a matrix
coef_matrix <- cbind(beta, li_eq_coefs, lasso_coefs, ridge_coefs)
# Convert the matrix to a dataframe
df.comp <- as.data.frame(coef_matrix)
#
# Rename the columns
colnames(df.comp) <- c("beta", "Linear", "Lasso", "Ridge")
#
# Print the coefficients dataframe
print(df.comp)
## beta Linear Lasso Ridge
## 1 function (a, b) , .Internal(beta(a, b)) 6121384 6896307 0
## 2 NA 7007428 0 7044699
## 3 NA 4009916 0 68573.8
## 4 NA 1239946 4033290 -90576.53
## 5 NA -1194137 0 3975915
## 6 NA -1744278 0 -541370.9
## 7 NA 1702244 0 114455.6
## 8 NA 98550.97 0 1150933
## 9 NA -258631.1 0 -717099.4
## 10 NA -1161659 900179 1298148
## 11 NA 1765051 -1039301 1212216
## 12 NA -926558.4 -991622.7 -1390163
## 13 NA 1706347 883102.8 -4845459
## 14 NA -851184.9 0 1597221
## 15 NA -1534307 18053.87 1020904
## 16 NA -486956.7 0 1058183
## 17 NA -338480.8 0 -192817
## 18 NA 1058706 0 -983712.5
## 19 NA 1260835 -124236.8 -1012368
## 20 NA -796623.5 -283008.4 -1681494
## 21 NA 1074649 610197.2 -2620328
## 22 NA 61029.08 -129183.7 252716.3
## 23 NA -805711.7 0 -1583583
## 24 NA 343118.6 876842.6 -663675.9
## 25 NA 484676.7 0 761318.6
## 26 NA 472122.3 -241904 1223259
## 27 NA 499127 0 -500044
## 28 NA 604083 -1097737 377071.9
## 29 NA 563517.7 0 -1717398
## 30 NA 694574 0 -945766.6
## 31 NA 378845.8 0 1024382
## 32 NA 757110.3 0 2721448
## 33 NA 871769.7 -207854.1 -233806.6
## 34 NA 835676.4 -136592.7 -444620.1
## 35 NA 566055.5 425753.5 -200905.2
## 36 NA 580113.4 0 1116089
## 37 NA 740347.9 988541.4 295689.1
## 38 NA 676411 0 1189374
## 39 NA 665930.7 -336579 676340.7
## 40 NA 617704.2 0 -1640758
## 41 NA 594096.2 651974.9 2326562
## 42 NA 502654.1 0 1587111
## 43 NA 486099.7 0 1946678
## 44 NA 519007.1 -86112.85 -923504.8
## 45 NA 921137 -572852 -1019168
## 46 NA 668950.3 NA -1205462
## 47 NA 17.84043 NA NA
## 48 NA 14.45794 NA NA
## 49 NA 8.493382 NA NA
## 50 NA 2.484229 NA NA
## 51 NA -1.976776 NA NA
## 52 NA -3.095338 NA NA
## 53 NA 2.450774 NA NA
## 54 NA 0.2601348 NA NA
## 55 NA -0.341603 NA NA
## 56 NA -1.332529 NA NA
## 57 NA 2.112123 NA NA
## 58 NA -1.636869 NA NA
## 59 NA 2.941403 NA NA
## 60 NA -1.149709 NA NA
## 61 NA -2.268306 NA NA
## 62 NA -0.7312423 NA NA
## 63 NA -0.5479659 NA NA
## 64 NA 1.782044 NA NA
## 65 NA 2.508354 NA NA
## 66 NA -1.638807 NA NA
## 67 NA 2.070587 NA NA
## 68 NA 0.06625408 NA NA
## 69 NA -1.204442 NA NA
## 70 NA 8.146926e-63 NA NA
## 71 NA 1.504414e-43 NA NA
## 72 NA 6.344525e-17 NA NA
## 73 NA 0.0131293 NA NA
## 74 NA 0.04831326 NA NA
## 75 NA 0.002014943 NA NA
## 76 NA 0.01440785 NA NA
## 77 NA 0.7948079 NA NA
## 78 NA 0.7327141 NA NA
## 79 NA 0.1829588 NA NA
## 80 NA 0.03489805 NA NA
## 81 NA 0.1019404 NA NA
## 82 NA 0.003334857 NA NA
## 83 NA 0.2505104 NA NA
## 84 NA 0.02350133 NA NA
## 85 NA 0.4647849 NA NA
## 86 NA 0.5838251 NA NA
## 87 NA 0.07501433 NA NA
## 88 NA 0.01227073 NA NA
## 89 NA 0.1015358 NA NA
## 90 NA 0.0386273 NA NA
## 91 NA 0.9471874 NA NA
## 92 NA 0.2286747 NA NA
lasso_coefs <- lasso_coefs[1:46]
ridge_coefs <- ridge_coefs[1:46]
# Fit a multivariate linear regression model
lm_model <- lm(`Last Sale Price` ~ ., data = df)
# Print the summary of the regression model
summary(lm_model)
##
## Call:
## lm(formula = `Last Sale Price` ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82623712 -2597135 -14414 2321687 268637835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.484e+07 4.080e+07 -0.364 0.71611
## `Number Of Units` 4.520e+04 3.139e+03 14.398 < 2e-16
## `Year Built` 5.871e+03 2.023e+04 0.290 0.77166
## Vacancy -1.154e+04 6.431e+04 -0.179 0.85763
## `Land Area` 2.100e+00 2.577e-01 8.148 1.01e-15
## `House Maint And Repair 2023 Cons Spdng` -6.503e+00 9.739e+00 -0.668 0.50443
## `Household Operations 2023 Cons Spdng` 8.233e-01 1.991e+00 0.414 0.67925
## `HH Grwth 2010 2023` 1.207e+05 5.516e+04 2.188 0.02886
## `HH Grwth 2023 2028` -7.993e+04 1.190e+06 -0.067 0.94648
## `2023 Households` -7.495e+05 7.146e+05 -1.049 0.29449
## `2023 Med HH Size` 3.548e+06 1.563e+06 2.270 0.02337
## `2023 Avg HH Size` -3.660e+06 2.037e+06 -1.797 0.07260
## `HU Grwth 2010 2023` -4.389e+05 1.830e+05 -2.399 0.01663
## `2023 Avg HU Size` 4.376e+05 2.130e+05 2.054 0.04019
## `2023 Avg HU Value` 1.332e+01 1.505e+01 0.885 0.37623
## `2023 Avg Yr Built` 2.682e+05 1.713e+05 1.566 0.11763
## `2023 Group Quarters` -9.404e+02 1.828e+03 -0.514 0.60711
## `2023 Home Blt 1940 1949` -4.573e+03 2.466e+03 -1.854 0.06398
## `2023 Home Blt 1950 1959` -3.108e+03 2.461e+03 -1.263 0.20686
## `2023 Home Blt 1960 1969` -4.044e+03 1.966e+03 -2.057 0.03992
## `2023 Home Blt 1970 1979` -4.507e+03 1.898e+03 -2.375 0.01772
## `2023 Home Blt 1980 1989` -5.808e+02 2.000e+03 -0.290 0.77155
## `2023 Home Blt 1990 1999` -5.509e+03 2.130e+03 -2.586 0.00984
## `2023 Home Blt 2000 2010` -2.127e+03 1.665e+03 -1.278 0.20165
## `2023 Home Blt 2010` 6.067e+02 1.958e+03 0.310 0.75674
## `2023 HU 1 Unit` 1.111e+03 1.082e+03 1.027 0.30478
## `2023 HU 20 Units` -2.562e+02 1.211e+03 -0.212 0.83247
## `2023 HU 2 4 Units` 1.658e+03 1.922e+03 0.863 0.38836
## `2023 HU 5 19 Units` -2.797e+03 1.487e+03 -1.881 0.06020
## `2023 Med Yr Built` -2.670e+05 1.715e+05 -1.557 0.11969
## `2023 Owner Occd Housing` 7.712e+05 7.641e+05 1.009 0.31310
## `2023 Renter Occd Housing` 7.529e+05 7.145e+05 1.054 0.29222
## `2023 Home Value 1000000` -1.350e+04 4.783e+05 -0.028 0.97749
## `2023 Home Value 100000 200000` -1.049e+04 4.780e+05 -0.022 0.98249
## `2023 Home Value 200000 300000` -1.011e+04 4.780e+05 -0.021 0.98313
## `2023 Home Value 300000 400000` -4.351e+03 4.780e+05 -0.009 0.99274
## `2023 Home Value 400000 500000` -7.447e+03 4.781e+05 -0.016 0.98757
## `2023 Home Value 500000 1000000` -1.165e+02 4.780e+05 0.000 0.99981
## `2023 Home Value 100000` -8.129e+03 4.781e+05 -0.017 0.98644
## `2023 Median Home Value` -1.987e+01 1.460e+01 -1.361 0.17369
## `Pop Grwth 2010 2023` 3.095e+05 1.816e+05 1.704 0.08870
## `Pop Grwth 2023 2028` 2.842e+05 1.209e+06 0.235 0.81412
## `2023 Population` 4.303e+02 8.436e+02 0.510 0.61012
## `2023 Pop Wrk Trav Time 30 Min` -9.546e+02 1.083e+03 -0.882 0.37823
## `2023 Pop Wrk Trav Time 60 Min...45` -1.370e+03 1.355e+03 -1.011 0.31228
## `2023 Pop Wrk Trav Time 60 Min...46` -4.700e+03 2.859e+03 -1.644 0.10055
##
## (Intercept)
## `Number Of Units` ***
## `Year Built`
## Vacancy
## `Land Area` ***
## `House Maint And Repair 2023 Cons Spdng`
## `Household Operations 2023 Cons Spdng`
## `HH Grwth 2010 2023` *
## `HH Grwth 2023 2028`
## `2023 Households`
## `2023 Med HH Size` *
## `2023 Avg HH Size` .
## `HU Grwth 2010 2023` *
## `2023 Avg HU Size` *
## `2023 Avg HU Value`
## `2023 Avg Yr Built`
## `2023 Group Quarters`
## `2023 Home Blt 1940 1949` .
## `2023 Home Blt 1950 1959`
## `2023 Home Blt 1960 1969` *
## `2023 Home Blt 1970 1979` *
## `2023 Home Blt 1980 1989`
## `2023 Home Blt 1990 1999` **
## `2023 Home Blt 2000 2010`
## `2023 Home Blt 2010`
## `2023 HU 1 Unit`
## `2023 HU 20 Units`
## `2023 HU 2 4 Units`
## `2023 HU 5 19 Units` .
## `2023 Med Yr Built`
## `2023 Owner Occd Housing`
## `2023 Renter Occd Housing`
## `2023 Home Value 1000000`
## `2023 Home Value 100000 200000`
## `2023 Home Value 200000 300000`
## `2023 Home Value 300000 400000`
## `2023 Home Value 400000 500000`
## `2023 Home Value 500000 1000000`
## `2023 Home Value 100000`
## `2023 Median Home Value`
## `Pop Grwth 2010 2023` .
## `Pop Grwth 2023 2028`
## `2023 Population`
## `2023 Pop Wrk Trav Time 30 Min`
## `2023 Pop Wrk Trav Time 60 Min...45`
## `2023 Pop Wrk Trav Time 60 Min...46`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560000 on 1089 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.4728, Adjusted R-squared: 0.451
## F-statistic: 21.7 on 45 and 1089 DF, p-value: < 2.2e-16
# Extract coefficients from the multivariate linear regression model
lm_coefs <- coef(lm_model)
# Combine all coefficient vectors into a dataframe
df.comp <- data.frame(
Linear = lm_coefs, # Coefficients from the multivariate linear regression model
Lasso = lasso_coefs, # Coefficients from the lasso regression model
Ridge = ridge_coefs # Coefficients from the ridge regression model
)
# Print the coefficients dataframe
print(df.comp)
## Linear Lasso Ridge
## (Intercept) -1.483973e+07 6896306.68 0.00
## `Number Of Units` 4.520057e+04 0.00 7044699.21
## `Year Built` 5.871427e+03 0.00 68573.80
## Vacancy -1.153896e+04 4033290.13 -90576.53
## `Land Area` 2.100077e+00 0.00 3975914.59
## `House Maint And Repair 2023 Cons Spdng` -6.503166e+00 0.00 -541370.86
## `Household Operations 2023 Cons Spdng` 8.233457e-01 0.00 114455.56
## `HH Grwth 2010 2023` 1.206935e+05 0.00 1150932.96
## `HH Grwth 2023 2028` -7.992534e+04 0.00 -717099.43
## `2023 Households` -7.494807e+05 900178.95 1298147.57
## `2023 Med HH Size` 3.548246e+06 -1039301.39 1212215.77
## `2023 Avg HH Size` -3.660241e+06 -991622.69 -1390163.34
## `HU Grwth 2010 2023` -4.388886e+05 883102.82 -4845459.37
## `2023 Avg HU Size` 4.376257e+05 0.00 1597221.12
## `2023 Avg HU Value` 1.331867e+01 18053.87 1020903.81
## `2023 Avg Yr Built` 2.682260e+05 0.00 1058183.00
## `2023 Group Quarters` -9.403814e+02 0.00 -192816.99
## `2023 Home Blt 1940 1949` -4.573036e+03 0.00 -983712.45
## `2023 Home Blt 1950 1959` -3.108226e+03 -124236.76 -1012367.93
## `2023 Home Blt 1960 1969` -4.043533e+03 -283008.45 -1681493.68
## `2023 Home Blt 1970 1979` -4.506785e+03 610197.19 -2620327.90
## `2023 Home Blt 1980 1989` -5.807567e+02 -129183.67 252716.29
## `2023 Home Blt 1990 1999` -5.509304e+03 0.00 -1583583.17
## `2023 Home Blt 2000 2010` -2.126723e+03 876842.56 -663675.85
## `2023 Home Blt 2010` 6.067036e+02 0.00 761318.61
## `2023 HU 1 Unit` 1.110805e+03 -241903.95 1223259.21
## `2023 HU 20 Units` -2.562441e+02 0.00 -500043.98
## `2023 HU 2 4 Units` 1.658187e+03 -1097736.71 377071.92
## `2023 HU 5 19 Units` -2.796718e+03 0.00 -1717397.83
## `2023 Med Yr Built` -2.670147e+05 0.00 -945766.56
## `2023 Owner Occd Housing` 7.711626e+05 0.00 1024382.29
## `2023 Renter Occd Housing` 7.528854e+05 0.00 2721447.56
## `2023 Home Value 1000000` -1.349811e+04 -207854.07 -233806.59
## `2023 Home Value 100000 200000` -1.049087e+04 -136592.67 -444620.05
## `2023 Home Value 200000 300000` -1.010864e+04 425753.53 -200905.15
## `2023 Home Value 300000 400000` -4.350944e+03 0.00 1116088.80
## `2023 Home Value 400000 500000` -7.447363e+03 988541.39 295689.13
## `2023 Home Value 500000 1000000` -1.165058e+02 0.00 1189373.64
## `2023 Home Value 100000` -8.128898e+03 -336579.02 676340.73
## `2023 Median Home Value` -1.987150e+01 0.00 -1640757.79
## `Pop Grwth 2010 2023` 3.094792e+05 651974.89 2326561.57
## `Pop Grwth 2023 2028` 2.842377e+05 0.00 1587110.67
## `2023 Population` 4.302822e+02 0.00 1946678.46
## `2023 Pop Wrk Trav Time 30 Min` -9.546439e+02 -86112.85 -923504.82
## `2023 Pop Wrk Trav Time 60 Min...45` -1.370296e+03 -572852.03 -1019167.69
## `2023 Pop Wrk Trav Time 60 Min...46` -4.699641e+03 NA -1205461.91
# Plotting coefficients
barplot(t(df.comp), beside = TRUE, col = c("blue", "green", "red"),
main = "Coefficients from Different Models",
xlab = "Variables", ylab = "Coefficients",
legend.text = TRUE)
legend("topright", inset = 0.05, legend = colnames(df.comp), fill = c("blue", "green", "red"))
# Calculate the number of missing coefficients to fill
missing_coef_count <- num_features - length(ridge_coefs)
# Create a vector with missing coefficients filled with NA
if (missing_coef_count > 0) {
ridge_coefs <- c(ridge_coefs, rep(NA, missing_coef_count))
}
# Split the data into training and testing sets (80% train, 20% test)
set.seed(42) # for reproducibility
train_index <- sample(1:nrow(df), 0.8 * nrow(df))
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
# Define the formula for the regression model
formula <- as.formula("`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `2023 Med HH Size` + `2023 Avg HH Size` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`")
# Fit the linear regression model
model <- lm(formula, data = train_data)
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)
# Evaluate the model
mse <- mean((test_data$Last_Sale_Price - predictions)^2)
r_squared <- summary(model)$r.squared
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: NaN
cat("R-squared:", r_squared, "\n")
## R-squared: 0.43557
# Print the coefficients
coefficients <- coef(model)
print(coefficients)
## (Intercept) `Number Of Units`
## 2.653295e+06 4.843101e+04
## `Land Area` `2023 Households`
## 1.864141e+00 2.549349e+02
## `2023 Med HH Size` `2023 Avg HH Size`
## 4.127027e+06 -5.124950e+06
## `HU Grwth 2010 2023` `2023 Home Value 1000000`
## -3.276234e+05 8.484266e+03
## `2023 Home Value 100000 200000` `2023 Home Value 200000 300000`
## -7.919315e+02 -2.016957e+03
## `Pop Grwth 2010 2023`
## 3.343593e+05
# Summary of the linear regression model
summary(model)
##
## Call:
## lm(formula = formula, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86271983 -2159783 160925 1964770 273147482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.653e+06 3.232e+06 0.821 0.41186
## `Number Of Units` 4.843e+04 3.524e+03 13.742 < 2e-16 ***
## `Land Area` 1.864e+00 2.749e-01 6.782 2.15e-11 ***
## `2023 Households` 2.549e+02 3.582e+02 0.712 0.47687
## `2023 Med HH Size` 4.127e+06 1.750e+06 2.358 0.01857 *
## `2023 Avg HH Size` -5.125e+06 1.600e+06 -3.203 0.00141 **
## `HU Grwth 2010 2023` -3.276e+05 1.539e+05 -2.129 0.03353 *
## `2023 Home Value 1000000` 8.484e+03 7.432e+03 1.142 0.25395
## `2023 Home Value 100000 200000` -7.919e+02 1.402e+03 -0.565 0.57244
## `2023 Home Value 200000 300000` -2.017e+03 1.581e+03 -1.275 0.20252
## `Pop Grwth 2010 2023` 3.344e+05 1.596e+05 2.095 0.03649 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12260000 on 898 degrees of freedom
## Multiple R-squared: 0.4356, Adjusted R-squared: 0.4293
## F-statistic: 69.3 on 10 and 898 DF, p-value: < 2.2e-16
# Make predictions on the testing set
predictions <- predict(model, newdata = test_data)
# Calculate residuals
residuals <- residuals(model)
# Ensure both predictions and residuals have the same length
n <- min(length(predictions), length(residuals))
predictions <- predictions[1:n]
residuals <- residuals[1:n]
# Plotting residuals vs. fitted values
plot(predictions, residuals)
abline(h = 0, col = "red") # Add a horizontal line at residual = 0
#
# # Load necessary packages
# library(car)
#
# # Combine all coefficient vectors into a dataframe
# df.comp <- data.frame(
# Linear = lm_coefs, # Coefficients from the multivariate linear regression model
# Lasso = lasso_coefs, # Coefficients from the lasso regression model
# Ridge = ridge_coefs # Coefficients from the ridge regression model
# )
#
# # Calculate VIF for each variable
# vif_values <- sapply(df.comp, function(x) {
# if (is.numeric(x)) {
# vif(lm(x ~ ., data = df.comp))
# } else {
# NA
# }
# })
#
# # Combine VIF values into a dataframe
# vif_df <- data.frame(VIF = vif_values)
#
# # Print or use VIF dataframe as needed
# print(vif_df)
# summary(model)
#
# # Fit the linear regression model
# model <- lm(formula, data = train_data)
#
# # Check the significance and direction of coefficients
# summary_model <- summary(model)
#
# # Identify statistically insignificant variables with high p-values
# insignificant_vars <- names(summary_model$coefficients[,"Pr(>|t|)"][summary_model$coefficients[,"Pr(>|t|)"] > 0.05])
#
# # Drop statistically insignificant variables from both training and testing datasets
# train_data <- train_data[, !names(train_data) %in% insignificant_vars]
# test_data <- test_data[, !names(test_data) %in% insignificant_vars]
#
# # Refit the model with updated variables
# model <- lm(formula, data = train_data)
#
# # Make predictions on the testing set
# predictions <- predict(model, newdata = test_data)
#
# # Evaluate the model
# mse <- mean((test_data$Last_Sale_Price - predictions)^2)
# r_squared <- summary(model)$r.squared
# cat("Mean Squared Error:", mse, "\n")
# cat("R-squared:", r_squared, "\n")
#
# # Print the coefficients
# coefficients <- coef(model)
# print(coefficients)
#
#
# # Plot the coefficients
# plot(model)
#
# # Print the summary
# summary(model)
#
# # Fit the linear regression model after removing '2023 Med HH Size' and '2023 Avg HH Size'
# model <- lm(`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`, data = train_data)
#
# # Check the significance and direction of coefficients
# summary_model <- summary(model)
#
# # Identify statistically insignificant variables with high p-values
# insignificant_vars <- names(summary_model$coefficients[,"Pr(>|t|)"][summary_model$coefficients[,"Pr(>|t|)"] > 0.05])
#
# # Drop statistically insignificant variables from both training and testing datasets
# train_data <- train_data[, !names(train_data) %in% insignificant_vars]
# test_data <- test_data[, !names(test_data) %in% insignificant_vars]
#
# # Refit the model with updated variables
# model <- lm(`Last Sale Price` ~ `Number Of Units` + `Land Area` + `2023 Households` + `HU Grwth 2010 2023` + `2023 Home Value 1000000` + `2023 Home Value 100000 200000` + `2023 Home Value 200000 300000` + `Pop Grwth 2010 2023`, data = train_data)
#
# # Make predictions on the testing set
# predictions <- predict(model, newdata = test_data)
#
# # Evaluate the model
# mse <- mean((test_data$Last_Sale_Price - predictions)^2)
# r_squared <- summary(model)$r.squared
# cat("Mean Squared Error:", mse, "\n")
# cat("R-squared:", r_squared, "\n")
#
# # Plot the coefficients
# plot(model)
#
# # Print the summary
# summary(model)
#
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.