R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Reset the session state before the analysis starts.
# NOTE(review): rm(list = ls()) removes only the objects that ls() lists in
# the current environment; packages attached earlier (e.g. dplyr) stay
# attached and options are not reset -- confirm this partial reset is intended.
rm(list = ls())  # Remove every object returned by ls()
gc()             # Run garbage collection; prints the memory-usage table
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  691989 37.0    1295801 69.3  1295801 69.3
## Vcells 1244867  9.5    8388608 64.0  2005336 15.3
# Emit a form-feed character. NOTE(review): presumably meant to clear the
# RStudio console; likely has no useful effect in a knitted document --
# TODO confirm.
cat("\f")        # Clear the console
# Close the active graphics device, but only when at least one is open.
# NOTE(review): dev.off() shuts down the current device only, so with several
# devices open this does not clear all plots.
if(!is.null(dev.list())) dev.off() # Close the current plot device
## null device 
##           1
library(readxl)

# Import the raw workbook with readxl and coerce the result to a base
# data.frame in the same expression.
df <- as.data.frame(
  read_excel("Raw Data NEW 02162024.xlsx")
)

# Print a transposed preview: one line per column with its type and the
# first few values.
glimpse(df)
## Rows: 1,138
## Columns: 49
## $ `Number Of Units`                            <dbl> 25, 8, 18, 17, 13, 125, 2…
## $ `Year Built`                                 <dbl> 1984, 1950, 1990, 2007, 1…
## $ `Vacancy %`                                  <dbl> 7.04, 10.26, 6.31, 4.61, …
## $ `Land Area (SF)`                             <dbl> 108900, 277477, 67965, 17…
## $ `Last Sale Date`                             <dttm> 2022-04-20, 2022-05-13, …
## $ `Last Sale Price`                            <dbl> 3100000, 800000, 1000000,…
## $ `House Maint & Repair 2023 Cons Spdng $(1m)` <dbl> 2315630, 178754, 7470769,…
## $ `Household Operations 2023 Cons Spdng $(1m)` <dbl> 2875112, 197441, 8760263,…
## $ `% HH Grwth 2010-2023(1m)`                   <dbl> 1.17, 46.67, 2.66, 3.29, …
## $ `% HH Grwth 2023-2028(1m)`                   <dbl> 9.88, 7.91, -1.30, -0.64,…
## $ `2023 Households(1m)`                        <dbl> 2571, 139, 8107, 156, 21,…
## $ `2023 Med HH Size(1m)`                       <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2…
## $ `2023 Avg HH Size(1m)`                       <dbl> 2.3, 3.0, 2.0, 2.1, 2.2, …
## $ `% HU Grwth 2010-2023(1m)`                   <dbl> 8.03, 51.09, 5.03, 10.64,…
## $ `2023 Avg HU Size(1m)`                       <dbl> 4, 2, 9, 1, 1, 7, 9, 2, 1…
## $ `2023 Avg HU Value(1m)`                      <dbl> 215342, 225340, 240448, 1…
## $ `2023 Avg Yr Built(1m)`                      <dbl> 1971, 2003, 1974, 1984, 1…
## $ `2023 Group Quarters(1m)`                    <dbl> 491, 0, 130, 0, 0, 62, 20…
## $ `2023 Home Blt 1940-1949(1m)`                <dbl> 849, 1, 763, 7, 4, 583, 3…
## $ `2023 Home Blt 1950-1959(1m)`                <dbl> 381, 2, 1500, 4, 1, 2772,…
## $ `2023 Home Blt 1960-1969(1m)`                <dbl> 277, 2, 1854, 7, 3, 1745,…
## $ `2023 Home Blt 1970-1979(1m)`                <dbl> 247, 8, 3048, 83, 5, 1907…
## $ `2023 Home Blt 1980-1989(1m)`                <dbl> 310, 8, 1360, 52, 6, 786,…
## $ `2023 Home Blt 1990-1999(1m)`                <dbl> 284, 12, 268, 40, 8, 414,…
## $ `2023 Home Blt 2000-2010(1m)`                <dbl> 372, 72, 917, 21, 10, 56,…
## $ `2023 Home Blt 2010+(1m)`                    <dbl> 145, 49, 283, 8, 0, 1220,…
## $ `2023 HU 1 Unit(1m)`                         <dbl> 1985, 120, 3791, 77, 16, …
## $ `2023 HU 20+ Units(1m)`                      <dbl> 88, 3, 1353, 0, 0, 1658, …
## $ `2023 HU 2-4 Units(1m)`                      <dbl> 127, 0, 926, 0, 0, 458, 4…
## $ `2023 HU 5-19 Units(1m)`                     <dbl> 432, 5, 1522, 0, 0, 546, …
## $ `2023 Med Yr Built(1m)`                      <dbl> 1966, 2006, 1972, 1981, 1…
## $ `2023 Owner Occ'd Housing(1m)`               <dbl> 1343, 103, 4289, 124, 18,…
## $ `2023 Renter Occ'd Housing(1m)`              <dbl> 1229, 36, 3818, 32, 3, 20…
## $ `2023 Home Value $1,000,000+(1m)`            <dbl> 3, 0, 106, 0, 4, 15, 117,…
## $ `2023 Home Value $100,000-200,000(1m)`       <dbl> 514, 28, 919, 29, 4, 1958…
## $ `2023 Home Value $200,000-300,000(1m)`       <dbl> 341, 56, 671, 3, 2, 1352,…
## $ `2023 Home Value $300,000-400,000(1m)`       <dbl> 106, 12, 579, 2, 2, 730, …
## $ `2023 Home Value $400,000-500,000(1m)`       <dbl> 105, 1, 80, 0, 0, 355, 15…
## $ `2023 Home Value $500,000-1,000,000(1m)`     <dbl> 33, 0, 361, 12, 0, 324, 6…
## $ `2023 Home Value <100,000(1m)`               <dbl> 242, 6, 1573, 78, 5, 1575…
## $ `2023 Median Home Value(1m)`                 <dbl> 183657, 231250, 162187, 7…
## $ `% Pop Grwth 2010-2023(1m)`                  <dbl> 9.44, 51.61, 4.49, 10.56,…
## $ `% Pop Grwth 2023-2028(1m)`                  <dbl> 10.34, 8.04, -1.39, 0.00,…
## $ `2023 Population(1m)`                        <dbl> 6479, 423, 16666, 335, 51…
## $ `2023 Pop Wrk Trav Time <30 Min(1m)`         <dbl> 1882, 104, 5237, 79, 11, …
## $ `2023 Pop Wrk Trav Time 30-60 Min(1m)`       <dbl> 476, 93, 2031, 49, 2, 183…
## $ `2023 Pop Wrk Trav Time 60+ Min(1m)`         <dbl> 202, 21, 377, 4, 2, 234, …
## $ `County Name`                                <chr> "St. Johns", "Duval", "Pi…
## $ City                                         <chr> "Saint Augustine", "Jacks…
library(dplyr)

# Normalise every column name of `df` in one vectorised step:
# spaces become underscores and all letters are lower-cased
# (e.g. "Number Of Units" -> "number_of_units").
# This replaces a hand-written rename of two columns followed by a loop that
# called rename() once per column; the final names are identical.
# Other special characters (%, $, parentheses, ...) are left untouched.

# Names as they were before normalisation.
original_variable_names <- names(df)

# fixed = TRUE: the pattern is a literal space, no regex needed.
clean_variable_names <- tolower(
  gsub(" ", "_", original_variable_names, fixed = TRUE)
)

# Fail loudly if two columns would collapse to the same name, instead of
# silently producing an ambiguous data frame.
stopifnot(anyDuplicated(clean_variable_names) == 0L)

names(df) <- clean_variable_names

# Print the resulting data frame
#print(df)

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Summary-statistics table (N, mean, st. dev., min, max) for the numeric
# columns of df.
# type = "text" prints a readable plain-text table. The default LaTeX output
# is only useful when the chunk is rendered to PDF with results = "asis";
# otherwise it appears as raw LaTeX markup in the knitted report.
stargazer::stargazer(df, type = "text")
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Tue, Mar 19, 2024 - 22:49:17
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\ 
## \hline \\[-1.8ex] 
## number\_of\_units & 1,138 & 119.289 & 158.890 & 1 & 1,519 \\ 
## year\_built & 1,059 & 1,970.433 & 18.469 & 1,900 & 2,022 \\ 
## vacancy\_\% & 1,122 & 6.209 & 5.589 & 0.060 & 100.000 \\ 
## land\_area\_(sf) & 1,129 & 874,976.600 & 1,868,040.000 & 0 & 25,501,431 \\ 
## last\_sale\_price & 1,138 & 6,119,103.000 & 15,585,241.000 & 2,600 & 363,125,000 \\ 
## house\_maint\_&\_repair\_2023\_cons\_spdng\_\$(1m) & 1,137 & 3,201,500.000 & 2,408,647.000 & 0 & 12,607,149 \\ 
## household\_operations\_2023\_cons\_spdng\_\$(1m) & 1,137 & 3,457,632.000 & 2,645,364.000 & 0 & 18,560,824 \\ 
## \%\_hh\_grwth\_2010-2023(1m) & 1,137 & 12.449 & 17.914 & $-$24.460 & 157.930 \\ 
## \%\_hh\_grwth\_2023-2028(1m) & 1,137 & 2.776 & 3.856 & $-$10.790 & 17.240 \\ 
## 2023\_households(1m) & 1,137 & 3,034.502 & 2,401.237 & 0 & 12,942 \\ 
## 2023\_med\_hh\_size(1m) & 1,137 & 2.099 & 0.326 & 0 & 4 \\ 
## 2023\_avg\_hh\_size(1m) & 1,137 & 2.414 & 0.362 & 0.000 & 4.100 \\ 
## \%\_hu\_grwth\_2010-2023(1m) & 1,137 & 17.446 & 23.320 & $-$49.330 & 188.330 \\ 
## 2023\_avg\_hu\_size(1m) & 1,137 & 4.586 & 3.583 & 0 & 19 \\ 
## 2023\_avg\_hu\_value(1m) & 1,137 & 227,786.600 & 96,678.880 & 0 & 764,965 \\ 
## 2023\_avg\_yr\_built(1m) & 1,137 & 1,980.332 & 83.623 & 0 & 2,008 \\ 
## 2023\_group\_quarters(1m) & 1,137 & 104.622 & 233.940 & 0 & 3,287 \\ 
## 2023\_home\_blt\_1940-1949(1m) & 1,137 & 169.369 & 309.808 & 0 & 4,433 \\ 
## 2023\_home\_blt\_1950-1959(1m) & 1,137 & 345.573 & 512.421 & 0 & 3,297 \\ 
## 2023\_home\_blt\_1960-1969(1m) & 1,137 & 489.467 & 599.101 & 0 & 4,157 \\ 
## 2023\_home\_blt\_1970-1979(1m) & 1,137 & 770.105 & 826.316 & 0 & 5,872 \\ 
## 2023\_home\_blt\_1980-1989(1m) & 1,137 & 688.462 & 619.666 & 0 & 4,214 \\ 
## 2023\_home\_blt\_1990-1999(1m) & 1,137 & 391.283 & 359.370 & 0 & 3,347 \\ 
## 2023\_home\_blt\_2000-2010(1m) & 1,137 & 390.953 & 409.312 & 0 & 3,539 \\ 
## 2023\_home\_blt\_2010+(1m) & 1,137 & 335.470 & 407.243 & 0 & 2,570 \\ 
## 2023\_hu\_1\_unit(1m) & 1,137 & 1,732.252 & 1,280.847 & 0 & 8,026 \\ 
## 2023\_hu\_20+\_units(1m) & 1,137 & 386.996 & 724.563 & 0 & 7,773 \\ 
## 2023\_hu\_2-4\_units(1m) & 1,137 & 293.786 & 391.613 & 0 & 2,743 \\ 
## 2023\_hu\_5-19\_units(1m) & 1,137 & 420.390 & 621.173 & 0 & 4,326 \\ 
## 2023\_med\_yr\_built(1m) & 1,137 & 1,979.392 & 83.813 & 0 & 2,010 \\ 
## 2023\_owner\_occ'd\_housing(1m) & 1,137 & 1,860.248 & 1,393.002 & 0 & 7,443 \\ 
## 2023\_renter\_occ'd\_housing(1m) & 1,137 & 1,174.253 & 1,233.330 & 0 & 8,801 \\ 
## 2023\_home\_value\_\$1,000,000+(1m) & 1,137 & 24.793 & 60.630 & 0 & 551 \\ 
## 2023\_home\_value\_\$100,000-200,000(1m) & 1,137 & 531.785 & 500.642 & 0 & 3,138 \\ 
## 2023\_home\_value\_\$200,000-300,000(1m) & 1,137 & 409.989 & 379.426 & 0 & 2,179 \\ 
## 2023\_home\_value\_\$300,000-400,000(1m) & 1,137 & 198.217 & 237.679 & 0 & 1,649 \\ 
## 2023\_home\_value\_\$400,000-500,000(1m) & 1,137 & 75.173 & 119.113 & 0 & 1,179 \\ 
## 2023\_home\_value\_\$500,000-1,000,000(1m) & 1,137 & 100.508 & 144.795 & 0 & 1,829 \\ 
## 2023\_home\_value\_\textless 100,000(1m) & 1,137 & 519.798 & 585.554 & 0 & 4,566 \\ 
## 2023\_median\_home\_value(1m) & 1,137 & 188,513.000 & 92,171.900 & 0 & 767,973 \\ 
## \%\_pop\_grwth\_2010-2023(1m) & 1,137 & 17.372 & 22.500 & $-$48.460 & 189.240 \\ 
## \%\_pop\_grwth\_2023-2028(1m) & 1,137 & 2.804 & 3.809 & $-$8.960 & 17.130 \\ 
## 2023\_population(1m) & 1,137 & 7,401.467 & 5,884.947 & 0 & 32,505 \\ 
## 2023\_pop\_wrk\_trav\_time\_\textless 30\_min(1m) & 1,137 & 2,057.947 & 1,764.772 & 0 & 8,511 \\ 
## 2023\_pop\_wrk\_trav\_time\_30-60\_min(1m) & 1,137 & 988.638 & 1,018.898 & 0 & 7,965 \\ 
## 2023\_pop\_wrk\_trav\_time\_60+\_min(1m) & 1,137 & 245.870 & 287.865 & 0 & 2,940 \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}
library(ggcorrplot)
## Loading required package: ggplot2
# Help lookups for the functions used in the correlation analysis below.
# NOTE(review): these look like leftovers from interactive development and do
# not affect the analysis -- consider removing them before knitting; TODO
# confirm they are not needed in the rendered report.
?ggcorrplot()
?cor


?cor_pmat # cor_pmat(): computes a matrix of correlation p-values.

# Correlation analysis over every numeric column of df.
# Columns are selected by type rather than by position (previously
# c(1:4, 6:47)), so the date and character columns stay excluded even if the
# column order of the spreadsheet changes.
numeric_vars <- df[, vapply(df, is.numeric, logical(1)), drop = FALSE]

# Spearman rank correlations; each pair uses all rows where both are present.
mycorr <- cor(
  x      = numeric_vars,
  use    = "pairwise.complete.obs",
  method = "spearman"
)

# The p-values must come from the same test as the coefficients. cor_pmat()
# forwards extra arguments to stats::cor.test(), whose default method is
# Pearson, so the method is passed explicitly. exact = FALSE requests the
# asymptotic p-value, because exact Spearman p-values cannot be computed when
# the data contain ties.
p.mat <- ggcorrplot::cor_pmat(
  x      = numeric_vars,
  method = "spearman",
  exact  = FALSE
)

# head(p.mat)
library(ggcorrplot)

# Lower-triangle heat map of the correlation matrix, ordered by hierarchical
# clustering, with coefficients printed and insignificant cells crossed out.
myplot <- ggcorrplot(
  corr     = mycorr,                     # correlation matrix to visualise
  method   = "square",                   # glyph shape: "square" or "circle"
  type     = "lower",                    # "full", "lower" or "upper"
  title    = "Correlation Plot",         # plot title
  colors   = c("red", "white", "green"), # low, mid and high correlation
  lab      = TRUE,                       # print coefficients on the plot
  lab_size = 2,                          # coefficient label size
  p.mat    = p.mat,                      # p-values used to flag cells
  insig    = "pch",                      # mark insignificant cells ("blank" hides them)
  pch      = 4,                          # marker symbol for insignificant cells
  hc.order = TRUE,                       # reorder variables using hclust
  tl.cex   = 8,                          # axis text label size
  tl.col   = "black",                    # axis text label colour
  digits   = 2                           # decimals shown for coefficients
)
myplot

# Explore data
# Print the structure of df: its dimensions, then each column's name, type
# and first values.
str(df)
## 'data.frame':    1138 obs. of  49 variables:
##  $ number_of_units                           : num  25 8 18 17 13 125 236 22 85 5 ...
##  $ year_built                                : num  1984 1950 1990 2007 1977 ...
##  $ vacancy_%                                 : num  7.04 10.26 6.31 4.61 6.29 ...
##  $ land_area_(sf)                            : num  108900 277477 67965 174240 348480 ...
##  $ last_sale_date                            : POSIXct, format: "2022-04-20" "2022-05-13" ...
##  $ last_sale_price                           : num  3100000 800000 1000000 600000 750000 ...
##  $ house_maint_&_repair_2023_cons_spdng_$(1m): num  2315630 178754 7470769 180919 26145 ...
##  $ household_operations_2023_cons_spdng_$(1m): num  2875112 197441 8760263 158527 21370 ...
##  $ %_hh_grwth_2010-2023(1m)                  : num  1.17 46.67 2.66 3.29 0 ...
##  $ %_hh_grwth_2023-2028(1m)                  : num  9.88 7.91 -1.3 -0.64 0 ...
##  $ 2023_households(1m)                       : num  2571 139 8107 156 21 ...
##  $ 2023_med_hh_size(1m)                      : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ 2023_avg_hh_size(1m)                      : num  2.3 3 2 2.1 2.2 2.1 2.1 2.9 1.8 2.5 ...
##  $ %_hu_grwth_2010-2023(1m)                  : num  8.03 51.09 5.03 10.64 -25 ...
##  $ 2023_avg_hu_size(1m)                      : num  4 2 9 1 1 7 9 2 17 3 ...
##  $ 2023_avg_hu_value(1m)                     : num  215342 225340 240448 157097 394118 ...
##  $ 2023_avg_yr_built(1m)                     : num  1971 2003 1974 1984 1984 ...
##  $ 2023_group_quarters(1m)                   : num  491 0 130 0 0 62 209 102 21 75 ...
##  $ 2023_home_blt_1940-1949(1m)               : num  849 1 763 7 4 583 363 12 77 124 ...
##  $ 2023_home_blt_1950-1959(1m)               : num  381 2 1500 4 1 ...
##  $ 2023_home_blt_1960-1969(1m)               : num  277 2 1854 7 3 ...
##  $ 2023_home_blt_1970-1979(1m)               : num  247 8 3048 83 5 ...
##  $ 2023_home_blt_1980-1989(1m)               : num  310 8 1360 52 6 ...
##  $ 2023_home_blt_1990-1999(1m)               : num  284 12 268 40 8 ...
##  $ 2023_home_blt_2000-2010(1m)               : num  372 72 917 21 10 56 135 53 70 918 ...
##  $ 2023_home_blt_2010+(1m)                   : num  145 49 283 8 0 ...
##  $ 2023_hu_1_unit(1m)                        : num  1985 120 3791 77 16 ...
##  $ 2023_hu_20+_units(1m)                     : num  88 3 1353 0 0 ...
##  $ 2023_hu_2-4_units(1m)                     : num  127 0 926 0 0 458 435 23 252 33 ...
##  $ 2023_hu_5-19_units(1m)                    : num  432 5 1522 0 0 ...
##  $ 2023_med_yr_built(1m)                     : num  1966 2006 1972 1981 1988 ...
##  $ 2023_owner_occ'd_housing(1m)              : num  1343 103 4289 124 18 ...
##  $ 2023_renter_occ'd_housing(1m)             : num  1229 36 3818 32 3 ...
##  $ 2023_home_value_$1,000,000+(1m)           : num  3 0 106 0 4 15 117 0 28 8 ...
##  $ 2023_home_value_$100,000-200,000(1m)      : num  514 28 919 29 4 ...
##  $ 2023_home_value_$200,000-300,000(1m)      : num  341 56 671 3 2 ...
##  $ 2023_home_value_$300,000-400,000(1m)      : num  106 12 579 2 2 730 543 4 322 151 ...
##  $ 2023_home_value_$400,000-500,000(1m)      : num  105 1 80 0 0 355 159 1 296 21 ...
##  $ 2023_home_value_$500,000-1,000,000(1m)    : num  33 0 361 12 0 324 624 0 790 99 ...
##  $ 2023_home_value_<100,000(1m)              : num  242 6 1573 78 5 ...
##  $ 2023_median_home_value(1m)                : num  183657 231250 162187 79486 187499 ...
##  $ %_pop_grwth_2010-2023(1m)                 : num  9.44 51.61 4.49 10.56 -26.09 ...
##  $ %_pop_grwth_2023-2028(1m)                 : num  10.34 8.04 -1.39 0 -1.96 ...
##  $ 2023_population(1m)                       : num  6479 423 16666 335 51 ...
##  $ 2023_pop_wrk_trav_time_<30_min(1m)        : num  1882 104 5237 79 11 ...
##  $ 2023_pop_wrk_trav_time_30-60_min(1m)      : num  476 93 2031 49 2 ...
##  $ 2023_pop_wrk_trav_time_60+_min(1m)        : num  202 21 377 4 2 234 274 38 145 136 ...
##  $ county_name                               : chr  "St. Johns" "Duval" "Pinellas" "Okeechobee" ...
##  $ city                                      : chr  "Saint Augustine" "Jacksonville" "Largo" "Okeechobee" ...
# Per-column summary: min, quartiles, mean, max and NA count for numeric and
# date columns; length, class and mode for character columns.
summary(df)
##  number_of_units    year_built     vacancy_%       land_area_(sf)    
##  Min.   :   1.0   Min.   :1900   Min.   :  0.060   Min.   :       0  
##  1st Qu.:  22.0   1st Qu.:1958   1st Qu.:  4.650   1st Qu.:  118699  
##  Median :  60.0   Median :1971   Median :  5.680   Median :  302624  
##  Mean   : 119.3   Mean   :1970   Mean   :  6.209   Mean   :  874977  
##  3rd Qu.: 150.0   3rd Qu.:1983   3rd Qu.:  6.680   3rd Qu.:  899078  
##  Max.   :1519.0   Max.   :2022   Max.   :100.000   Max.   :25501431  
##                   NA's   :79     NA's   :16        NA's   :9         
##  last_sale_date                   last_sale_price    
##  Min.   :1995-05-26 00:00:00.00   Min.   :     2600  
##  1st Qu.:2012-01-26 00:00:00.00   1st Qu.:   740000  
##  Median :2018-04-29 12:00:00.00   Median :  1800000  
##  Mean   :2015-10-06 07:53:15.07   Mean   :  6119103  
##  3rd Qu.:2021-05-23 06:00:00.00   3rd Qu.:  5918750  
##  Max.   :2024-02-06 00:00:00.00   Max.   :363125000  
##                                                      
##  house_maint_&_repair_2023_cons_spdng_$(1m)
##  Min.   :       0                          
##  1st Qu.: 1311019                          
##  Median : 2719773                          
##  Mean   : 3201500                          
##  3rd Qu.: 4484145                          
##  Max.   :12607149                          
##  NA's   :1                                 
##  household_operations_2023_cons_spdng_$(1m) %_hh_grwth_2010-2023(1m)
##  Min.   :       0                           Min.   :-24.46          
##  1st Qu.: 1221935                           1st Qu.:  3.12          
##  Median : 2905937                           Median :  7.86          
##  Mean   : 3457632                           Mean   : 12.45          
##  3rd Qu.: 5158881                           3rd Qu.: 14.95          
##  Max.   :18560824                           Max.   :157.93          
##  NA's   :1                                  NA's   :1               
##  %_hh_grwth_2023-2028(1m) 2023_households(1m) 2023_med_hh_size(1m)
##  Min.   :-10.790          Min.   :    0       Min.   :0.000       
##  1st Qu.:  0.000          1st Qu.: 1152       1st Qu.:2.000       
##  Median :  2.580          Median : 2403       Median :2.000       
##  Mean   :  2.776          Mean   : 3035       Mean   :2.099       
##  3rd Qu.:  5.430          3rd Qu.: 4637       3rd Qu.:2.000       
##  Max.   : 17.240          Max.   :12942       Max.   :4.000       
##  NA's   :1                NA's   :1           NA's   :1           
##  2023_avg_hh_size(1m) %_hu_grwth_2010-2023(1m) 2023_avg_hu_size(1m)
##  Min.   :0.000        Min.   :-49.33           Min.   : 0.000      
##  1st Qu.:2.200        1st Qu.:  4.97           1st Qu.: 2.000      
##  Median :2.400        Median : 12.30           Median : 3.000      
##  Mean   :2.414        Mean   : 17.45           Mean   : 4.586      
##  3rd Qu.:2.600        3rd Qu.: 22.99           3rd Qu.: 7.000      
##  Max.   :4.100        Max.   :188.33           Max.   :19.000      
##  NA's   :1            NA's   :1                NA's   :1           
##  2023_avg_hu_value(1m) 2023_avg_yr_built(1m) 2023_group_quarters(1m)
##  Min.   :     0        Min.   :   0          Min.   :   0.0         
##  1st Qu.:161680        1st Qu.:1978          1st Qu.:   1.0         
##  Median :206830        Median :1983          Median :  21.0         
##  Mean   :227787        Mean   :1980          Mean   : 104.6         
##  3rd Qu.:267029        3rd Qu.:1990          3rd Qu.: 112.0         
##  Max.   :764965        Max.   :2008          Max.   :3287.0         
##  NA's   :1             NA's   :1             NA's   :1              
##  2023_home_blt_1940-1949(1m) 2023_home_blt_1950-1959(1m)
##  Min.   :   0.0              Min.   :   0.0             
##  1st Qu.:  21.0              1st Qu.:  31.0             
##  Median :  60.0              Median : 120.0             
##  Mean   : 169.4              Mean   : 345.6             
##  3rd Qu.: 189.0              3rd Qu.: 414.0             
##  Max.   :4433.0              Max.   :3297.0             
##  NA's   :1                   NA's   :1                  
##  2023_home_blt_1960-1969(1m) 2023_home_blt_1970-1979(1m)
##  Min.   :   0.0              Min.   :   0.0             
##  1st Qu.:  62.0              1st Qu.: 165.0             
##  Median : 234.0              Median : 452.0             
##  Mean   : 489.5              Mean   : 770.1             
##  3rd Qu.: 701.0              3rd Qu.:1122.0             
##  Max.   :4157.0              Max.   :5872.0             
##  NA's   :1                   NA's   :1                  
##  2023_home_blt_1980-1989(1m) 2023_home_blt_1990-1999(1m)
##  Min.   :   0.0              Min.   :   0.0             
##  1st Qu.: 228.0              1st Qu.: 150.0             
##  Median : 497.0              Median : 302.0             
##  Mean   : 688.5              Mean   : 391.3             
##  3rd Qu.: 972.0              3rd Qu.: 509.0             
##  Max.   :4214.0              Max.   :3347.0             
##  NA's   :1                   NA's   :1                  
##  2023_home_blt_2000-2010(1m) 2023_home_blt_2010+(1m) 2023_hu_1_unit(1m)
##  Min.   :   0                Min.   :   0.0          Min.   :   0      
##  1st Qu.: 104                1st Qu.:  67.0          1st Qu.: 660      
##  Median : 259                Median : 199.0          Median :1514      
##  Mean   : 391                Mean   : 335.5          Mean   :1732      
##  3rd Qu.: 543                3rd Qu.: 428.0          3rd Qu.:2550      
##  Max.   :3539                Max.   :2570.0          Max.   :8026      
##  NA's   :1                   NA's   :1               NA's   :1         
##  2023_hu_20+_units(1m) 2023_hu_2-4_units(1m) 2023_hu_5-19_units(1m)
##  Min.   :   0          Min.   :   0.0        Min.   :   0.0        
##  1st Qu.:   5          1st Qu.:  25.0        1st Qu.:  17.0        
##  Median :  75          Median : 147.0        Median : 165.0        
##  Mean   : 387          Mean   : 293.8        Mean   : 420.4        
##  3rd Qu.: 454          3rd Qu.: 411.0        3rd Qu.: 580.0        
##  Max.   :7773          Max.   :2743.0        Max.   :4326.0        
##  NA's   :1             NA's   :1             NA's   :1             
##  2023_med_yr_built(1m) 2023_owner_occ'd_housing(1m)
##  Min.   :   0          Min.   :   0                
##  1st Qu.:1976          1st Qu.: 786                
##  Median :1982          Median :1574                
##  Mean   :1979          Mean   :1860                
##  3rd Qu.:1989          3rd Qu.:2617                
##  Max.   :2010          Max.   :7443                
##  NA's   :1             NA's   :1                   
##  2023_renter_occ'd_housing(1m) 2023_home_value_$1,000,000+(1m)
##  Min.   :   0                  Min.   :  0.00                 
##  1st Qu.: 256                  1st Qu.:  0.00                 
##  Median : 740                  Median :  0.00                 
##  Mean   :1174                  Mean   : 24.79                 
##  3rd Qu.:1777                  3rd Qu.: 20.00                 
##  Max.   :8801                  Max.   :551.00                 
##  NA's   :1                     NA's   :1                      
##  2023_home_value_$100,000-200,000(1m) 2023_home_value_$200,000-300,000(1m)
##  Min.   :   0.0                       Min.   :   0                        
##  1st Qu.: 158.0                       1st Qu.: 122                        
##  Median : 372.0                       Median : 302                        
##  Mean   : 531.8                       Mean   : 410                        
##  3rd Qu.: 757.0                       3rd Qu.: 595                        
##  Max.   :3138.0                       Max.   :2179                        
##  NA's   :1                            NA's   :1                           
##  2023_home_value_$300,000-400,000(1m) 2023_home_value_$400,000-500,000(1m)
##  Min.   :   0.0                       Min.   :   0.00                     
##  1st Qu.:  35.0                       1st Qu.:   5.00                     
##  Median : 107.0                       Median :  33.00                     
##  Mean   : 198.2                       Mean   :  75.17                     
##  3rd Qu.: 280.0                       3rd Qu.:  95.00                     
##  Max.   :1649.0                       Max.   :1179.00                     
##  NA's   :1                            NA's   :1                           
##  2023_home_value_$500,000-1,000,000(1m) 2023_home_value_<100,000(1m)
##  Min.   :   0.0                         Min.   :   0.0              
##  1st Qu.:  10.0                         1st Qu.: 131.0              
##  Median :  47.0                         Median : 314.0              
##  Mean   : 100.5                         Mean   : 519.8              
##  3rd Qu.: 133.0                         3rd Qu.: 652.0              
##  Max.   :1829.0                         Max.   :4566.0              
##  NA's   :1                              NA's   :1                   
##  2023_median_home_value(1m) %_pop_grwth_2010-2023(1m) %_pop_grwth_2023-2028(1m)
##  Min.   :     0             Min.   :-48.46            Min.   :-8.960           
##  1st Qu.:126721             1st Qu.:  5.43            1st Qu.: 0.000           
##  Median :171594             Median : 12.58            Median : 2.530           
##  Mean   :188513             Mean   : 17.37            Mean   : 2.804           
##  3rd Qu.:232528             3rd Qu.: 22.73            3rd Qu.: 5.400           
##  Max.   :767973             Max.   :189.24            Max.   :17.130           
##  NA's   :1                  NA's   :1                 NA's   :1                
##  2023_population(1m) 2023_pop_wrk_trav_time_<30_min(1m)
##  Min.   :    0       Min.   :   0                      
##  1st Qu.: 2746       1st Qu.: 608                      
##  Median : 5854       Median :1548                      
##  Mean   : 7401       Mean   :2058                      
##  3rd Qu.:10881       3rd Qu.:3129                      
##  Max.   :32505       Max.   :8511                      
##  NA's   :1           NA's   :1                         
##  2023_pop_wrk_trav_time_30-60_min(1m) 2023_pop_wrk_trav_time_60+_min(1m)
##  Min.   :   0.0                       Min.   :   0.0                    
##  1st Qu.: 274.0                       1st Qu.:  64.0                    
##  Median : 662.0                       Median : 168.0                    
##  Mean   : 988.6                       Mean   : 245.9                    
##  3rd Qu.:1427.0                       3rd Qu.: 317.0                    
##  Max.   :7965.0                       Max.   :2940.0                    
##  NA's   :1                            NA's   :1                         
##  county_name            city          
##  Length:1138        Length:1138       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
# Show the first rows of df (head() returns six rows by default).
head(df)
##   number_of_units year_built vacancy_% land_area_(sf) last_sale_date
## 1              25       1984      7.04         108900     2022-04-20
## 2               8       1950     10.26         277477     2022-05-13
## 3              18       1990      6.31          67965     2023-03-20
## 4              17       2007      4.61         174240     2019-08-02
## 5              13       1977      6.29         348480     2024-01-16
## 6             125       1936      6.04         337154     2021-07-15
##   last_sale_price house_maint_&_repair_2023_cons_spdng_$(1m)
## 1         3100000                                    2315630
## 2          800000                                     178754
## 3         1000000                                    7470769
## 4          600000                                     180919
## 5          750000                                      26145
## 6         9500000                                   11054731
##   household_operations_2023_cons_spdng_$(1m) %_hh_grwth_2010-2023(1m)
## 1                                    2875112                     1.17
## 2                                     197441                    46.67
## 3                                    8760263                     2.66
## 4                                     158527                     3.29
## 5                                      21370                     0.00
## 6                                   11175881                    13.91
##   %_hh_grwth_2023-2028(1m) 2023_households(1m) 2023_med_hh_size(1m)
## 1                     9.88                2571                    2
## 2                     7.91                 139                    3
## 3                    -1.30                8107                    2
## 4                    -0.64                 156                    2
## 5                     0.00                  21                    2
## 6                     0.40                8322                    2
##   2023_avg_hh_size(1m) %_hu_grwth_2010-2023(1m) 2023_avg_hu_size(1m)
## 1                  2.3                     8.03                    4
## 2                  3.0                    51.09                    2
## 3                  2.0                     5.03                    9
## 4                  2.1                    10.64                    1
## 5                  2.2                   -25.00                    1
## 6                  2.1                    15.95                    7
##   2023_avg_hu_value(1m) 2023_avg_yr_built(1m) 2023_group_quarters(1m)
## 1                215342                  1971                     491
## 2                225340                  2003                       0
## 3                240448                  1974                     130
## 4                157097                  1984                       0
## 5                394118                  1984                       0
## 6                222175                  1972                      62
##   2023_home_blt_1940-1949(1m) 2023_home_blt_1950-1959(1m)
## 1                         849                         381
## 2                           1                           2
## 3                         763                        1500
## 4                           7                           4
## 5                           4                           1
## 6                         583                        2772
##   2023_home_blt_1960-1969(1m) 2023_home_blt_1970-1979(1m)
## 1                         277                         247
## 2                           2                           8
## 3                        1854                        3048
## 4                           7                          83
## 5                           3                           5
## 6                        1745                        1907
##   2023_home_blt_1980-1989(1m) 2023_home_blt_1990-1999(1m)
## 1                         310                         284
## 2                           8                          12
## 3                        1360                         268
## 4                          52                          40
## 5                           6                           8
## 6                         786                         414
##   2023_home_blt_2000-2010(1m) 2023_home_blt_2010+(1m) 2023_hu_1_unit(1m)
## 1                         372                     145               1985
## 2                          72                      49                120
## 3                         917                     283               3791
## 4                          21                       8                 77
## 5                          10                       0                 16
## 6                          56                    1220               6281
##   2023_hu_20+_units(1m) 2023_hu_2-4_units(1m) 2023_hu_5-19_units(1m)
## 1                    88                   127                    432
## 2                     3                     0                      5
## 3                  1353                   926                   1522
## 4                     0                     0                      0
## 5                     0                     0                      0
## 6                  1658                   458                    546
##   2023_med_yr_built(1m) 2023_owner_occ'd_housing(1m)
## 1                  1966                         1343
## 2                  2006                          103
## 3                  1972                         4289
## 4                  1981                          124
## 5                  1988                           18
## 6                  1967                         6310
##   2023_renter_occ'd_housing(1m) 2023_home_value_$1,000,000+(1m)
## 1                          1229                               3
## 2                            36                               0
## 3                          3818                             106
## 4                            32                               0
## 5                             3                               4
## 6                          2012                              15
##   2023_home_value_$100,000-200,000(1m) 2023_home_value_$200,000-300,000(1m)
## 1                                  514                                  341
## 2                                   28                                   56
## 3                                  919                                  671
## 4                                   29                                    3
## 5                                    4                                    2
## 6                                 1958                                 1352
##   2023_home_value_$300,000-400,000(1m) 2023_home_value_$400,000-500,000(1m)
## 1                                  106                                  105
## 2                                   12                                    1
## 3                                  579                                   80
## 4                                    2                                    0
## 5                                    2                                    0
## 6                                  730                                  355
##   2023_home_value_$500,000-1,000,000(1m) 2023_home_value_<100,000(1m)
## 1                                     33                          242
## 2                                      0                            6
## 3                                    361                         1573
## 4                                     12                           78
## 5                                      0                            5
## 6                                    324                         1575
##   2023_median_home_value(1m) %_pop_grwth_2010-2023(1m)
## 1                     183657                      9.44
## 2                     231250                     51.61
## 3                     162187                      4.49
## 4                      79486                     10.56
## 5                     187499                    -26.09
## 6                     180668                     14.49
##   %_pop_grwth_2023-2028(1m) 2023_population(1m)
## 1                     10.34                6479
## 2                      8.04                 423
## 3                     -1.39               16666
## 4                      0.00                 335
## 5                     -1.96                  51
## 6                      0.17               17448
##   2023_pop_wrk_trav_time_<30_min(1m) 2023_pop_wrk_trav_time_30-60_min(1m)
## 1                               1882                                  476
## 2                                104                                   93
## 3                               5237                                 2031
## 4                                 79                                   49
## 5                                 11                                    2
## 6                               6442                                 1838
##   2023_pop_wrk_trav_time_60+_min(1m) county_name             city
## 1                                202   St. Johns  Saint Augustine
## 2                                 21       Duval     Jacksonville
## 3                                377    Pinellas            Largo
## 4                                  4  Okeechobee       Okeechobee
## 5                                  2        Levy      Otter Creek
## 6                                234    Pinellas Saint Petersburg
# Cleaning pass: drop every row that has an NA in any column, then keep only
# the first occurrence of each fully duplicated row.
df <- na.omit(df)
df <- df[!duplicated(df), , drop = FALSE]

# Force all column names to lower case.
colnames(df) <- tolower(colnames(df))


# Baseline linear model: total sale price as a function of land area,
# construction year and unit count.
model1 <- lm(
  last_sale_price ~ `land_area_(sf)` + year_built + number_of_units,
  data = df
)

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Inspect the cleaned data frame: dimensions, column types and a short
# preview of each column's values.
str(df)
## 'data.frame':    1036 obs. of  49 variables:
##  $ number_of_units                           : num  25 8 18 17 13 125 236 85 5 20 ...
##  $ year_built                                : num  1984 1950 1990 2007 1977 ...
##  $ vacancy_%                                 : num  7.04 10.26 6.31 4.61 6.29 ...
##  $ land_area_(sf)                            : num  108900 277477 67965 174240 348480 ...
##  $ last_sale_date                            : POSIXct, format: "2022-04-20" "2022-05-13" ...
##  $ last_sale_price                           : num  3100000 800000 1000000 600000 750000 ...
##  $ house_maint_&_repair_2023_cons_spdng_$(1m): num  2315630 178754 7470769 180919 26145 ...
##  $ household_operations_2023_cons_spdng_$(1m): num  2875112 197441 8760263 158527 21370 ...
##  $ %_hh_grwth_2010-2023(1m)                  : num  1.17 46.67 2.66 3.29 0 ...
##  $ %_hh_grwth_2023-2028(1m)                  : num  9.88 7.91 -1.3 -0.64 0 ...
##  $ 2023_households(1m)                       : num  2571 139 8107 156 21 ...
##  $ 2023_med_hh_size(1m)                      : num  2 3 2 2 2 2 2 2 2 3 ...
##  $ 2023_avg_hh_size(1m)                      : num  2.3 3 2 2.1 2.2 2.1 2.1 1.8 2.5 3.4 ...
##  $ %_hu_grwth_2010-2023(1m)                  : num  8.03 51.09 5.03 10.64 -25 ...
##  $ 2023_avg_hu_size(1m)                      : num  4 2 9 1 1 7 9 17 3 12 ...
##  $ 2023_avg_hu_value(1m)                     : num  215342 225340 240448 157097 394118 ...
##  $ 2023_avg_yr_built(1m)                     : num  1971 2003 1974 1984 1984 ...
##  $ 2023_group_quarters(1m)                   : num  491 0 130 0 0 62 209 21 75 520 ...
##  $ 2023_home_blt_1940-1949(1m)               : num  849 1 763 7 4 ...
##  $ 2023_home_blt_1950-1959(1m)               : num  381 2 1500 4 1 ...
##  $ 2023_home_blt_1960-1969(1m)               : num  277 2 1854 7 3 ...
##  $ 2023_home_blt_1970-1979(1m)               : num  247 8 3048 83 5 ...
##  $ 2023_home_blt_1980-1989(1m)               : num  310 8 1360 52 6 ...
##  $ 2023_home_blt_1990-1999(1m)               : num  284 12 268 40 8 ...
##  $ 2023_home_blt_2000-2010(1m)               : num  372 72 917 21 10 56 135 70 918 213 ...
##  $ 2023_home_blt_2010+(1m)                   : num  145 49 283 8 0 ...
##  $ 2023_hu_1_unit(1m)                        : num  1985 120 3791 77 16 ...
##  $ 2023_hu_20+_units(1m)                     : num  88 3 1353 0 0 ...
##  $ 2023_hu_2-4_units(1m)                     : num  127 0 926 0 0 458 435 252 33 953 ...
##  $ 2023_hu_5-19_units(1m)                    : num  432 5 1522 0 0 ...
##  $ 2023_med_yr_built(1m)                     : num  1966 2006 1972 1981 1988 ...
##  $ 2023_owner_occ'd_housing(1m)              : num  1343 103 4289 124 18 ...
##  $ 2023_renter_occ'd_housing(1m)             : num  1229 36 3818 32 3 ...
##  $ 2023_home_value_$1,000,000+(1m)           : num  3 0 106 0 4 15 117 28 8 2 ...
##  $ 2023_home_value_$100,000-200,000(1m)      : num  514 28 919 29 4 ...
##  $ 2023_home_value_$200,000-300,000(1m)      : num  341 56 671 3 2 ...
##  $ 2023_home_value_$300,000-400,000(1m)      : num  106 12 579 2 2 ...
##  $ 2023_home_value_$400,000-500,000(1m)      : num  105 1 80 0 0 355 159 296 21 463 ...
##  $ 2023_home_value_$500,000-1,000,000(1m)    : num  33 0 361 12 0 324 624 790 99 240 ...
##  $ 2023_home_value_<100,000(1m)              : num  242 6 1573 78 5 ...
##  $ 2023_median_home_value(1m)                : num  183657 231250 162187 79486 187499 ...
##  $ %_pop_grwth_2010-2023(1m)                 : num  9.44 51.61 4.49 10.56 -26.09 ...
##  $ %_pop_grwth_2023-2028(1m)                 : num  10.34 8.04 -1.39 0 -1.96 ...
##  $ 2023_population(1m)                       : num  6479 423 16666 335 51 ...
##  $ 2023_pop_wrk_trav_time_<30_min(1m)        : num  1882 104 5237 79 11 ...
##  $ 2023_pop_wrk_trav_time_30-60_min(1m)      : num  476 93 2031 49 2 ...
##  $ 2023_pop_wrk_trav_time_60+_min(1m)        : num  202 21 377 4 2 ...
##  $ county_name                               : chr  "St. Johns" "Duval" "Pinellas" "Okeechobee" ...
##  $ city                                      : chr  "Saint Augustine" "Jacksonville" "Largo" "Okeechobee" ...
##  - attr(*, "na.action")= 'omit' Named int [1:102] 8 36 44 48 79 109 114 120 129 130 ...
##   ..- attr(*, "names")= chr [1:102] "8" "36" "44" "48" ...
# Variance inflation factor for each predictor in model1 (car::vif), used as
# a multicollinearity check.
vif(model1)
## `land_area_(sf)`       year_built  number_of_units 
##         1.778941         1.038248         1.754210
library(ggplot2)

# Document cleaning process
# Write comments explaining each step taken


# The plots below use the cleaned data frame `df` prepared above.
# Original import, kept for reference (disabled because `df` is already loaded):
#df <- read_excel("~/Raw Data NEW 02162024.xlsx")
# Scatter of construction year (y) against total sale price (x) with a
# straight-line least-squares fit and no confidence band.
#
# The visible window is set with coord_cartesian() instead of xlim()/ylim().
# xlim()/ylim() set *scale* limits, which discard every value outside the
# range before drawing; that removed the part of the fitted line that ran
# past the year_built range (the "Removed 3 rows ... (`geom_smooth()`)"
# warning). coord_cartesian() only zooms, so nothing is dropped.
ggplot(df, aes(x = last_sale_price, y = year_built)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Last Sale Price", y = "Year Built") +
  ggtitle("Relationship between Last Sale Price and Year Built") +
  coord_cartesian(
    xlim = c(0, max(df$last_sale_price)),
    ylim = range(df$year_built)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Per-unit view: sale price divided by unit count (x) against construction
# year (y), with a straight-line least-squares fit and no confidence band.
ggplot(
  data = df,
  mapping = aes(x = last_sale_price / number_of_units, y = year_built)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Axis limits: x from 0 to the largest per-unit price, y over the full
  # span of construction years.
  xlim(c(0, max(df$last_sale_price / df$number_of_units))) +
  ylim(range(df$year_built)) +
  labs(
    title = "Relationship between Sale Price per Unit and Year Built",
    x = "Sale Price per Unit",
    y = "Year Built"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Restrict to rows whose sale price per unit is at most 100,000.
df_filtered <- filter(df, last_sale_price / number_of_units <= 100000)

# Same per-unit price vs construction year plot, on the restricted rows.
ggplot(
  data = df_filtered,
  mapping = aes(x = last_sale_price / number_of_units, y = year_built)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Axis limits: x matches the filter threshold, y spans the filtered years.
  xlim(c(0, 100000)) +
  ylim(range(df_filtered$year_built)) +
  labs(
    title = "Relationship between Sale Price per Unit and Year Built",
    x = "Sale Price per Unit",
    y = "Year Built"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Tighter cut: keep rows whose sale price per unit is at most 40,000.
# This replaces the previous contents of df_filtered.
df_filtered <- filter(df, last_sale_price / number_of_units <= 40000)

# Unit count (x) against sale price per unit (y) with a linear trend line.
ggplot(
  data = df_filtered,
  mapping = aes(x = number_of_units, y = last_sale_price / number_of_units)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between Number of Units and Sale Price per Unit",
    x = "Number of Units",
    y = "Sale Price per Unit"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Land area (x) against sale price per unit (y) on the full cleaned data
# frame (not df_filtered), with a linear trend line.
ggplot(
  data = df,
  mapping = aes(x = `land_area_(sf)`, y = last_sale_price / number_of_units)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between Land Area and Sale Price per Unit",
    x = "Land Area",
    y = "Sale Price per Unit"
  )
## `geom_smooth()` using formula = 'y ~ x'

#library(glmnet)

# Prepare the data
#X <- as.matrix(df[, -which(names(df) == "last_sale_price")])  # Independent variables
#y <- df$last_sale_price / df$number_of_units  # Dependent variable

# Fit Lasso regression model
#lasso_model <- cv.glmnet(X, y, alpha = 1, nfolds = 10)

# Get the selected variables
#lasso_selected_variables <- coef(lasso_model, s = "lambda.min")[-1, ]
#lasso_selected_variables <- names(lasso_selected_variables[lasso_selected_variables != 0])

# Print the selected variables
#print(lasso_selected_variables)




# Convert the dataframe to a matrix and ensure all elements are numeric
#X <- as.matrix(df[, -which(names(df) == "last_sale_price")], 
               #rownames.force = NA)
#X <- apply(X, 2, as.numeric)

# Scale the independent variables
#scaled_X <- scale(X)

# Impute missing values with the mean
#scaled_X[is.na(scaled_X)] <- colMeans(scaled_X, na.rm = TRUE)

#library(missForest)

#imputed_data <- missForest(scaled_X)

# Fit LASSO regression model on scaled data
#lasso_model <- cv.glmnet(imputed_data$ximp, y, alpha = 1, nfolds = 10)

# Get the selected variables
#lasso_selected_variables <- coef(lasso_model, s = "lambda.min")[-1, ]
#lasso_selected_variables <- names(lasso_selected_variables[lasso_selected_variables != 0])

# Print the selected variables
#print(lasso_selected_variables)


# If the Lasso regression model returns an empty set of selected variables (`character(0)`), none of the predictors received a non-zero coefficient at the chosen lambda. This can happen when the regularization penalty is too strong or when none of the variables has meaningful predictive power for the target variable.

# In that case, the variables have to be selected by intuition or by some other method.


# Convert non-numeric columns to numeric
#numeric_df <- as.data.frame(sapply(df, function(x) as.numeric(as.character(x))))

# Calculate correlations
#correlations <- cor(numeric_df)

# Sort correlations with the target variable ('last sale price per unit')
#target_correlations <- correlations[, "last_sale_price_per_unit"]

# Print the top 10 positive and negative correlations with the target variable
#print("Top 10 Positive Correlations:")
#print(head(sort(target_correlations[target_correlations > 0], decreasing = TRUE), 10))

#print("Top 10 Negative Correlations:")
#print(head(sort(target_correlations[target_correlations < 0], decreasing = FALSE), 10))
# Convert non-numeric columns to numeric
#numeric_df <- as.data.frame(sapply(df, function(x) as.numeric(as.character(x))))


# Load required libraries
#library(ggplot2)

# Provide context
#cat("This analysis aims to investigate the relationship between various factors and the 'last sale price per unit' in a real estate dataset.\n")

# Step 1: Data preprocessing
# Load and preprocess the dataset

# Step 2: Calculate correlations
# Convert non-numeric columns to numeric
#numeric_df <- as.data.frame(sapply(df, function(x) as.numeric(as.character(x))))

# Exclude 'last sale price' variable
#numeric_df_without_last_sale_price <- numeric_df[, !names(numeric_df) %in% "last_sale_price"]

# Calculate 'last sale price per unit'
#numeric_df_without_last_sale_price$last_sale_price_per_unit <- #numeric_df$last_sale_price / numeric_df$number_of_units

# Calculate correlations
#correlations <- cor(numeric_df_without_last_sale_price)

# Sort correlations with the target variable ('last sale price per unit')
#target_correlations <- correlations[, "last_sale_price_per_unit"]
#library(ggplot2)

# Step 3: Visualize results
# Plot top positive and negative correlations
# Top positive correlations
#positive_correlations <- sort(target_correlations[target_correlations > 0], decreasing = TRUE)
#top_positive_vars <- names(positive_correlations)[1:3]  # Select top 3 positive correlated variables
#positive_plots <- lapply(1:length(top_positive_vars), function(i) {
 # ggplot(numeric_df_without_last_sale_price, aes_string(x = top_positive_vars[i], y = "last_sale_price_per_unit")) +
  #  geom_point() +
   # geom_smooth(method = "lm", se = FALSE, color = "blue") +
    #scale_x_continuous(limits = quantile(numeric_df_without_last_sale_price[[top_positive_vars[i]]], c(0.1, 0.9))) +
 #   scale_y_continuous(limits = quantile(numeric_df_without_last_sale_price$last_sale_price_per_unit, c(0.1, 0.9))) +
  #  labs(x = top_positive_vars[i], y = "Last Sale Price Per Unit") +
   # ggtitle(paste(i, ". Relationship between", top_positive_vars[i], "and Last Sale Price Per Unit"))


# Top negative correlations
#negative_correlations <- sort(target_correlations[target_correlations < 0], decreasing = FALSE)
#top_negative_vars <- names(negative_correlations)[1:3]  # Select top 3 negative correlated variables
#negative_plots <- lapply(1:length(top_negative_vars), function(i) {
 # ggplot(numeric_df_without_last_sale_price, aes_string(x = top_negative_vars[i], y = "last_sale_price_per_unit")) +
  #  geom_point() +
   # geom_smooth(method = "lm", se = FALSE, color = "blue") +
 #   scale_x_continuous(limits = #quantile(numeric_df_without_last_sale_price[[top_negative_vars[i]]], c(0.1, 0.9))) +
 #   scale_y_continuous(limits = quantile(numeric_df_without_last_sale_price$last_sale_price_per_unit, c(0.1, 0.9))) +
  #  labs(x = top_negative_vars[i], y = "Last Sale Price Per Unit") +
   # ggtitle(paste(i, ". Relationship between", top_negative_vars[i], "and Last Sale Price Per Unit"))

# Print plots
#print("Top Positive Correlations:")
#print(positive_plots)

#print("Top Negative Correlations:")
#print(negative_plots)

# Step 4: Interpretation
# Summarize key findings
#cat("\nKey Findings:\n")
#cat("The analysis revealed several factors positively correlated with 'last sale price per unit', including vacancy rates and home values in certain price ranges.\n")
#cat("On the other hand, factors such as average household size and number of units showed negative correlations with 'last sale price per unit'.\n")

# Discuss implications
#cat("\nImplications:\n")
#cat("The findings suggest that vacancy rates and home values play a significant role in determining sale prices per unit in the real estate market.\n")
#cat("Understanding these relationships can help stakeholders make informed decisions when buying or selling properties.\n")
library(ggplot2)


# Install glmnet only when it is missing, so that re-running the document does
# not re-download the package (or fail when offline) on every run.
if (!requireNamespace("glmnet", quietly = TRUE)) {
  install.packages("glmnet")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Example data (replace with your own data)
# X should be your predictor variables (a numeric matrix; glmnet does not
# accept a data frame)
# y should be your response variable (numeric vector)
#
# NOTE: the two columns created below hold simulated standard-normal noise.
# They are NOT the real land-area / sale-price fields, which are named
# `land_area_(sf)` and `last_sale_price` in df. The fits that follow only
# demonstrate the glmnet API on random data.
set.seed(123)
# Size the simulated data from df rather than hard-coding the row count, so
# the column assignments keep working if the cleaning steps change nrow(df).
n_obs <- nrow(df)
df$`Land Area (SF)` <- matrix(rnorm(n_obs * 20), n_obs, 20)
df$`Last Sale Price` <- rnorm(n_obs)

# Lasso regression (alpha = 1)
lasso_model <- glmnet(df$`Land Area (SF)`, df$`Last Sale Price`, alpha = 1)

# Ridge regression (alpha = 0)
ridge_model <- glmnet(df$`Land Area (SF)`, df$`Last Sale Price`, alpha = 0)

# Print the lasso fit's path summary: one row per lambda value with the
# Df, %Dev and Lambda columns (this is a table, not the coefficients).
print(lasso_model)
## 
## Call:  glmnet(x = df$`Land Area (SF)`, y = df$`Last Sale Price`, alpha = 1) 
## 
##    Df %Dev   Lambda
## 1   0 0.00 0.086110
## 2   1 0.13 0.078460
## 3   1 0.24 0.071490
## 4   1 0.33 0.065140
## 5   1 0.41 0.059360
## 6   3 0.53 0.054080
## 7   3 0.68 0.049280
## 8   3 0.80 0.044900
## 9   4 0.93 0.040910
## 10  5 1.07 0.037280
## 11  6 1.19 0.033970
## 12  6 1.32 0.030950
## 13  6 1.43 0.028200
## 14  6 1.51 0.025690
## 15  8 1.59 0.023410
## 16  9 1.67 0.021330
## 17  9 1.75 0.019440
## 18  9 1.80 0.017710
## 19  9 1.85 0.016140
## 20  9 1.89 0.014700
## 21 10 1.93 0.013400
## 22 10 1.96 0.012210
## 23 11 1.99 0.011120
## 24 11 2.02 0.010130
## 25 11 2.04 0.009234
## 26 11 2.05 0.008413
## 27 14 2.07 0.007666
## 28 14 2.09 0.006985
## 29 16 2.10 0.006364
## 30 16 2.11 0.005799
## 31 16 2.12 0.005284
## 32 17 2.13 0.004814
## 33 17 2.14 0.004387
## 34 17 2.14 0.003997
## 35 18 2.15 0.003642
## 36 19 2.15 0.003318
## 37 20 2.16 0.003024
## 38 20 2.16 0.002755
## 39 20 2.16 0.002510
## 40 20 2.17 0.002287
## 41 20 2.17 0.002084
## 42 20 2.17 0.001899
## 43 20 2.17 0.001730
## 44 20 2.17 0.001577
## 45 20 2.17 0.001436
## 46 20 2.17 0.001309
## 47 20 2.17 0.001193
## 48 20 2.18 0.001087
## 49 20 2.18 0.000990
## 50 20 2.18 0.000902
## 51 20 2.18 0.000822
## 52 20 2.18 0.000749
## 53 20 2.18 0.000682
## 54 20 2.18 0.000622
## 55 20 2.18 0.000567
## 56 20 2.18 0.000516
## 57 20 2.18 0.000470
## 58 20 2.18 0.000429
## 59 20 2.18 0.000390
## 60 20 2.18 0.000356
## 61 20 2.18 0.000324
## 62 20 2.18 0.000295
## 63 20 2.18 0.000269
## 64 20 2.18 0.000245
## 65 20 2.18 0.000224
## 66 20 2.18 0.000204
# Print the same per-lambda summary table (Df, %Dev, Lambda) for the ridge fit.
print(ridge_model)
## 
## Call:  glmnet(x = df$`Land Area (SF)`, y = df$`Last Sale Price`, alpha = 0) 
## 
##     Df %Dev Lambda
## 1   20 0.00 86.110
## 2   20 0.05 78.460
## 3   20 0.06 71.490
## 4   20 0.06 65.140
## 5   20 0.07 59.360
## 6   20 0.08 54.080
## 7   20 0.08 49.280
## 8   20 0.09 44.900
## 9   20 0.10 40.910
## 10  20 0.11 37.280
## 11  20 0.12 33.970
## 12  20 0.13 30.950
## 13  20 0.14 28.200
## 14  20 0.16 25.690
## 15  20 0.17 23.410
## 16  20 0.19 21.330
## 17  20 0.20 19.440
## 18  20 0.22 17.710
## 19  20 0.24 16.140
## 20  20 0.26 14.700
## 21  20 0.29 13.400
## 22  20 0.31 12.210
## 23  20 0.34 11.120
## 24  20 0.37 10.130
## 25  20 0.40  9.234
## 26  20 0.43  8.413
## 27  20 0.47  7.666
## 28  20 0.50  6.985
## 29  20 0.54  6.364
## 30  20 0.58  5.799
## 31  20 0.63  5.284
## 32  20 0.67  4.814
## 33  20 0.72  4.387
## 34  20 0.77  3.997
## 35  20 0.82  3.642
## 36  20 0.88  3.318
## 37  20 0.93  3.024
## 38  20 0.99  2.755
## 39  20 1.04  2.510
## 40  20 1.10  2.287
## 41  20 1.16  2.084
## 42  20 1.22  1.899
## 43  20 1.28  1.730
## 44  20 1.34  1.577
## 45  20 1.40  1.436
## 46  20 1.46  1.309
## 47  20 1.51  1.193
## 48  20 1.56  1.087
## 49  20 1.62  0.990
## 50  20 1.67  0.902
## 51  20 1.71  0.822
## 52  20 1.76  0.749
## 53  20 1.80  0.682
## 54  20 1.84  0.622
## 55  20 1.88  0.567
## 56  20 1.91  0.516
## 57  20 1.94  0.470
## 58  20 1.97  0.429
## 59  20 1.99  0.390
## 60  20 2.02  0.356
## 61  20 2.04  0.324
## 62  20 2.05  0.295
## 63  20 2.07  0.269
## 64  20 2.09  0.245
## 65  20 2.10  0.224
## 66  20 2.11  0.204
## 67  20 2.12  0.186
## 68  20 2.13  0.169
## 69  20 2.14  0.154
## 70  20 2.14  0.140
## 71  20 2.15  0.128
## 72  20 2.15  0.116
## 73  20 2.16  0.106
## 74  20 2.16  0.097
## 75  20 2.16  0.088
## 76  20 2.16  0.080
## 77  20 2.17  0.073
## 78  20 2.17  0.067
## 79  20 2.17  0.061
## 80  20 2.17  0.055
## 81  20 2.17  0.050
## 82  20 2.17  0.046
## 83  20 2.17  0.042
## 84  20 2.17  0.038
## 85  20 2.18  0.035
## 86  20 2.18  0.032
## 87  20 2.18  0.029
## 88  20 2.18  0.026
## 89  20 2.18  0.024
## 90  20 2.18  0.022
## 91  20 2.18  0.020
## 92  20 2.18  0.018
## 93  20 2.18  0.017
## 94  20 2.18  0.015
## 95  20 2.18  0.014
## 96  20 2.18  0.012
## 97  20 2.18  0.011
## 98  20 2.18  0.010
## 99  20 2.18  0.009
## 100 20 2.18  0.009
# Plot the coefficient paths of the lasso fit
plot(lasso_model)

# Plot the coefficient paths of the ridge fit
plot(ridge_model)

```

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.