Lab 8_11.12.25

Author

Sean Lee

#install.packages("caret")
#install.packages("randomForest")

library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
Linking to GEOS 3.13.1, GDAL 3.11.0, PROJ 9.6.0; sf_use_s2() is TRUE
library(caret)
Warning: package 'caret' was built under R version 4.5.2
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
library(glmnet)
Warning: package 'glmnet' was built under R version 4.5.2
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loaded glmnet 4.1-10
library(randomForest)
Warning: package 'randomForest' was built under R version 4.5.2
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:dplyr':

    combine

The following object is masked from 'package:ggplot2':

    margin
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(forcats)
# Fix the random number generator state so the random steps below
# (e.g. the train/test partition) give the same result on every run.
set.seed(seed = 24)
# Read the raw Ames housing table; readr guesses the column types.
ames <- read_csv(file = "data/AmesHousing.csv")
Rows: 2930 Columns: 82
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (45): PID, MS SubClass, MS Zoning, Street, Alley, Lot Shape, Land Contou...
dbl (37): Order, Lot Frontage, Lot Area, Overall Qual, Overall Cond, Year Bu...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview of every column: name, type and first few values.
ames %>%
  glimpse()
Rows: 2,930
Columns: 82
$ Order             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
$ PID               <chr> "0526301100", "0526350040", "0526351010", "052635303…
$ `MS SubClass`     <chr> "020", "020", "020", "020", "060", "060", "120", "12…
$ `MS Zoning`       <chr> "RL", "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL"…
$ `Lot Frontage`    <dbl> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, NA, 63,…
$ `Lot Area`        <dbl> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005,…
$ Street            <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pav…
$ Alley             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Lot Shape`       <chr> "IR1", "Reg", "IR1", "Reg", "IR1", "IR1", "Reg", "IR…
$ `Land Contour`    <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "HL…
$ Utilities         <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "A…
$ `Lot Config`      <chr> "Corner", "Inside", "Corner", "Corner", "Inside", "I…
$ `Land Slope`      <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gt…
$ Neighborhood      <chr> "NAmes", "NAmes", "NAmes", "NAmes", "Gilbert", "Gilb…
$ `Condition 1`     <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "No…
$ `Condition 2`     <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Nor…
$ `Bldg Type`       <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "Twn…
$ `House Style`     <chr> "1Story", "1Story", "1Story", "1Story", "2Story", "2…
$ `Overall Qual`    <dbl> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9…
$ `Overall Cond`    <dbl> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2…
$ `Year Built`      <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995…
$ `Year Remod/Add`  <dbl> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996…
$ `Roof Style`      <chr> "Hip", "Gable", "Hip", "Hip", "Gable", "Gable", "Gab…
$ `Roof Matl`       <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg…
$ `Exterior 1st`    <chr> "BrkFace", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd…
$ `Exterior 2nd`    <chr> "Plywood", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd…
$ `Mas Vnr Type`    <chr> "Stone", "None", "BrkFace", "None", "None", "BrkFace…
$ `Mas Vnr Area`    <dbl> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60…
$ `Exter Qual`      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "Gd", "Gd", "Gd"…
$ `Exter Cond`      <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA"…
$ Foundation        <chr> "CBlock", "CBlock", "CBlock", "CBlock", "PConc", "PC…
$ `Bsmt Qual`       <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd"…
$ `Bsmt Cond`       <chr> "Gd", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA"…
$ `Bsmt Exposure`   <chr> "Gd", "No", "No", "No", "No", "No", "Mn", "No", "No"…
$ `BsmtFin Type 1`  <chr> "BLQ", "Rec", "ALQ", "ALQ", "GLQ", "GLQ", "GLQ", "AL…
$ `BsmtFin SF 1`    <dbl> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0,…
$ `BsmtFin Type 2`  <chr> "Unf", "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Un…
$ `BsmtFin SF 2`    <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0,…
$ `Bsmt Unf SF`     <dbl> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, …
$ `Total Bsmt SF`   <dbl> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 9…
$ Heating           <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "Gas…
$ `Heating QC`      <chr> "Fa", "TA", "TA", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex"…
$ `Central Air`     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
$ Electrical        <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr"…
$ `1st Flr SF`      <dbl> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 1…
$ `2nd Flr SF`      <dbl> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, …
$ `Low Qual Fin SF` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Gr Liv Area`     <dbl> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616,…
$ `Bsmt Full Bath`  <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1…
$ `Bsmt Half Bath`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Full Bath`       <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1…
$ `Half Bath`       <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1…
$ `Bedroom AbvGr`   <dbl> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1…
$ `Kitchen AbvGr`   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ `Kitchen Qual`    <chr> "TA", "TA", "Gd", "Ex", "TA", "Gd", "Gd", "Gd", "Gd"…
$ `TotRms AbvGrd`   <dbl> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, …
$ Functional        <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Ty…
$ Fireplaces        <dbl> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1…
$ `Fireplace Qu`    <chr> "Gd", NA, NA, "TA", "TA", "Gd", NA, NA, "TA", "TA", …
$ `Garage Type`     <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd", "A…
$ `Garage Yr Blt`   <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995…
$ `Garage Finish`   <chr> "Fin", "Unf", "Unf", "Fin", "Fin", "Fin", "Fin", "RF…
$ `Garage Cars`     <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3…
$ `Garage Area`     <dbl> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 44…
$ `Garage Qual`     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA"…
$ `Garage Cond`     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA"…
$ `Paved Drive`     <chr> "P", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
$ `Wood Deck SF`    <dbl> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483…
$ `Open Porch SF`   <dbl> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0,…
$ `Enclosed Porch`  <dbl> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `3Ssn Porch`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Screen Porch`    <dbl> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 2…
$ `Pool Area`       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Pool QC`         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence             <chr> NA, "MnPrv", NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA…
$ `Misc Feature`    <chr> NA, NA, "Gar2", NA, NA, NA, NA, NA, NA, NA, NA, "She…
$ `Misc Val`        <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0…
$ `Mo Sold`         <dbl> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6…
$ `Yr Sold`         <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010…
$ `Sale Type`       <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD"…
$ `Sale Condition`  <chr> "Normal", "Normal", "Normal", "Normal", "Normal", "N…
$ SalePrice         <dbl> 215000, 105000, 172000, 244000, 189900, 195500, 2135…
# Missingness summary: one row per column of `ames`, holding the number and
# the percentage (rounded to 2 decimals) of NA values. Rows are sorted so
# that the columns with the largest share of missing values come first.
na_tbl <- tibble(
  variable = names(ames),
  n_missing = colSums(is.na(ames))
) %>%
  mutate(pct_missing = round(100 * n_missing / nrow(ames), 2)) %>%
  arrange(desc(pct_missing))

# Print the 15 columns with the highest percentage of missing values.
na_tbl %>%
  head(n = 15)
# A tibble: 15 × 3
   variable       n_missing pct_missing
   <chr>              <dbl>       <dbl>
 1 Pool QC             2917       99.6 
 2 Misc Feature        2824       96.4 
 3 Alley               2732       93.2 
 4 Fence               2358       80.5 
 5 Fireplace Qu        1422       48.5 
 6 Lot Frontage         490       16.7 
 7 Garage Yr Blt        159        5.43
 8 Garage Finish        159        5.43
 9 Garage Qual          159        5.43
10 Garage Cond          159        5.43
11 Garage Type          157        5.36
12 Bsmt Exposure         83        2.83
13 BsmtFin Type 2        81        2.76
14 Bsmt Qual             80        2.73
15 Bsmt Cond             80        2.73
# Names of the columns in which more than 30% of the values are missing.
high_na <- na_tbl$variable[na_tbl$pct_missing > 30]

# Modelling table, built in three steps:
#   1. remove the mostly-missing columns identified above,
#   2. keep only rows that are complete in every remaining column,
#   3. convert the column names to snake_case (janitor::clean_names).
# NOTE(review): drop_na() discards a row if ANY remaining column is NA, so
# columns with moderate missingness can remove many rows -- confirm this is
# intended rather than imputing.
modeldata <- ames %>%
  select(-all_of(high_na)) %>%
  drop_na() %>%
  clean_names()

# Inspect the cleaned table.
modeldata %>%
  glimpse()
Rows: 2,218
Columns: 77
$ order           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18,…
$ pid             <chr> "0526301100", "0526350040", "0526351010", "0526353030"…
$ ms_sub_class    <chr> "020", "020", "020", "020", "060", "060", "120", "120"…
$ ms_zoning       <chr> "RL", "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
$ lot_frontage    <dbl> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, 63, 85, 4…
$ lot_area        <dbl> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
$ street          <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave"…
$ lot_shape       <chr> "IR1", "Reg", "IR1", "Reg", "IR1", "IR1", "Reg", "IR1"…
$ land_contour    <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "HLS"…
$ utilities       <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "All…
$ lot_config      <chr> "Corner", "Inside", "Corner", "Corner", "Inside", "Ins…
$ land_slope      <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl"…
$ neighborhood    <chr> "NAmes", "NAmes", "NAmes", "NAmes", "Gilbert", "Gilber…
$ condition_1     <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm…
$ condition_2     <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm"…
$ bldg_type       <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "Twnhs…
$ house_style     <chr> "1Story", "1Story", "1Story", "1Story", "2Story", "2St…
$ overall_qual    <dbl> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 7, 8, 8, 9, 4, 6, …
$ overall_cond    <dbl> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 7, 2, 5, 6, …
$ year_built      <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
$ year_remod_add  <dbl> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
$ roof_style      <chr> "Hip", "Gable", "Hip", "Hip", "Gable", "Gable", "Gable…
$ roof_matl       <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg",…
$ exterior_1st    <chr> "BrkFace", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
$ exterior_2nd    <chr> "Plywood", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
$ mas_vnr_type    <chr> "Stone", "None", "BrkFace", "None", "None", "BrkFace",…
$ mas_vnr_area    <dbl> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 603, 0, 35…
$ exter_qual      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "Gd", "Gd", "Gd", …
$ exter_cond      <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ foundation      <chr> "CBlock", "CBlock", "CBlock", "CBlock", "PConc", "PCon…
$ bsmt_qual       <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", …
$ bsmt_cond       <chr> "Gd", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ bsmt_exposure   <chr> "Gd", "No", "No", "No", "No", "No", "Mn", "No", "No", …
$ bsmt_fin_type_1 <chr> "BLQ", "Rec", "ALQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ"…
$ bsmt_fin_sf_1   <dbl> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 0…
$ bsmt_fin_type_2 <chr> "Unf", "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf"…
$ bsmt_fin_sf_2   <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
$ bsmt_unf_sf     <dbl> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
$ total_bsmt_sf   <dbl> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
$ heating         <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA"…
$ heating_qc      <chr> "Fa", "TA", "TA", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", …
$ central_air     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
$ electrical      <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", …
$ x1st_flr_sf     <dbl> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
$ x2nd_flr_sf     <dbl> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 676, 0, 1589,…
$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gr_liv_area     <dbl> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
$ bsmt_full_bath  <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, …
$ bsmt_half_bath  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ full_bath       <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 1, 1, 2, …
$ half_bath       <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
$ bedroom_abv_gr  <dbl> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 4, 4, 1, 2, 3, …
$ kitchen_abv_gr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ kitchen_qual    <chr> "TA", "TA", "Gd", "Ex", "TA", "Gd", "Gd", "Gd", "Gd", …
$ tot_rms_abv_grd <dbl> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 7, 5, 12, 8, 8, 4, 7,…
$ functional      <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ"…
$ fireplaces      <dbl> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 2, …
$ garage_type     <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd", "Att…
$ garage_yr_blt   <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
$ garage_finish   <chr> "Fin", "Unf", "Unf", "Fin", "Fin", "Fin", "Fin", "RFn"…
$ garage_cars     <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, …
$ garage_area     <dbl> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
$ garage_qual     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ garage_cond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ paved_drive     <chr> "P", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
$ wood_deck_sf    <dbl> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 0, 19…
$ open_porch_sf   <dbl> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 75, 0, 36, 1…
$ enclosed_porch  <dbl> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ x3ssn_porch     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ screen_porch    <dbl> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 210, 0, 0, …
$ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ misc_val        <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mo_sold         <dbl> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 5, 2, 6, 6, 6, 6, 2, …
$ yr_sold         <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ sale_type       <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
$ sale_condition  <chr> "Normal", "Normal", "Normal", "Normal", "Normal", "Nor…
$ sale_price      <dbl> 215000, 105000, 172000, 244000, 189900, 195500, 213500…
# Histogram (40 bins) of the response variable, sale_price.
ggplot(data = modeldata, mapping = aes(x = sale_price)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Distribution of Sale Price",
    x = "Sale Price",
    y = "Count"
  )

# Response plus five numeric predictors, used for a scatterplot matrix.
num_small <- modeldata[, c(
  "sale_price",
  "gr_liv_area",
  "total_bsmt_sf",
  "garage_area",
  "x1st_flr_sf",
  "full_bath"
)]

# Pairwise scatterplots of the selected variables.
pairs(num_small, main = "Pairs plot: selected numeric variables")

# Horizontal bar chart of the median sale price per neighborhood.
# fct_lump_n() keeps the 12 most frequent neighborhoods and pools every other
# neighborhood into a single "Other" level, so that level gets its own bar.
modeldata %>%
  mutate(neighborhood_top = fct_lump_n(neighborhood, n = 12)) %>%
  summarise(median_price = median(sale_price), .by = neighborhood_top) %>%
  ggplot(
    aes(
      # Order the bars by their median price (one value per level).
      x = fct_reorder(neighborhood_top, median_price),
      y = median_price
    )
  ) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Median Sale Price by Neighborhood (top 12)",
    x = "Neighborhood",
    y = "Median Price"
  )

# Draw the row indices of a 70% training sample with caret, using sale_price
# as the outcome; `list = FALSE` returns the indices as a one-column matrix.
train_idx <- createDataPartition(
  y = modeldata[["sale_price"]],
  p = 0.7,
  list = FALSE
)

# Rows selected above form the training set; all remaining rows form the test set.
train_data <- modeldata %>%
  slice(as.vector(train_idx))
test_data <- modeldata %>%
  slice(-as.vector(train_idx))

# Data cleaning: remove record identifiers, then encode text columns as factors.
#
# `order` (a running row number) and `pid` (a character ID, apparently unique
# per record) carry no predictive information. If `pid` is left in, it turns
# into a factor with roughly one level per training row; when the test factors
# are later re-levelled against the training levels, every test `pid` that was
# not seen in training becomes NA. any_of() is used so this step is a no-op if
# the columns have already been removed.
id_cols <- c("order", "pid")

train_data <- train_data %>%
  select(-any_of(id_cols)) %>%
  mutate(across(where(is.character), as.factor))
test_data <- test_data %>%
  select(-any_of(id_cols)) %>%
  mutate(across(where(is.character), as.factor))

# Near-zero-variance screen, computed on the training rows only.
# saveMetrics = TRUE returns one row per column with a logical `nzv` flag.
nzv <- nearZeroVar(train_data, saveMetrics = TRUE)

# Columns that were not flagged. The response is listed first so that it is
# always retained, and unique() removes the duplicate if it was kept anyway.
keep <- unique(c("sale_price", rownames(nzv)[!nzv$nzv]))

# Apply the same column selection to both sets.
train_data <- train_data %>%
  select(all_of(keep))
test_data <- test_data %>%
  select(all_of(keep))

# Design matrix of the predictors (first column, the intercept, removed),
# passed to caret to find columns that are linear combinations of others.
X <- model.matrix(sale_price ~ ., data = train_data)[, -1, drop = FALSE]
lc <- findLinearCombos(X)

if (!is.null(lc$remove)) {
  # Translate the flagged column positions into names, keeping only names
  # that are also columns of the data frame. Dummy columns that model.matrix
  # generates from factors have level-suffixed names, so they never match and
  # are left in place; in effect only numeric predictors can be dropped here.
  cols_to_drop <- intersect(colnames(X)[lc$remove], names(train_data))

  # Remove the same columns from both sets.
  train_data <- train_data[, !(names(train_data) %in% cols_to_drop), drop = FALSE]
  test_data <- test_data[, !(names(test_data) %in% cols_to_drop), drop = FALSE]
}

# Give every factor column of the test set exactly the level set of the
# matching training column. Test values whose level does not occur in the
# training data become NA.
test_data <- test_data %>%
  mutate(
    across(
      all_of(names(Filter(is.factor, train_data))),
      ~ factor(.x, levels = levels(train_data[[cur_column()]]))
    )
  )

# Report the number of rows in the final training and test sets.
nrow(train_data)
nrow(test_data)
[1] 1554
[1] 664