#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))

Objective

Predict Housing Prices using 2 supervised ml algos:

  1. elastic net
  2. random forest

Get & Split Data

a = read_csv('train.csv') %>% 
  clean_names() %>% 
  mutate(across(where(is.character), factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is_character),
    where(is_numeric)
  )

test = read_csv('test.csv') %>% 
  clean_names() %>% 
  mutate(across(where(is.character), factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is_character),
    where(is_numeric)
  )

a %>% sample_n(10)

5 min EDA

viz missing

total.cols = a %>% colnames() %>% length() 

a %>% visdat::vis_dat()

check missing, na, inf

a %>% select(where(is.factor)) %>% funModeling::status() %>% arrange(-q_na)
a %>% select(where(is.numeric)) %>% funModeling::status() %>% arrange(-q_na)

Observations

  1. 4 factor vars have 80%+ missing values
    • since this is so many, consider removing entirely rather than imputing
  2. For other factor vars with minimal missing nas, consider either:
    • replacing missing values with dedicated ‘unknown’ level OR
    • replacing missing values by imputing
  3. Consider imputing num vars with nas
    • lot_frontage: rf or kmeans imputation with all other vars
    • garage_yr_blt: rf imputation with other date vars (e.g. year_XXX, yr_sold)
    • mas_vnr_area: rf or kmeans imputation with all other vars

Wrangling / Cleaning

a$pool_qc = NULL
a$misc_feature = NULL
a$alley = NULL
a$fence = NULL

EDA: nom vars

names

(nom.vars = a %>% select(where(is.factor)) %>% colnames %>% as.character)
##  [1] "alley"          "bldg_type"      "bsmt_cond"      "bsmt_exposure" 
##  [5] "bsmt_fin_type1" "bsmt_fin_type2" "bsmt_qual"      "central_air"   
##  [9] "condition1"     "condition2"     "electrical"     "exter_cond"    
## [13] "exter_qual"     "exterior1st"    "exterior2nd"    "fence"         
## [17] "fireplace_qu"   "foundation"     "functional"     "garage_cond"   
## [21] "garage_finish"  "garage_qual"    "garage_type"    "heating"       
## [25] "heating_qc"     "house_style"    "kitchen_qual"   "land_contour"  
## [29] "land_slope"     "lot_config"     "lot_shape"      "mas_vnr_type"  
## [33] "misc_feature"   "ms_zoning"      "neighborhood"   "paved_drive"   
## [37] "pool_qc"        "roof_matl"      "roof_style"     "sale_condition"
## [41] "sale_type"      "street"         "utilities"

check missing

a %>% select(nom.vars) %>% miss_var_summary
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nom.vars)` instead of `nom.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

sample

a %>% select(nom.vars) %>% sample_n(5)

skim

a %>% select(nom.vars) %>% skim_without_charts()
Data summary
Name Piped data
Number of rows 1460
Number of columns 43
_______________________
Column type frequency:
factor 43
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
bldg_type 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
bsmt_cond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
bsmt_exposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
bsmt_fin_type1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
bsmt_fin_type2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
bsmt_qual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
central_air 0 1.00 FALSE 2 Y: 1365, N: 95
condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
exter_cond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
exter_qual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
fireplace_qu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
garage_cond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
garage_finish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
garage_qual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
garage_type 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
heating_qc 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
house_style 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
kitchen_qual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
land_contour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
land_slope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
lot_config 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
lot_shape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
mas_vnr_type 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
misc_feature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
ms_zoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
paved_drive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
pool_qc 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
roof_matl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
roof_style 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
sale_condition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20
sale_type 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
utilities 0 1.00 FALSE 2 All: 1459, NoS: 1

viz: level counts distribution

a %>% select(nom.vars) %>% map(n_unique) %>% as.data.frame.list %>% pivot_longer(everything()) %>% mutate(name = fct_reorder(name, value)) %>% plot_ly(x = ~value, y = ~name, color = ~name) %>% hide_legend() %>% layout(title = 'factor vars level counts', xaxis = list(title = 'count'), yaxis = list(title = ''))
a %>% select(nom.vars) %>% funModeling::freq()

##   alley frequency percentage cumulative_perc
## 1  <NA>      1369      93.77           93.77
## 2  Grvl        50       3.42           97.19
## 3  Pave        41       2.81          100.00

##   bldg_type frequency percentage cumulative_perc
## 1      1Fam      1220      83.56           83.56
## 2    TwnhsE       114       7.81           91.37
## 3    Duplex        52       3.56           94.93
## 4     Twnhs        43       2.95           97.88
## 5    2fmCon        31       2.12          100.00

##   bsmt_cond frequency percentage cumulative_perc
## 1        TA      1311      89.79           89.79
## 2        Gd        65       4.45           94.24
## 3        Fa        45       3.08           97.32
## 4      <NA>        37       2.53           99.85
## 5        Po         2       0.14          100.00

##   bsmt_exposure frequency percentage cumulative_perc
## 1            No       953      65.27           65.27
## 2            Av       221      15.14           80.41
## 3            Gd       134       9.18           89.59
## 4            Mn       114       7.81           97.40
## 5          <NA>        38       2.60          100.00

##   bsmt_fin_type1 frequency percentage cumulative_perc
## 1            Unf       430      29.45           29.45
## 2            GLQ       418      28.63           58.08
## 3            ALQ       220      15.07           73.15
## 4            BLQ       148      10.14           83.29
## 5            Rec       133       9.11           92.40
## 6            LwQ        74       5.07           97.47
## 7           <NA>        37       2.53          100.00

##   bsmt_fin_type2 frequency percentage cumulative_perc
## 1            Unf      1256      86.03           86.03
## 2            Rec        54       3.70           89.73
## 3            LwQ        46       3.15           92.88
## 4           <NA>        38       2.60           95.48
## 5            BLQ        33       2.26           97.74
## 6            ALQ        19       1.30           99.04
## 7            GLQ        14       0.96          100.00

##   bsmt_qual frequency percentage cumulative_perc
## 1        TA       649      44.45           44.45
## 2        Gd       618      42.33           86.78
## 3        Ex       121       8.29           95.07
## 4      <NA>        37       2.53           97.60
## 5        Fa        35       2.40          100.00

##   central_air frequency percentage cumulative_perc
## 1           Y      1365      93.49           93.49
## 2           N        95       6.51          100.00

##   condition1 frequency percentage cumulative_perc
## 1       Norm      1260      86.30           86.30
## 2      Feedr        81       5.55           91.85
## 3     Artery        48       3.29           95.14
## 4       RRAn        26       1.78           96.92
## 5       PosN        19       1.30           98.22
## 6       RRAe        11       0.75           98.97
## 7       PosA         8       0.55           99.52
## 8       RRNn         5       0.34           99.86
## 9       RRNe         2       0.14          100.00

##   condition2 frequency percentage cumulative_perc
## 1       Norm      1445      98.97           98.97
## 2      Feedr         6       0.41           99.38
## 3     Artery         2       0.14           99.52
## 4       PosN         2       0.14           99.66
## 5       RRNn         2       0.14           99.80
## 6       PosA         1       0.07           99.87
## 7       RRAe         1       0.07           99.94
## 8       RRAn         1       0.07          100.00

##   electrical frequency percentage cumulative_perc
## 1      SBrkr      1334      91.37           91.37
## 2      FuseA        94       6.44           97.81
## 3      FuseF        27       1.85           99.66
## 4      FuseP         3       0.21           99.87
## 5        Mix         1       0.07           99.94
## 6       <NA>         1       0.07          100.00

##   exter_cond frequency percentage cumulative_perc
## 1         TA      1282      87.81           87.81
## 2         Gd       146      10.00           97.81
## 3         Fa        28       1.92           99.73
## 4         Ex         3       0.21           99.94
## 5         Po         1       0.07          100.00

##   exter_qual frequency percentage cumulative_perc
## 1         TA       906      62.05           62.05
## 2         Gd       488      33.42           95.47
## 3         Ex        52       3.56           99.03
## 4         Fa        14       0.96          100.00

##    exterior1st frequency percentage cumulative_perc
## 1      VinylSd       515      35.27           35.27
## 2      HdBoard       222      15.21           50.48
## 3      MetalSd       220      15.07           65.55
## 4      Wd Sdng       206      14.11           79.66
## 5      Plywood       108       7.40           87.06
## 6      CemntBd        61       4.18           91.24
## 7      BrkFace        50       3.42           94.66
## 8      WdShing        26       1.78           96.44
## 9       Stucco        25       1.71           98.15
## 10     AsbShng        20       1.37           99.52
## 11     BrkComm         2       0.14           99.66
## 12       Stone         2       0.14           99.80
## 13     AsphShn         1       0.07           99.87
## 14      CBlock         1       0.07           99.94
## 15     ImStucc         1       0.07          100.00

##    exterior2nd frequency percentage cumulative_perc
## 1      VinylSd       504      34.52           34.52
## 2      MetalSd       214      14.66           49.18
## 3      HdBoard       207      14.18           63.36
## 4      Wd Sdng       197      13.49           76.85
## 5      Plywood       142       9.73           86.58
## 6      CmentBd        60       4.11           90.69
## 7      Wd Shng        38       2.60           93.29
## 8       Stucco        26       1.78           95.07
## 9      BrkFace        25       1.71           96.78
## 10     AsbShng        20       1.37           98.15
## 11     ImStucc        10       0.68           98.83
## 12     Brk Cmn         7       0.48           99.31
## 13       Stone         5       0.34           99.65
## 14     AsphShn         3       0.21           99.86
## 15      CBlock         1       0.07           99.93
## 16       Other         1       0.07          100.00

##   fence frequency percentage cumulative_perc
## 1  <NA>      1179      80.75           80.75
## 2 MnPrv       157      10.75           91.50
## 3 GdPrv        59       4.04           95.54
## 4  GdWo        54       3.70           99.24
## 5  MnWw        11       0.75          100.00

##   fireplace_qu frequency percentage cumulative_perc
## 1         <NA>       690      47.26           47.26
## 2           Gd       380      26.03           73.29
## 3           TA       313      21.44           94.73
## 4           Fa        33       2.26           96.99
## 5           Ex        24       1.64           98.63
## 6           Po        20       1.37          100.00

##   foundation frequency percentage cumulative_perc
## 1      PConc       647      44.32           44.32
## 2     CBlock       634      43.42           87.74
## 3     BrkTil       146      10.00           97.74
## 4       Slab        24       1.64           99.38
## 5      Stone         6       0.41           99.79
## 6       Wood         3       0.21          100.00

##   functional frequency percentage cumulative_perc
## 1        Typ      1360      93.15           93.15
## 2       Min2        34       2.33           95.48
## 3       Min1        31       2.12           97.60
## 4        Mod        15       1.03           98.63
## 5       Maj1        14       0.96           99.59
## 6       Maj2         5       0.34           99.93
## 7        Sev         1       0.07          100.00

##   garage_cond frequency percentage cumulative_perc
## 1          TA      1326      90.82           90.82
## 2        <NA>        81       5.55           96.37
## 3          Fa        35       2.40           98.77
## 4          Gd         9       0.62           99.39
## 5          Po         7       0.48           99.87
## 6          Ex         2       0.14          100.00

##   garage_finish frequency percentage cumulative_perc
## 1           Unf       605      41.44           41.44
## 2           RFn       422      28.90           70.34
## 3           Fin       352      24.11           94.45
## 4          <NA>        81       5.55          100.00

##   garage_qual frequency percentage cumulative_perc
## 1          TA      1311      89.79           89.79
## 2        <NA>        81       5.55           95.34
## 3          Fa        48       3.29           98.63
## 4          Gd        14       0.96           99.59
## 5          Ex         3       0.21           99.80
## 6          Po         3       0.21          100.00

##   garage_type frequency percentage cumulative_perc
## 1      Attchd       870      59.59           59.59
## 2      Detchd       387      26.51           86.10
## 3     BuiltIn        88       6.03           92.13
## 4        <NA>        81       5.55           97.68
## 5     Basment        19       1.30           98.98
## 6     CarPort         9       0.62           99.60
## 7      2Types         6       0.41          100.00

##   heating frequency percentage cumulative_perc
## 1    GasA      1428      97.81           97.81
## 2    GasW        18       1.23           99.04
## 3    Grav         7       0.48           99.52
## 4    Wall         4       0.27           99.79
## 5    OthW         2       0.14           99.93
## 6   Floor         1       0.07          100.00

##   heating_qc frequency percentage cumulative_perc
## 1         Ex       741      50.75           50.75
## 2         TA       428      29.32           80.07
## 3         Gd       241      16.51           96.58
## 4         Fa        49       3.36           99.94
## 5         Po         1       0.07          100.00

##   house_style frequency percentage cumulative_perc
## 1      1Story       726      49.73           49.73
## 2      2Story       445      30.48           80.21
## 3      1.5Fin       154      10.55           90.76
## 4        SLvl        65       4.45           95.21
## 5      SFoyer        37       2.53           97.74
## 6      1.5Unf        14       0.96           98.70
## 7      2.5Unf        11       0.75           99.45
## 8      2.5Fin         8       0.55          100.00

##   kitchen_qual frequency percentage cumulative_perc
## 1           TA       735      50.34           50.34
## 2           Gd       586      40.14           90.48
## 3           Ex       100       6.85           97.33
## 4           Fa        39       2.67          100.00

##   land_contour frequency percentage cumulative_perc
## 1          Lvl      1311      89.79           89.79
## 2          Bnk        63       4.32           94.11
## 3          HLS        50       3.42           97.53
## 4          Low        36       2.47          100.00

##   land_slope frequency percentage cumulative_perc
## 1        Gtl      1382      94.66           94.66
## 2        Mod        65       4.45           99.11
## 3        Sev        13       0.89          100.00

##   lot_config frequency percentage cumulative_perc
## 1     Inside      1052      72.05           72.05
## 2     Corner       263      18.01           90.06
## 3    CulDSac        94       6.44           96.50
## 4        FR2        47       3.22           99.72
## 5        FR3         4       0.27          100.00

##   lot_shape frequency percentage cumulative_perc
## 1       Reg       925      63.36           63.36
## 2       IR1       484      33.15           96.51
## 3       IR2        41       2.81           99.32
## 4       IR3        10       0.68          100.00

##   mas_vnr_type frequency percentage cumulative_perc
## 1         None       864      59.18           59.18
## 2      BrkFace       445      30.48           89.66
## 3        Stone       128       8.77           98.43
## 4       BrkCmn        15       1.03           99.46
## 5         <NA>         8       0.55          100.00

##   misc_feature frequency percentage cumulative_perc
## 1         <NA>      1406      96.30           96.30
## 2         Shed        49       3.36           99.66
## 3         Gar2         2       0.14           99.80
## 4         Othr         2       0.14           99.94
## 5         TenC         1       0.07          100.00

##   ms_zoning frequency percentage cumulative_perc
## 1        RL      1151      78.84           78.84
## 2        RM       218      14.93           93.77
## 3        FV        65       4.45           98.22
## 4        RH        16       1.10           99.32
## 5   C (all)        10       0.68          100.00

##    neighborhood frequency percentage cumulative_perc
## 1         NAmes       225      15.41           15.41
## 2       CollgCr       150      10.27           25.68
## 3       OldTown       113       7.74           33.42
## 4       Edwards       100       6.85           40.27
## 5       Somerst        86       5.89           46.16
## 6       Gilbert        79       5.41           51.57
## 7       NridgHt        77       5.27           56.84
## 8        Sawyer        74       5.07           61.91
## 9        NWAmes        73       5.00           66.91
## 10      SawyerW        59       4.04           70.95
## 11      BrkSide        58       3.97           74.92
## 12      Crawfor        51       3.49           78.41
## 13      Mitchel        49       3.36           81.77
## 14      NoRidge        41       2.81           84.58
## 15       Timber        38       2.60           87.18
## 16       IDOTRR        37       2.53           89.71
## 17      ClearCr        28       1.92           91.63
## 18      StoneBr        25       1.71           93.34
## 19        SWISU        25       1.71           95.05
## 20      Blmngtn        17       1.16           96.21
## 21      MeadowV        17       1.16           97.37
## 22       BrDale        16       1.10           98.47
## 23      Veenker        11       0.75           99.22
## 24      NPkVill         9       0.62           99.84
## 25      Blueste         2       0.14          100.00

##   paved_drive frequency percentage cumulative_perc
## 1           Y      1340      91.78           91.78
## 2           N        90       6.16           97.94
## 3           P        30       2.05          100.00

##   pool_qc frequency percentage cumulative_perc
## 1    <NA>      1453      99.52           99.52
## 2      Gd         3       0.21           99.73
## 3      Ex         2       0.14           99.87
## 4      Fa         2       0.14          100.00

##   roof_matl frequency percentage cumulative_perc
## 1   CompShg      1434      98.22           98.22
## 2   Tar&Grv        11       0.75           98.97
## 3   WdShngl         6       0.41           99.38
## 4   WdShake         5       0.34           99.72
## 5   ClyTile         1       0.07           99.79
## 6   Membran         1       0.07           99.86
## 7     Metal         1       0.07           99.93
## 8      Roll         1       0.07          100.00

##   roof_style frequency percentage cumulative_perc
## 1      Gable      1141      78.15           78.15
## 2        Hip       286      19.59           97.74
## 3       Flat        13       0.89           98.63
## 4    Gambrel        11       0.75           99.38
## 5    Mansard         7       0.48           99.86
## 6       Shed         2       0.14          100.00

##   sale_condition frequency percentage cumulative_perc
## 1         Normal      1198      82.05           82.05
## 2        Partial       125       8.56           90.61
## 3        Abnorml       101       6.92           97.53
## 4         Family        20       1.37           98.90
## 5         Alloca        12       0.82           99.72
## 6        AdjLand         4       0.27          100.00

##   sale_type frequency percentage cumulative_perc
## 1        WD      1267      86.78           86.78
## 2       New       122       8.36           95.14
## 3       COD        43       2.95           98.09
## 4     ConLD         9       0.62           98.71
## 5     ConLI         5       0.34           99.05
## 6     ConLw         5       0.34           99.39
## 7       CWD         4       0.27           99.66
## 8       Oth         3       0.21           99.87
## 9       Con         2       0.14          100.00

##   street frequency percentage cumulative_perc
## 1   Pave      1454      99.59           99.59
## 2   Grvl         6       0.41          100.00

##   utilities frequency percentage cumulative_perc
## 1    AllPub      1459      99.93           99.93
## 2    NoSeWa         1       0.07          100.00
## [1] "Variables processed: alley, bldg_type, bsmt_cond, bsmt_exposure, bsmt_fin_type1, bsmt_fin_type2, bsmt_qual, central_air, condition1, condition2, electrical, exter_cond, exter_qual, exterior1st, exterior2nd, fence, fireplace_qu, foundation, functional, garage_cond, garage_finish, garage_qual, garage_type, heating, heating_qc, house_style, kitchen_qual, land_contour, land_slope, lot_config, lot_shape, mas_vnr_type, misc_feature, ms_zoning, neighborhood, paved_drive, pool_qc, roof_matl, roof_style, sale_condition, sale_type, street, utilities"

viz: mosaic plots: ggpairs

#documentation: https://mran.microsoft.com/snapshot/2016-01-12/web/packages/GGally/vignettes/ggpairs.html

a %>% select(nom.vars) %>% 
  GGally::ggpairs(
    columns = c(nom.vars[13], nom.vars[32]),
    mapping = aes(color = eval(as.name(nom.vars[13])))
)

Description of Masonry veneer type

  • BrkCmn Brick Common
  • BrkFace Brick Face
  • CBlock Cinder Block
  • None None
  • Stone Stone

Description of Exterior Quality Levels

  • Ex Excellent
  • Gd Good
  • TA Average/Typical
  • Fa Fair
  • Po Poor

Not too surprisingly, Many of the Brick type Masonry veneer type had Exterior Qualities of Good and Excellent

EDA: num vars

names

(num.vars = a %>% select(where(is.numeric)) %>% colnames %>% as.character)
##  [1] "bedroom_abv_gr"  "bsmt_fin_sf1"    "bsmt_fin_sf2"    "bsmt_full_bath" 
##  [5] "bsmt_half_bath"  "bsmt_unf_sf"     "enclosed_porch"  "fireplaces"     
##  [9] "full_bath"       "garage_area"     "garage_cars"     "garage_yr_blt"  
## [13] "gr_liv_area"     "half_bath"       "kitchen_abv_gr"  "lot_area"       
## [17] "lot_frontage"    "low_qual_fin_sf" "mas_vnr_area"    "misc_val"       
## [21] "mo_sold"         "ms_sub_class"    "open_porch_sf"   "overall_cond"   
## [25] "overall_qual"    "pool_area"       "sale_price"      "screen_porch"   
## [29] "tot_rms_abv_grd" "total_bsmt_sf"   "wood_deck_sf"    "x1st_flr_sf"    
## [33] "x2nd_flr_sf"     "x3ssn_porch"     "year_built"      "year_remod_add" 
## [37] "yr_sold"

check missing

a %>% select(num.vars) %>% miss_var_summary
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(num.vars)` instead of `num.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

sample

a %>% select(num.vars) %>% sample_n(5)

skim

a %>% select(num.vars) %>% skim_without_charts()
Data summary
Name Piped data
Number of rows 1460
Number of columns 37
_______________________
Column type frequency:
numeric 37
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
bedroom_abv_gr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
bsmt_fin_sf1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
bsmt_fin_sf2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
bsmt_full_bath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
bsmt_half_bath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
bsmt_unf_sf 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
enclosed_porch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
full_bath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
garage_area 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
garage_cars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
garage_yr_blt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
gr_liv_area 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
half_bath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
kitchen_abv_gr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
lot_area 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
lot_frontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
low_qual_fin_sf 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
mas_vnr_area 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
misc_val 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
mo_sold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
ms_sub_class 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
open_porch_sf 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
overall_cond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
overall_qual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
pool_area 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
sale_price 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000
screen_porch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
tot_rms_abv_grd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
total_bsmt_sf 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
wood_deck_sf 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
x1st_flr_sf 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
x2nd_flr_sf 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
x3ssn_porch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
year_built 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
year_remod_add 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
yr_sold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010

viz: distribution - hist

a %>% select(num.vars) %>% DataExplorer::plot_histogram(
  scale_x = 'log10',
  #geom_histogram_args = list(bins = 50L),
  ncol = 2, nrow = 2
)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

viz: distribution - density

a %>% select(num.vars) %>% DataExplorer::plot_density(
  scale_x = 'log10',
  #geom_histogram_args = list(bins = 50L),
  ncol = 2, nrow = 2
)

viz: outliers

a %>% select(num.vars) %>% dlookr::plot_outlier()

viz: normality

a %>% select(num.vars) %>% dlookr::plot_normality()

viz: normality: outcome var only

a %>% select(sale_price) %>% dlookr::plot_normality()

consider log transforming

viz: correlations

a %>% select(num.vars) %>% visdat::vis_cor()

#a %>% select(num.vars) %>% dlookr::plot_correlate()

a %>% select(num.vars) %>% GGally::ggcorr(low = 'red', high = 'darkgreen', label = TRUE)

Observations:

  1. Not surprisingly there are positive cor among certain vars (list below not comprehensive):
    • gr_liv_area & tot_rms_abv_grd
    • gr_liv_area & sale_price
    • bedroom_abv_gr & tot_rms_abv_grd
    • yr_blt & garage_yr_blt
  2. Consider doing PCA on correlated vars

EDA: nom/num vars

target.var = 'sale_price'

viz: cor w/target - pearson

a %>% select(num.vars) %>% funModeling::correlation_table(target = target.var)

quick and dirty pseudo feature importance

viz: cross plot - input/target distribution

### useful when outcome var is a binary var
a %>% mutate(
  overall_qual.gte.8.flg = if_else(
    overall_qual >= 8, 'yes', 'no'
  )
) %>% funModeling::cross_plot(
  input = nom.vars,
  target = "overall_qual.gte.8.flg"
  )

quick analysis: binary target

#This function is used to analyze data when we need to reduce variable cardinality in predictive modeling.
#works in conjunction with 'cross_plot';
#a %>% funModeling::categ_analysis(input = nom.vars[13], target = 'overall_qual.gte.8.flg')

Of the subset of homes whose exterior quality was excellent, 88.5% had an overall quality rating of 8 or better

Preprocess Data

1) split data

2) create recipe spec

#order reference: https://recipes.tidymodels.org/articles/Ordering.html

library(tidymodels)

rec.en = train %>% recipe(sale_price ~ . ) %>% 
  step_mutate(sale_price = log10(sale_price), skip = TRUE) %>% 
  step_rm(pool_qc, misc_feature, alley, fence) %>% #excessive nas
  step_bagimpute(
    lot_frontage, garage_yr_blt, mas_vnr_area
    ) %>% 
  step_unknown(all_nominal()) %>% #assign 'unknown' level to vars with nas
  step_corr(all_numeric(),-all_outcomes()) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%  #There are new levels in a factor
  step_nzv(all_predictors(),-all_outcomes()) %>%
  step_zv(all_predictors(),-all_outcomes()) %>% 
  step_normalize(all_numeric(),-all_outcomes())

rec.en %>% tidy
#----------------------------

rec.rf = train %>% recipe(sale_price ~ . ) %>% 
  step_mutate(sale_price = log10(sale_price), skip = TRUE) %>% 
  step_rm(pool_qc, misc_feature, alley, fence) %>% #excessive nas
  step_bagimpute(all_numeric(),-all_outcomes()) %>% 
  step_knnimpute(all_nominal(),-all_outcomes()) %>%
  step_corr(all_numeric(),-all_outcomes()) %>%
  #step_dummy(all_nominal(), one_hot = TRUE) %>%  #There are new levels in a factor
  step_nzv(all_predictors(),-all_outcomes()) %>%
  step_zv(all_predictors(),-all_outcomes()) %>% 
  step_normalize(all_numeric(),-all_outcomes())

rec.rf %>% tidy

3) preprocess

check missing

4) create model spec

mdl.en = parsnip::linear_reg(
  penalty = tune(),
  mixture = tune() #lasso/ridge mix
) %>% 
  set_mode('regression') %>% 
  set_engine('glmnet')

mdl.rf = parsnip::rand_forest(
  trees = 150,
  min_n = tune(), #min number of observations at terminal node
  mtry = tune() #number of vars to randomly subset at each node
) %>% 
  set_mode('regression') %>% 
  set_engine('ranger', importance = 'impurity_corrected')

5) create workflow spec

wf.en = workflow() %>% 
  add_recipe(rec.en) %>% 
  add_model(mdl.en)
#----------------------------
wf.rf = workflow() %>% 
  add_recipe(rec.rf) %>% 
  add_model(mdl.rf)

6) execute workflow on vfold using auto hp tuning

doParallel::registerDoParallel() #use parallel processing
set.seed(345)

tg.en = tune_grid(
  object = wf.en,
  resamples = vfold, 
  grid = 5 #Create a tuning grid AUTOMATICALLY
)

tg.en %>% collect_metrics()
#----------------------------

tg.rf = tune_grid(
  object = wf.rf,
  resamples = vfold, 
  grid = 5 #Create a tuning grid AUTOMATICALLY
)
tg.rf$.notes
## [[1]]
## # A tibble: 5 x 1
##   .notes                                                                        
##   <chr>                                                                         
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 
## [[2]]
## # A tibble: 5 x 1
##   .notes                                                                        
##   <chr>                                                                         
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 
## [[3]]
## # A tibble: 5 x 1
##   .notes                                                                        
##   <chr>                                                                         
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 
## [[4]]
## # A tibble: 5 x 1
##   .notes                                                                        
##   <chr>                                                                         
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 
## [[5]]
## # A tibble: 6 x 1
##   .notes                                                                        
##   <chr>                                                                         
## 1 recipe: skipping variable with zero or non-finite range., the standard deviat~
## 2 model 1/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 3 model 2/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 4 model 3/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 5 model 4/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 6 model 5/5 (predictions): skipping variable with zero or non-finite range., Fo~
tg.rf %>% collect_metrics()

viz evaluation metrics

ggplotly(
tg.en %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, penalty, mixture) %>%
  pivot_longer(penalty:mixture,
    values_to = "value",
    names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE, size = 3) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(title = 'Elastic Net RMSE by Hyperparameter', x = NULL, y = '')
)
tg.en %>% show_best('rmse')
#----------------------------

ggplotly(
tg.rf %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(min_n:mtry,
    values_to = "value",
    names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE, size = 3) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(title = 'Random Forest RMSE by Hyperparameter', x = NULL, y = '')
)
tg.rf %>% show_best('rmse')

7) select best hps

(best.hps.en = tg.en %>% select_best('rmse'))
#----------------------------
(best.hps.rf = tg.rf %>% select_best('rmse'))

8) finalize workflow & fit model

wf.en.fin = wf.en %>% 
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(best.hps.en) %>%
  #2) fit on entire train
  fit(train)

#----------------------------

wf.rf.fin = wf.rf %>% 
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(best.hps.rf) %>%
  #2) fit on entire train
  fit(train)

9) check vip

wf.en.fin %>% pull_workflow_fit() %>% vip(geom = 'point') + labs(title = 'ENET var importance')

#wf.en.fin %>% pull_workflow_fit() %>% vi
wf.rf.fin %>% pull_workflow_fit() %>% vip(geom = 'point') + labs(title = 'RF var importance')

#wf.rf.fin %>% pull_workflow_fit() %>% vi

10) make preds on test

preds.en = wf.en.fin %>% predict(test)

(10 ^ preds.en) %>% plot_ly(x = ~.pred) %>% layout(title = 'Distribution: Elastic Net Sale Price Predictions')
## No trace type specified:
##   Based on info supplied, a 'histogram' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#histogram
(10 ^ preds.en) %>% plot_ly(x = ~.pred) %>% add_boxplot() %>% layout(title = 'Distribution: Elastic Net Sale Price Predictions')
preds.rf = wf.rf.fin %>% predict(test)

(10 ^ preds.rf) %>% plot_ly(x = ~.pred) %>% layout(title = 'Distribution: Random Forest Sale Price Predictions')
## No trace type specified:
##   Based on info supplied, a 'histogram' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#histogram
(10 ^ preds.rf) %>% plot_ly(x = ~.pred) %>% add_boxplot() %>% layout(title = 'Distribution: Random Forest Sale Price Predictions')