#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))Predict Housing Prices using 2 supervised ml algos:
a = read_csv('train.csv') %>%
clean_names() %>%
mutate(across(where(is.character), factor)) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
where(is.factor),
where(is_character),
where(is_numeric)
)
test = read_csv('test.csv') %>%
clean_names() %>%
mutate(across(where(is.character), factor)) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
where(is.factor),
where(is_character),
where(is_numeric)
)
a %>% sample_n(10)Observations
## [1] "alley" "bldg_type" "bsmt_cond" "bsmt_exposure"
## [5] "bsmt_fin_type1" "bsmt_fin_type2" "bsmt_qual" "central_air"
## [9] "condition1" "condition2" "electrical" "exter_cond"
## [13] "exter_qual" "exterior1st" "exterior2nd" "fence"
## [17] "fireplace_qu" "foundation" "functional" "garage_cond"
## [21] "garage_finish" "garage_qual" "garage_type" "heating"
## [25] "heating_qc" "house_style" "kitchen_qual" "land_contour"
## [29] "land_slope" "lot_config" "lot_shape" "mas_vnr_type"
## [33] "misc_feature" "ms_zoning" "neighborhood" "paved_drive"
## [37] "pool_qc" "roof_matl" "roof_style" "sale_condition"
## [41] "sale_type" "street" "utilities"
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nom.vars)` instead of `nom.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
| Name | Piped data |
| Number of rows | 1460 |
| Number of columns | 43 |
| _______________________ | |
| Column type frequency: | |
| factor | 43 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| alley | 1369 | 0.06 | FALSE | 2 | Grv: 50, Pav: 41 |
| bldg_type | 0 | 1.00 | FALSE | 5 | 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43 |
| bsmt_cond | 37 | 0.97 | FALSE | 4 | TA: 1311, Gd: 65, Fa: 45, Po: 2 |
| bsmt_exposure | 38 | 0.97 | FALSE | 4 | No: 953, Av: 221, Gd: 134, Mn: 114 |
| bsmt_fin_type1 | 37 | 0.97 | FALSE | 6 | Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148 |
| bsmt_fin_type2 | 38 | 0.97 | FALSE | 6 | Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33 |
| bsmt_qual | 37 | 0.97 | FALSE | 4 | TA: 649, Gd: 618, Ex: 121, Fa: 35 |
| central_air | 0 | 1.00 | FALSE | 2 | Y: 1365, N: 95 |
| condition1 | 0 | 1.00 | FALSE | 9 | Nor: 1260, Fee: 81, Art: 48, RRA: 26 |
| condition2 | 0 | 1.00 | FALSE | 8 | Nor: 1445, Fee: 6, Art: 2, Pos: 2 |
| electrical | 1 | 1.00 | FALSE | 5 | SBr: 1334, Fus: 94, Fus: 27, Fus: 3 |
| exter_cond | 0 | 1.00 | FALSE | 5 | TA: 1282, Gd: 146, Fa: 28, Ex: 3 |
| exter_qual | 0 | 1.00 | FALSE | 4 | TA: 906, Gd: 488, Ex: 52, Fa: 14 |
| exterior1st | 0 | 1.00 | FALSE | 15 | Vin: 515, HdB: 222, Met: 220, Wd : 206 |
| exterior2nd | 0 | 1.00 | FALSE | 16 | Vin: 504, Met: 214, HdB: 207, Wd : 197 |
| fence | 1179 | 0.19 | FALSE | 4 | MnP: 157, GdP: 59, GdW: 54, MnW: 11 |
| fireplace_qu | 690 | 0.53 | FALSE | 5 | Gd: 380, TA: 313, Fa: 33, Ex: 24 |
| foundation | 0 | 1.00 | FALSE | 6 | PCo: 647, CBl: 634, Brk: 146, Sla: 24 |
| functional | 0 | 1.00 | FALSE | 7 | Typ: 1360, Min: 34, Min: 31, Mod: 15 |
| garage_cond | 81 | 0.94 | FALSE | 5 | TA: 1326, Fa: 35, Gd: 9, Po: 7 |
| garage_finish | 81 | 0.94 | FALSE | 3 | Unf: 605, RFn: 422, Fin: 352 |
| garage_qual | 81 | 0.94 | FALSE | 5 | TA: 1311, Fa: 48, Gd: 14, Ex: 3 |
| garage_type | 81 | 0.94 | FALSE | 6 | Att: 870, Det: 387, Bui: 88, Bas: 19 |
| heating | 0 | 1.00 | FALSE | 6 | Gas: 1428, Gas: 18, Gra: 7, Wal: 4 |
| heating_qc | 0 | 1.00 | FALSE | 5 | Ex: 741, TA: 428, Gd: 241, Fa: 49 |
| house_style | 0 | 1.00 | FALSE | 8 | 1St: 726, 2St: 445, 1.5: 154, SLv: 65 |
| kitchen_qual | 0 | 1.00 | FALSE | 4 | TA: 735, Gd: 586, Ex: 100, Fa: 39 |
| land_contour | 0 | 1.00 | FALSE | 4 | Lvl: 1311, Bnk: 63, HLS: 50, Low: 36 |
| land_slope | 0 | 1.00 | FALSE | 3 | Gtl: 1382, Mod: 65, Sev: 13 |
| lot_config | 0 | 1.00 | FALSE | 5 | Ins: 1052, Cor: 263, Cul: 94, FR2: 47 |
| lot_shape | 0 | 1.00 | FALSE | 4 | Reg: 925, IR1: 484, IR2: 41, IR3: 10 |
| mas_vnr_type | 8 | 0.99 | FALSE | 4 | Non: 864, Brk: 445, Sto: 128, Brk: 15 |
| misc_feature | 1406 | 0.04 | FALSE | 4 | She: 49, Gar: 2, Oth: 2, Ten: 1 |
| ms_zoning | 0 | 1.00 | FALSE | 5 | RL: 1151, RM: 218, FV: 65, RH: 16 |
| neighborhood | 0 | 1.00 | FALSE | 25 | NAm: 225, Col: 150, Old: 113, Edw: 100 |
| paved_drive | 0 | 1.00 | FALSE | 3 | Y: 1340, N: 90, P: 30 |
| pool_qc | 1453 | 0.00 | FALSE | 3 | Gd: 3, Ex: 2, Fa: 2 |
| roof_matl | 0 | 1.00 | FALSE | 8 | Com: 1434, Tar: 11, WdS: 6, WdS: 5 |
| roof_style | 0 | 1.00 | FALSE | 6 | Gab: 1141, Hip: 286, Fla: 13, Gam: 11 |
| sale_condition | 0 | 1.00 | FALSE | 6 | Nor: 1198, Par: 125, Abn: 101, Fam: 20 |
| sale_type | 0 | 1.00 | FALSE | 9 | WD: 1267, New: 122, COD: 43, Con: 9 |
| street | 0 | 1.00 | FALSE | 2 | Pav: 1454, Grv: 6 |
| utilities | 0 | 1.00 | FALSE | 2 | All: 1459, NoS: 1 |
a %>% select(nom.vars) %>% map(n_unique) %>% as.data.frame.list %>% pivot_longer(everything()) %>% mutate(name = fct_reorder(name, value)) %>% plot_ly(x = ~value, y = ~name, color = ~name) %>% hide_legend() %>% layout(title = 'factor vars level counts', xaxis = list(title = 'count'), yaxis = list(title = ''))## alley frequency percentage cumulative_perc
## 1 <NA> 1369 93.77 93.77
## 2 Grvl 50 3.42 97.19
## 3 Pave 41 2.81 100.00
## bldg_type frequency percentage cumulative_perc
## 1 1Fam 1220 83.56 83.56
## 2 TwnhsE 114 7.81 91.37
## 3 Duplex 52 3.56 94.93
## 4 Twnhs 43 2.95 97.88
## 5 2fmCon 31 2.12 100.00
## bsmt_cond frequency percentage cumulative_perc
## 1 TA 1311 89.79 89.79
## 2 Gd 65 4.45 94.24
## 3 Fa 45 3.08 97.32
## 4 <NA> 37 2.53 99.85
## 5 Po 2 0.14 100.00
## bsmt_exposure frequency percentage cumulative_perc
## 1 No 953 65.27 65.27
## 2 Av 221 15.14 80.41
## 3 Gd 134 9.18 89.59
## 4 Mn 114 7.81 97.40
## 5 <NA> 38 2.60 100.00
## bsmt_fin_type1 frequency percentage cumulative_perc
## 1 Unf 430 29.45 29.45
## 2 GLQ 418 28.63 58.08
## 3 ALQ 220 15.07 73.15
## 4 BLQ 148 10.14 83.29
## 5 Rec 133 9.11 92.40
## 6 LwQ 74 5.07 97.47
## 7 <NA> 37 2.53 100.00
## bsmt_fin_type2 frequency percentage cumulative_perc
## 1 Unf 1256 86.03 86.03
## 2 Rec 54 3.70 89.73
## 3 LwQ 46 3.15 92.88
## 4 <NA> 38 2.60 95.48
## 5 BLQ 33 2.26 97.74
## 6 ALQ 19 1.30 99.04
## 7 GLQ 14 0.96 100.00
## bsmt_qual frequency percentage cumulative_perc
## 1 TA 649 44.45 44.45
## 2 Gd 618 42.33 86.78
## 3 Ex 121 8.29 95.07
## 4 <NA> 37 2.53 97.60
## 5 Fa 35 2.40 100.00
## central_air frequency percentage cumulative_perc
## 1 Y 1365 93.49 93.49
## 2 N 95 6.51 100.00
## condition1 frequency percentage cumulative_perc
## 1 Norm 1260 86.30 86.30
## 2 Feedr 81 5.55 91.85
## 3 Artery 48 3.29 95.14
## 4 RRAn 26 1.78 96.92
## 5 PosN 19 1.30 98.22
## 6 RRAe 11 0.75 98.97
## 7 PosA 8 0.55 99.52
## 8 RRNn 5 0.34 99.86
## 9 RRNe 2 0.14 100.00
## condition2 frequency percentage cumulative_perc
## 1 Norm 1445 98.97 98.97
## 2 Feedr 6 0.41 99.38
## 3 Artery 2 0.14 99.52
## 4 PosN 2 0.14 99.66
## 5 RRNn 2 0.14 99.80
## 6 PosA 1 0.07 99.87
## 7 RRAe 1 0.07 99.94
## 8 RRAn 1 0.07 100.00
## electrical frequency percentage cumulative_perc
## 1 SBrkr 1334 91.37 91.37
## 2 FuseA 94 6.44 97.81
## 3 FuseF 27 1.85 99.66
## 4 FuseP 3 0.21 99.87
## 5 Mix 1 0.07 99.94
## 6 <NA> 1 0.07 100.00
## exter_cond frequency percentage cumulative_perc
## 1 TA 1282 87.81 87.81
## 2 Gd 146 10.00 97.81
## 3 Fa 28 1.92 99.73
## 4 Ex 3 0.21 99.94
## 5 Po 1 0.07 100.00
## exter_qual frequency percentage cumulative_perc
## 1 TA 906 62.05 62.05
## 2 Gd 488 33.42 95.47
## 3 Ex 52 3.56 99.03
## 4 Fa 14 0.96 100.00
## exterior1st frequency percentage cumulative_perc
## 1 VinylSd 515 35.27 35.27
## 2 HdBoard 222 15.21 50.48
## 3 MetalSd 220 15.07 65.55
## 4 Wd Sdng 206 14.11 79.66
## 5 Plywood 108 7.40 87.06
## 6 CemntBd 61 4.18 91.24
## 7 BrkFace 50 3.42 94.66
## 8 WdShing 26 1.78 96.44
## 9 Stucco 25 1.71 98.15
## 10 AsbShng 20 1.37 99.52
## 11 BrkComm 2 0.14 99.66
## 12 Stone 2 0.14 99.80
## 13 AsphShn 1 0.07 99.87
## 14 CBlock 1 0.07 99.94
## 15 ImStucc 1 0.07 100.00
## exterior2nd frequency percentage cumulative_perc
## 1 VinylSd 504 34.52 34.52
## 2 MetalSd 214 14.66 49.18
## 3 HdBoard 207 14.18 63.36
## 4 Wd Sdng 197 13.49 76.85
## 5 Plywood 142 9.73 86.58
## 6 CmentBd 60 4.11 90.69
## 7 Wd Shng 38 2.60 93.29
## 8 Stucco 26 1.78 95.07
## 9 BrkFace 25 1.71 96.78
## 10 AsbShng 20 1.37 98.15
## 11 ImStucc 10 0.68 98.83
## 12 Brk Cmn 7 0.48 99.31
## 13 Stone 5 0.34 99.65
## 14 AsphShn 3 0.21 99.86
## 15 CBlock 1 0.07 99.93
## 16 Other 1 0.07 100.00
## fence frequency percentage cumulative_perc
## 1 <NA> 1179 80.75 80.75
## 2 MnPrv 157 10.75 91.50
## 3 GdPrv 59 4.04 95.54
## 4 GdWo 54 3.70 99.24
## 5 MnWw 11 0.75 100.00
## fireplace_qu frequency percentage cumulative_perc
## 1 <NA> 690 47.26 47.26
## 2 Gd 380 26.03 73.29
## 3 TA 313 21.44 94.73
## 4 Fa 33 2.26 96.99
## 5 Ex 24 1.64 98.63
## 6 Po 20 1.37 100.00
## foundation frequency percentage cumulative_perc
## 1 PConc 647 44.32 44.32
## 2 CBlock 634 43.42 87.74
## 3 BrkTil 146 10.00 97.74
## 4 Slab 24 1.64 99.38
## 5 Stone 6 0.41 99.79
## 6 Wood 3 0.21 100.00
## functional frequency percentage cumulative_perc
## 1 Typ 1360 93.15 93.15
## 2 Min2 34 2.33 95.48
## 3 Min1 31 2.12 97.60
## 4 Mod 15 1.03 98.63
## 5 Maj1 14 0.96 99.59
## 6 Maj2 5 0.34 99.93
## 7 Sev 1 0.07 100.00
## garage_cond frequency percentage cumulative_perc
## 1 TA 1326 90.82 90.82
## 2 <NA> 81 5.55 96.37
## 3 Fa 35 2.40 98.77
## 4 Gd 9 0.62 99.39
## 5 Po 7 0.48 99.87
## 6 Ex 2 0.14 100.00
## garage_finish frequency percentage cumulative_perc
## 1 Unf 605 41.44 41.44
## 2 RFn 422 28.90 70.34
## 3 Fin 352 24.11 94.45
## 4 <NA> 81 5.55 100.00
## garage_qual frequency percentage cumulative_perc
## 1 TA 1311 89.79 89.79
## 2 <NA> 81 5.55 95.34
## 3 Fa 48 3.29 98.63
## 4 Gd 14 0.96 99.59
## 5 Ex 3 0.21 99.80
## 6 Po 3 0.21 100.00
## garage_type frequency percentage cumulative_perc
## 1 Attchd 870 59.59 59.59
## 2 Detchd 387 26.51 86.10
## 3 BuiltIn 88 6.03 92.13
## 4 <NA> 81 5.55 97.68
## 5 Basment 19 1.30 98.98
## 6 CarPort 9 0.62 99.60
## 7 2Types 6 0.41 100.00
## heating frequency percentage cumulative_perc
## 1 GasA 1428 97.81 97.81
## 2 GasW 18 1.23 99.04
## 3 Grav 7 0.48 99.52
## 4 Wall 4 0.27 99.79
## 5 OthW 2 0.14 99.93
## 6 Floor 1 0.07 100.00
## heating_qc frequency percentage cumulative_perc
## 1 Ex 741 50.75 50.75
## 2 TA 428 29.32 80.07
## 3 Gd 241 16.51 96.58
## 4 Fa 49 3.36 99.94
## 5 Po 1 0.07 100.00
## house_style frequency percentage cumulative_perc
## 1 1Story 726 49.73 49.73
## 2 2Story 445 30.48 80.21
## 3 1.5Fin 154 10.55 90.76
## 4 SLvl 65 4.45 95.21
## 5 SFoyer 37 2.53 97.74
## 6 1.5Unf 14 0.96 98.70
## 7 2.5Unf 11 0.75 99.45
## 8 2.5Fin 8 0.55 100.00
## kitchen_qual frequency percentage cumulative_perc
## 1 TA 735 50.34 50.34
## 2 Gd 586 40.14 90.48
## 3 Ex 100 6.85 97.33
## 4 Fa 39 2.67 100.00
## land_contour frequency percentage cumulative_perc
## 1 Lvl 1311 89.79 89.79
## 2 Bnk 63 4.32 94.11
## 3 HLS 50 3.42 97.53
## 4 Low 36 2.47 100.00
## land_slope frequency percentage cumulative_perc
## 1 Gtl 1382 94.66 94.66
## 2 Mod 65 4.45 99.11
## 3 Sev 13 0.89 100.00
## lot_config frequency percentage cumulative_perc
## 1 Inside 1052 72.05 72.05
## 2 Corner 263 18.01 90.06
## 3 CulDSac 94 6.44 96.50
## 4 FR2 47 3.22 99.72
## 5 FR3 4 0.27 100.00
## lot_shape frequency percentage cumulative_perc
## 1 Reg 925 63.36 63.36
## 2 IR1 484 33.15 96.51
## 3 IR2 41 2.81 99.32
## 4 IR3 10 0.68 100.00
## mas_vnr_type frequency percentage cumulative_perc
## 1 None 864 59.18 59.18
## 2 BrkFace 445 30.48 89.66
## 3 Stone 128 8.77 98.43
## 4 BrkCmn 15 1.03 99.46
## 5 <NA> 8 0.55 100.00
## misc_feature frequency percentage cumulative_perc
## 1 <NA> 1406 96.30 96.30
## 2 Shed 49 3.36 99.66
## 3 Gar2 2 0.14 99.80
## 4 Othr 2 0.14 99.94
## 5 TenC 1 0.07 100.00
## ms_zoning frequency percentage cumulative_perc
## 1 RL 1151 78.84 78.84
## 2 RM 218 14.93 93.77
## 3 FV 65 4.45 98.22
## 4 RH 16 1.10 99.32
## 5 C (all) 10 0.68 100.00
## neighborhood frequency percentage cumulative_perc
## 1 NAmes 225 15.41 15.41
## 2 CollgCr 150 10.27 25.68
## 3 OldTown 113 7.74 33.42
## 4 Edwards 100 6.85 40.27
## 5 Somerst 86 5.89 46.16
## 6 Gilbert 79 5.41 51.57
## 7 NridgHt 77 5.27 56.84
## 8 Sawyer 74 5.07 61.91
## 9 NWAmes 73 5.00 66.91
## 10 SawyerW 59 4.04 70.95
## 11 BrkSide 58 3.97 74.92
## 12 Crawfor 51 3.49 78.41
## 13 Mitchel 49 3.36 81.77
## 14 NoRidge 41 2.81 84.58
## 15 Timber 38 2.60 87.18
## 16 IDOTRR 37 2.53 89.71
## 17 ClearCr 28 1.92 91.63
## 18 StoneBr 25 1.71 93.34
## 19 SWISU 25 1.71 95.05
## 20 Blmngtn 17 1.16 96.21
## 21 MeadowV 17 1.16 97.37
## 22 BrDale 16 1.10 98.47
## 23 Veenker 11 0.75 99.22
## 24 NPkVill 9 0.62 99.84
## 25 Blueste 2 0.14 100.00
## paved_drive frequency percentage cumulative_perc
## 1 Y 1340 91.78 91.78
## 2 N 90 6.16 97.94
## 3 P 30 2.05 100.00
## pool_qc frequency percentage cumulative_perc
## 1 <NA> 1453 99.52 99.52
## 2 Gd 3 0.21 99.73
## 3 Ex 2 0.14 99.87
## 4 Fa 2 0.14 100.00
## roof_matl frequency percentage cumulative_perc
## 1 CompShg 1434 98.22 98.22
## 2 Tar&Grv 11 0.75 98.97
## 3 WdShngl 6 0.41 99.38
## 4 WdShake 5 0.34 99.72
## 5 ClyTile 1 0.07 99.79
## 6 Membran 1 0.07 99.86
## 7 Metal 1 0.07 99.93
## 8 Roll 1 0.07 100.00
## roof_style frequency percentage cumulative_perc
## 1 Gable 1141 78.15 78.15
## 2 Hip 286 19.59 97.74
## 3 Flat 13 0.89 98.63
## 4 Gambrel 11 0.75 99.38
## 5 Mansard 7 0.48 99.86
## 6 Shed 2 0.14 100.00
## sale_condition frequency percentage cumulative_perc
## 1 Normal 1198 82.05 82.05
## 2 Partial 125 8.56 90.61
## 3 Abnorml 101 6.92 97.53
## 4 Family 20 1.37 98.90
## 5 Alloca 12 0.82 99.72
## 6 AdjLand 4 0.27 100.00
## sale_type frequency percentage cumulative_perc
## 1 WD 1267 86.78 86.78
## 2 New 122 8.36 95.14
## 3 COD 43 2.95 98.09
## 4 ConLD 9 0.62 98.71
## 5 ConLI 5 0.34 99.05
## 6 ConLw 5 0.34 99.39
## 7 CWD 4 0.27 99.66
## 8 Oth 3 0.21 99.87
## 9 Con 2 0.14 100.00
## street frequency percentage cumulative_perc
## 1 Pave 1454 99.59 99.59
## 2 Grvl 6 0.41 100.00
## utilities frequency percentage cumulative_perc
## 1 AllPub 1459 99.93 99.93
## 2 NoSeWa 1 0.07 100.00
## [1] "Variables processed: alley, bldg_type, bsmt_cond, bsmt_exposure, bsmt_fin_type1, bsmt_fin_type2, bsmt_qual, central_air, condition1, condition2, electrical, exter_cond, exter_qual, exterior1st, exterior2nd, fence, fireplace_qu, foundation, functional, garage_cond, garage_finish, garage_qual, garage_type, heating, heating_qc, house_style, kitchen_qual, land_contour, land_slope, lot_config, lot_shape, mas_vnr_type, misc_feature, ms_zoning, neighborhood, paved_drive, pool_qc, roof_matl, roof_style, sale_condition, sale_type, street, utilities"
#documentation: https://mran.microsoft.com/snapshot/2016-01-12/web/packages/GGally/vignettes/ggpairs.html
a %>% select(nom.vars) %>%
GGally::ggpairs(
columns = c(nom.vars[13], nom.vars[32]),
mapping = aes(color = eval(as.name(nom.vars[13])))
)Description of Masonry veneer type
Description of Exterior Quality Levels
## [1] "bedroom_abv_gr" "bsmt_fin_sf1" "bsmt_fin_sf2" "bsmt_full_bath"
## [5] "bsmt_half_bath" "bsmt_unf_sf" "enclosed_porch" "fireplaces"
## [9] "full_bath" "garage_area" "garage_cars" "garage_yr_blt"
## [13] "gr_liv_area" "half_bath" "kitchen_abv_gr" "lot_area"
## [17] "lot_frontage" "low_qual_fin_sf" "mas_vnr_area" "misc_val"
## [21] "mo_sold" "ms_sub_class" "open_porch_sf" "overall_cond"
## [25] "overall_qual" "pool_area" "sale_price" "screen_porch"
## [29] "tot_rms_abv_grd" "total_bsmt_sf" "wood_deck_sf" "x1st_flr_sf"
## [33] "x2nd_flr_sf" "x3ssn_porch" "year_built" "year_remod_add"
## [37] "yr_sold"
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(num.vars)` instead of `num.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
| Name | Piped data |
| Number of rows | 1460 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| numeric | 37 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| bedroom_abv_gr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 |
| bsmt_fin_sf1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 |
| bsmt_fin_sf2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 |
| bsmt_full_bath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 |
| bsmt_half_bath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 |
| bsmt_unf_sf | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 |
| enclosed_porch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 |
| fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 |
| full_bath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 |
| garage_area | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 |
| garage_cars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 |
| garage_yr_blt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 |
| gr_liv_area | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 |
| half_bath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 |
| kitchen_abv_gr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 |
| lot_area | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 |
| lot_frontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 |
| low_qual_fin_sf | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 |
| mas_vnr_area | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 |
| misc_val | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 |
| mo_sold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 |
| ms_sub_class | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 |
| open_porch_sf | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 |
| overall_cond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 |
| overall_qual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 |
| pool_area | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 |
| sale_price | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 |
| screen_porch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 |
| tot_rms_abv_grd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 |
| total_bsmt_sf | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 |
| wood_deck_sf | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 |
| x1st_flr_sf | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 |
| x2nd_flr_sf | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 |
| x3ssn_porch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 |
| year_built | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 |
| year_remod_add | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 |
| yr_sold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 |
a %>% select(num.vars) %>% DataExplorer::plot_histogram(
scale_x = 'log10',
#geom_histogram_args = list(bins = 50L),
ncol = 2, nrow = 2
)## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
#a %>% select(num.vars) %>% dlookr::plot_correlate()
a %>% select(num.vars) %>% GGally::ggcorr(low = 'red', high = 'darkgreen', label = TRUE)Observations:
#This function is used to analyze data when we need to reduce variable cardinality in predictive modeling.
#works in conjunction with 'cross_plot';
#a %>% funModeling::categ_analysis(input = nom.vars[13], target = 'overall_qual.gte.8.flg')#order reference: https://recipes.tidymodels.org/articles/Ordering.html
library(tidymodels)
rec.en = train %>% recipe(sale_price ~ . ) %>%
step_mutate(sale_price = log10(sale_price), skip = TRUE) %>%
step_rm(pool_qc, misc_feature, alley, fence) %>% #excessive nas
step_bagimpute(
lot_frontage, garage_yr_blt, mas_vnr_area
) %>%
step_unknown(all_nominal()) %>% #assign 'unknown' level to vars with nas
step_corr(all_numeric(),-all_outcomes()) %>%
step_dummy(all_nominal(), one_hot = TRUE) %>% #There are new levels in a factor
step_nzv(all_predictors(),-all_outcomes()) %>%
step_zv(all_predictors(),-all_outcomes()) %>%
step_normalize(all_numeric(),-all_outcomes())
rec.en %>% tidy#----------------------------
rec.rf = train %>% recipe(sale_price ~ . ) %>%
step_mutate(sale_price = log10(sale_price), skip = TRUE) %>%
step_rm(pool_qc, misc_feature, alley, fence) %>% #excessive nas
step_bagimpute(all_numeric(),-all_outcomes()) %>%
step_knnimpute(all_nominal(),-all_outcomes()) %>%
step_corr(all_numeric(),-all_outcomes()) %>%
#step_dummy(all_nominal(), one_hot = TRUE) %>% #There are new levels in a factor
step_nzv(all_predictors(),-all_outcomes()) %>%
step_zv(all_predictors(),-all_outcomes()) %>%
step_normalize(all_numeric(),-all_outcomes())
rec.rf %>% tidymdl.en = parsnip::linear_reg(
penalty = tune(),
mixture = tune() #lasso/ridge mix
) %>%
set_mode('regression') %>%
set_engine('glmnet')
mdl.rf = parsnip::rand_forest(
trees = 150,
min_n = tune(), #min number of observations at terminal node
mtry = tune() #number of vars to randomly subset at each node
) %>%
set_mode('regression') %>%
set_engine('ranger', importance = 'impurity_corrected')doParallel::registerDoParallel() #use parallel processing
set.seed(345)
tg.en = tune_grid(
object = wf.en,
resamples = vfold,
grid = 5 #Create a tuning grid AUTOMATICALLY
)
tg.en %>% collect_metrics()#----------------------------
tg.rf = tune_grid(
object = wf.rf,
resamples = vfold,
grid = 5 #Create a tuning grid AUTOMATICALLY
)
tg.rf$.notes## [[1]]
## # A tibble: 5 x 1
## .notes
## <chr>
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
##
## [[2]]
## # A tibble: 5 x 1
## .notes
## <chr>
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
##
## [[3]]
## # A tibble: 5 x 1
## .notes
## <chr>
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
##
## [[4]]
## # A tibble: 5 x 1
## .notes
## <chr>
## 1 model 1/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 2 model 2/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 3 model 3/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 4 model 4/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
## 5 model 5/5 (predictions): Forest was grown with 'impurity_corrected' variable ~
##
## [[5]]
## # A tibble: 6 x 1
## .notes
## <chr>
## 1 recipe: skipping variable with zero or non-finite range., the standard deviat~
## 2 model 1/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 3 model 2/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 4 model 3/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 5 model 4/5 (predictions): skipping variable with zero or non-finite range., Fo~
## 6 model 5/5 (predictions): skipping variable with zero or non-finite range., Fo~
ggplotly(
tg.en %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
select(mean, penalty, mixture) %>%
pivot_longer(penalty:mixture,
values_to = "value",
names_to = "parameter"
) %>%
ggplot(aes(value, mean, color = parameter)) +
geom_point(show.legend = FALSE, size = 3) +
facet_wrap(~parameter, scales = "free_x") +
labs(title = 'Elastic Net RMSE by Hyperparameter', x = NULL, y = '')
)#----------------------------
ggplotly(
tg.rf %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
select(mean, min_n, mtry) %>%
pivot_longer(min_n:mtry,
values_to = "value",
names_to = "parameter"
) %>%
ggplot(aes(value, mean, color = parameter)) +
geom_point(show.legend = FALSE, size = 3) +
facet_wrap(~parameter, scales = "free_x") +
labs(title = 'Random Forest RMSE by Hyperparameter', x = NULL, y = '')
)wf.en.fin = wf.en %>%
#1) finalize wf (recipe, model w/previously unknown hps) using best hps
finalize_workflow(best.hps.en) %>%
#2) fit on entire train
fit(train)
#----------------------------
wf.rf.fin = wf.rf %>%
#1) finalize wf (recipe, model w/previously unknown hps) using best hps
finalize_workflow(best.hps.rf) %>%
#2) fit on entire train
fit(train)#wf.en.fin %>% pull_workflow_fit() %>% vi
wf.rf.fin %>% pull_workflow_fit() %>% vip(geom = 'point') + labs(title = 'RF var importance')preds.en = wf.en.fin %>% predict(test)
(10 ^ preds.en) %>% plot_ly(x = ~.pred) %>% layout(title = 'Distribution: Elastic Net Sale Price Predictions')## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#histogram
(10 ^ preds.en) %>% plot_ly(x = ~.pred) %>% add_boxplot() %>% layout(title = 'Distribution: Elastic Net Sale Price Predictions')preds.rf = wf.rf.fin %>% predict(test)
(10 ^ preds.rf) %>% plot_ly(x = ~.pred) %>% layout(title = 'Distribution: Random Forest Sale Price Predictions')## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#histogram