Memanggil Package

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DataExplorer)
library(skimr)

Import Data

library(readr)
# Load the raw house-price CSV with readr.
house_price1 <- read_csv(
  file = "C:/Users/sarah/Downloads/house_price1.csv"
)
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session; View() is a
# GUI side effect with nothing to show when the document is knitted or run as a script.
if (interactive()) {
  View(house_price1)
}

Membaca ulang data ke dalam objek data_house, dengan kolom bertipe teks diubah menjadi factor

# Re-read the same CSV with base R so that text columns come in as factors,
# then print a compact column-by-column overview.
data_house <- read.csv(
  file = "C:/Users/sarah/Downloads/house_price1.csv",
  stringsAsFactors = TRUE
)
glimpse(data_house)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd   <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual      <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond      <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure  <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC     <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd, …
## $ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, N…
## $ MiscFeature   <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, …
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Memeriksa Gambaran Umum Data

# Overview plot of the raw data; the label text size is reduced to 2.5.
plot_intro(
  data = data_house,
  geom_label_args = list(size = 2.5)
)

Melakukan Data Cleaning

# Per-column summary of the raw data (missing counts, completeness, and basic
# statistics for factor and numeric columns), used to decide what needs cleaning.
skim_without_charts(data = data_house)
Data summary
Name data_house
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000

Menangani Missing Value

Sintaks berikut mengganti missing value pada beberapa peubah bertipe factor dengan level baru "Ukn". Setelah itu, na.omit digunakan untuk menghapus semua baris yang masih mengandung missing value.

# Build the cleaned data set:
#   1. drop the `Id` column;
#   2. in five factors, turn missing values into an explicit "Ukn" level;
#   3. drop every row that still contains a missing value in any column.
data_house1 <- data_house %>%
  select(-Id) %>%
  mutate(
    across(
      c(Alley, FireplaceQu, PoolQC, Fence, MiscFeature),
      # fct_na_value_to_level() replaces fct_explicit_na(), which was
      # deprecated in forcats 1.0.0.
      ~ forcats::fct_na_value_to_level(.x, level = "Ukn")
    )
  ) %>%
  na.omit()

Kemudian kita akan lihat kembali data yang sudah kita tangani missing valuenya

# Overview plot of the cleaned data, to confirm the effect of the missing-value handling.
plot_intro(data = data_house1)

# Per-column summary of the cleaned data (missing counts, completeness, basic statistics).
skim_without_charts(data_house1)
Data summary
Name data_house1
Number of rows 1094
Number of columns 80
_______________________
Column type frequency:
factor 43
numeric 37
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1 FALSE 5 RL: 850, RM: 173, FV: 54, RH: 9
Street 0 1 FALSE 2 Pav: 1090, Grv: 4
Alley 0 1 FALSE 3 Ukn: 1017, Grv: 41, Pav: 36
LotShape 0 1 FALSE 4 Reg: 760, IR1: 301, IR2: 26, IR3: 7
LandContour 0 1 FALSE 4 Lvl: 991, Bnk: 45, HLS: 44, Low: 14
Utilities 0 1 FALSE 1 All: 1094, NoS: 0
LotConfig 0 1 FALSE 5 Ins: 830, Cor: 187, Cul: 44, FR2: 29
LandSlope 0 1 FALSE 3 Gtl: 1045, Mod: 44, Sev: 5
Neighborhood 0 1 FALSE 25 NAm: 173, Col: 122, Old: 96, Som: 75
Condition1 0 1 FALSE 9 Nor: 950, Fee: 52, Art: 42, RRA: 24
Condition2 0 1 FALSE 6 Nor: 1082, Fee: 5, Art: 2, Pos: 2
BldgType 0 1 FALSE 5 1Fa: 925, Twn: 90, Twn: 35, Dup: 24
HouseStyle 0 1 FALSE 8 1St: 540, 2St: 346, 1.5: 117, SLv: 43
RoofStyle 0 1 FALSE 5 Gab: 843, Hip: 230, Gam: 10, Man: 6
RoofMatl 0 1 FALSE 7 Com: 1078, WdS: 6, Tar: 5, WdS: 2
Exterior1st 0 1 FALSE 14 Vin: 421, Met: 172, HdB: 151, Wd : 149
Exterior2nd 0 1 FALSE 16 Vin: 412, Met: 169, Wd : 145, HdB: 138
MasVnrType 0 1 FALSE 4 Non: 639, Brk: 327, Sto: 119, Brk: 9
ExterQual 0 1 FALSE 4 TA: 646, Gd: 395, Ex: 46, Fa: 7
ExterCond 0 1 FALSE 4 TA: 973, Gd: 104, Fa: 15, Ex: 2
Foundation 0 1 FALSE 5 PCo: 518, CBl: 446, Brk: 122, Sto: 6
BsmtQual 0 1 FALSE 4 TA: 486, Gd: 463, Ex: 113, Fa: 32
BsmtCond 0 1 FALSE 4 TA: 1006, Gd: 51, Fa: 36, Po: 1
BsmtExposure 0 1 FALSE 4 No: 734, Av: 174, Gd: 97, Mn: 89
BsmtFinType1 0 1 FALSE 6 Unf: 343, GLQ: 323, ALQ: 162, BLQ: 105
BsmtFinType2 0 1 FALSE 6 Unf: 972, Rec: 37, LwQ: 35, BLQ: 25
Heating 0 1 FALSE 4 Gas: 1075, Gas: 16, Gra: 2, Oth: 1
HeatingQC 0 1 FALSE 5 Ex: 594, TA: 298, Gd: 174, Fa: 27
CentralAir 0 1 FALSE 2 Y: 1036, N: 58
Electrical 0 1 FALSE 5 SBr: 1009, Fus: 67, Fus: 15, Fus: 2
KitchenQual 0 1 FALSE 4 TA: 528, Gd: 454, Ex: 91, Fa: 21
Functional 0 1 FALSE 6 Typ: 1024, Min: 25, Min: 21, Maj: 10
FireplaceQu 0 1 FALSE 6 Ukn: 511, Gd: 315, TA: 212, Fa: 24
GarageType 0 1 FALSE 6 Att: 680, Det: 325, Bui: 63, Bas: 15
GarageFinish 0 1 FALSE 3 Unf: 485, RFn: 333, Fin: 276
GarageQual 0 1 FALSE 5 TA: 1031, Fa: 46, Gd: 11, Ex: 3
GarageCond 0 1 FALSE 5 TA: 1050, Fa: 31, Po: 6, Gd: 5
PavedDrive 0 1 FALSE 3 Y: 1023, N: 48, P: 23
PoolQC 0 1 FALSE 4 Ukn: 1088, Ex: 2, Fa: 2, Gd: 2
Fence 0 1 FALSE 5 Ukn: 882, MnP: 117, GdP: 46, GdW: 39
MiscFeature 0 1 FALSE 4 Ukn: 1059, She: 33, Oth: 1, Ten: 1
SaleType 0 1 FALSE 9 WD: 928, New: 116, COD: 31, Con: 5
SaleCondition 0 1 FALSE 6 Nor: 880, Par: 119, Abn: 70, Fam: 18

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
MSSubClass 0 1 56.13 41.98 20 20.00 50.0 70.00 190
LotFrontage 0 1 70.76 24.51 21 60.00 70.0 80.00 313
LotArea 0 1 10132.35 8212.25 1300 7606.75 9444.5 11387.25 215245
OverallQual 0 1 6.25 1.37 2 5.00 6.0 7.00 10
OverallCond 0 1 5.58 1.07 2 5.00 5.0 6.00 9
YearBuilt 0 1 1972.41 31.19 1880 1953.00 1975.0 2003.00 2010
YearRemodAdd 0 1 1985.92 20.93 1950 1967.00 1995.0 2005.00 2010
MasVnrArea 0 1 109.86 190.67 0 0.00 0.0 171.75 1600
BsmtFinSF1 0 1 448.19 468.73 0 0.00 384.5 712.75 5644
BsmtFinSF2 0 1 45.25 159.08 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1 606.12 445.83 0 270.00 525.0 846.00 2336
TotalBsmtSF 0 1 1099.56 415.85 105 816.00 1023.0 1345.50 6110
X1stFlrSF 0 1 1173.81 387.68 438 894.00 1097.0 1413.50 4692
X2ndFlrSF 0 1 356.54 439.26 0 0.00 0.0 729.00 2065
LowQualFinSF 0 1 4.68 42.10 0 0.00 0.0 0.00 572
GrLivArea 0 1 1535.03 526.12 438 1164.00 1480.0 1779.00 5642
BsmtFullBath 0 1 0.42 0.51 0 0.00 0.0 1.00 2
BsmtHalfBath 0 1 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1 1.58 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1 0.39 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1 2.86 0.76 0 2.00 3.0 3.00 6
KitchenAbvGr 0 1 1.03 0.19 1 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1 6.57 1.58 3 5.00 6.0 7.00 12
Fireplaces 0 1 0.61 0.63 0 0.00 1.0 1.00 3
GarageYrBlt 0 1 1978.57 25.93 1900 1960.00 1982.0 2003.00 2010
GarageCars 0 1 1.88 0.66 1 1.00 2.0 2.00 4
GarageArea 0 1 503.76 192.26 160 360.00 484.0 602.50 1418
WoodDeckSF 0 1 94.34 122.62 0 0.00 0.0 169.75 857
OpenPorchSF 0 1 46.95 64.82 0 0.00 28.0 68.00 547
EnclosedPorch 0 1 22.05 61.57 0 0.00 0.0 0.00 552
X3SsnPorch 0 1 3.27 29.66 0 0.00 0.0 0.00 508
ScreenPorch 0 1 16.50 58.46 0 0.00 0.0 0.00 480
PoolArea 0 1 3.01 40.71 0 0.00 0.0 0.00 648
MiscVal 0 1 23.55 167.14 0 0.00 0.0 0.00 2500
MoSold 0 1 6.34 2.69 1 5.00 6.0 8.00 12
YrSold 0 1 2007.79 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1 187033.26 83165.33 35311 132500.00 165750.0 221000.00 755000

Ada kolom yang hanya memiliki satu kategori saja, yaitu kolom Utilities, sehingga kita perlu menghapusnya.

# Remove `Utilities`: only one of its categories is left after cleaning,
# so it carries no information.
data_house1 <- select(data_house1, -Utilities)

Memeriksa Sebaran Data

# Histograms for the cleaned data, laid out on 3 x 3 pages.
plot_histogram(
  data = data_house1,
  nrow = 3,
  ncol = 3,
  geom_histogram_args = list(fill = "steelblue"),
  ggtheme = theme_bw()
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Menggunakan PlotBar

# Bar charts for the cleaned data, using one row of panels per page.
plot_bar(
  data = data_house1,
  ggtheme = theme_bw(),
  nrow = 1
)

Memeriksa Korelasi Peubah

Kita memeriksa korelasi peubah dengan scatterplot

# Scatterplots of every numeric column against SalePrice.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
plot_scatterplot(
  data = data_house1 %>% select(where(is.numeric)),
  by = "SalePrice",
  geom_point_args = list(color = "steelblue"),
  ggtheme = theme_bw()
)

# Spearman correlation matrix of all numeric columns.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
cor_mat <- cor(
  data_house1 %>% select(where(is.numeric)),
  method = "spearman"
)

# Blank out the upper triangle and the diagonal so each pair of variables
# appears only once and self-correlations are excluded.
cor_mat[upper.tri(cor_mat, diag = TRUE)] <- NA

# Reshape to long form (Var1, Var2, corr) and drop the blanked-out entries.
cor_df <- cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "corr") %>%
  na.omit()

# Variable pairs with |correlation| above 0.6, strongest first.
cor_df %>%
  filter(abs(corr) > 0.6) %>%
  arrange(desc(abs(corr)))
## # A tibble: 31 × 3
##    Var1         Var2          corr
##    <chr>        <chr>        <dbl>
##  1 GarageYrBlt  YearBuilt    0.895
##  2 X1stFlrSF    TotalBsmtSF  0.877
##  3 GarageArea   GarageCars   0.841
##  4 TotRmsAbvGrd GrLivArea    0.829
##  5 SalePrice    OverallQual  0.823
##  6 GarageYrBlt  YearRemodAdd 0.747
##  7 YearRemodAdd YearBuilt    0.738
##  8 SalePrice    GrLivArea    0.731
##  9 SalePrice    GarageCars   0.681
## 10 SalePrice    FullBath     0.671
## # … with 21 more rows
# Variable pairs with |correlation| of at most 0.6.
filter(cor_df, abs(corr) <= 0.6)
## # A tibble: 635 × 3
##    Var1        Var2            corr
##    <chr>       <chr>          <dbl>
##  1 LotFrontage MSSubClass  -0.313  
##  2 LotArea     MSSubClass  -0.255  
##  3 OverallQual MSSubClass   0.0992 
##  4 OverallQual LotFrontage  0.238  
##  5 OverallQual LotArea      0.283  
##  6 OverallCond MSSubClass  -0.0763 
##  7 OverallCond LotFrontage -0.0693 
##  8 OverallCond LotArea     -0.0873 
##  9 OverallCond OverallQual -0.264  
## 10 YearBuilt   MSSubClass  -0.00468
## # … with 625 more rows
# Names of all factor columns, followed by the response column SalePrice.
cat_var_names <- names(
  select(data_house1, where(is.factor), SalePrice)
)
cat_var_names
##  [1] "MSZoning"      "Street"        "Alley"         "LotShape"     
##  [5] "LandContour"   "LotConfig"     "LandSlope"     "Neighborhood" 
##  [9] "Condition1"    "Condition2"    "BldgType"      "HouseStyle"   
## [13] "RoofStyle"     "RoofMatl"      "Exterior1st"   "Exterior2nd"  
## [17] "MasVnrType"    "ExterQual"     "ExterCond"     "Foundation"   
## [21] "BsmtQual"      "BsmtCond"      "BsmtExposure"  "BsmtFinType1" 
## [25] "BsmtFinType2"  "Heating"       "HeatingQC"     "CentralAir"   
## [29] "Electrical"    "KitchenQual"   "Functional"    "FireplaceQu"  
## [33] "GarageType"    "GarageFinish"  "GarageQual"    "GarageCond"   
## [37] "PavedDrive"    "PoolQC"        "Fence"         "MiscFeature"  
## [41] "SaleType"      "SaleCondition" "SalePrice"
# Boxplots of SalePrice broken down by each categorical variable in turn.
# The factor-plus-SalePrice subset does not change between iterations,
# so it is built once before the loop.
boxplot_data <- data_house1 %>%
  select(where(is.factor), SalePrice)

# Exclude the response by name rather than by position, so the loop stays
# correct if the number of factor columns changes.
for (cat_var in setdiff(cat_var_names, "SalePrice")) {
  plot_boxplot(
    data = boxplot_data,
    by = cat_var,
    geom_boxplot_args = list(fill = "steelblue"),
    ggtheme = theme_bw()
  )
}