Memanggil Packages

Kita melakukan pemanggilan Packages yang akan kita gunakan dalam Data Wrangling ini

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(ggpubr)

Data Import

Kemudian kita melakukan import data yang akan kita kerjakan

# readr is already attached by library(tidyverse) earlier in the script;
# attaching it again is a harmless repeat.
library(readr)
# Import the house-price CSV; read_csv() guesses a type for each column.
# NOTE(review): absolute path into one user's Downloads folder - the script
# will only run on a machine that has the file at this exact location.
house_price1 <- read_csv("C:/Users/sarah/Downloads/house_price1.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the imported table in the data viewer for a quick visual check.
View(house_price1)

Membaca ulang file yang sama dengan read.csv() dan menyimpannya sebagai data_house (kolom teks diubah menjadi factor)

# Read the same CSV again with base read.csv(); stringsAsFactors = TRUE turns
# every text column into a factor. Column names that start with a digit get an
# "X" prefix (e.g. X1stFlrSF), as the glimpse() output shows.
data_house <- read.csv("C:/Users/sarah/Downloads/house_price1.csv", stringsAsFactors = TRUE)
# Print one line per column: its type and the first few values.
glimpse(data_house)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd   <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual      <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond      <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure  <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC     <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd, …
## $ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, N…
## $ MiscFeature   <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, …
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Data Cleaning

Pada tahap ini kita melakukan penghapusan Data

Menghapus Missing Data

Kita pilih terlebih dahulu data yang akan kita hapus

# Summarise every column (missing count, completeness, basic statistics)
# to find the variables with many missing values.
skim_without_charts(data_house)
Data summary
Name data_house
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000
Kolom dengan sangat banyak observasi yang missing, seperti Alley (1369 dari 1460 baris), kita hapus terlebih dahulu; setelah itu baris yang masih mengandung missing value dibuang dengan na.omit().
# Drop the mostly-empty columns first, then remove the rows that still contain
# a missing value. Dropping Alley alone is not enough: PoolQC, MiscFeature,
# Fence and FireplaceQu are missing in roughly 47% to almost 100% of the rows,
# so na.omit() would keep a single observation out of 1460.
dt_res1 <- data_house %>%
  select(-c(Alley, PoolQC, MiscFeature, Fence, FireplaceQu)) %>%
  na.omit()
# Re-run the summary to confirm that no missing values remain.
skim_without_charts(dt_res1)
Data summary
Name dt_res1
Number of rows 1
Number of columns 80
_______________________
Column type frequency:
factor 42
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1 FALSE 1 RL: 1, C (: 0, FV: 0, RH: 0
Street 0 1 FALSE 1 Pav: 1, Grv: 0
LotShape 0 1 FALSE 1 IR1: 1, IR2: 0, IR3: 0, Reg: 0
LandContour 0 1 FALSE 1 Lvl: 1, Bnk: 0, HLS: 0, Low: 0
Utilities 0 1 FALSE 1 All: 1, NoS: 0
LotConfig 0 1 FALSE 1 Ins: 1, Cor: 0, Cul: 0, FR2: 0
LandSlope 0 1 FALSE 1 Gtl: 1, Mod: 0, Sev: 0
Neighborhood 0 1 FALSE 1 NWA: 1, Blm: 0, Blu: 0, BrD: 0
Condition1 0 1 FALSE 1 RRA: 1, Art: 0, Fee: 0, Nor: 0
Condition2 0 1 FALSE 1 Nor: 1, Art: 0, Fee: 0, Pos: 0
BldgType 0 1 FALSE 1 1Fa: 1, 2fm: 0, Dup: 0, Twn: 0
HouseStyle 0 1 FALSE 1 2St: 1, 1.5: 0, 1.5: 0, 1St: 0
RoofStyle 0 1 FALSE 1 Gab: 1, Fla: 0, Gam: 0, Hip: 0
RoofMatl 0 1 FALSE 1 Com: 1, Cly: 0, Mem: 0, Met: 0
Exterior1st 0 1 FALSE 1 Ply: 1, Asb: 0, Asp: 0, Brk: 0
Exterior2nd 0 1 FALSE 1 Ply: 1, Asb: 0, Asp: 0, Brk: 0
MasVnrType 0 1 FALSE 1 Brk: 1, Brk: 0, Non: 0, Sto: 0
ExterQual 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
ExterCond 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
Foundation 0 1 FALSE 1 CBl: 1, Brk: 0, PCo: 0, Sla: 0
BsmtQual 0 1 FALSE 1 Gd: 1, Ex: 0, Fa: 0, TA: 0
BsmtCond 0 1 FALSE 1 TA: 1, Fa: 0, Gd: 0, Po: 0
BsmtExposure 0 1 FALSE 1 No: 1, Av: 0, Gd: 0, Mn: 0
BsmtFinType1 0 1 FALSE 1 BLQ: 1, ALQ: 0, GLQ: 0, LwQ: 0
BsmtFinType2 0 1 FALSE 1 LwQ: 1, ALQ: 0, BLQ: 0, GLQ: 0
Heating 0 1 FALSE 1 Gas: 1, Flo: 0, Gas: 0, Gra: 0
HeatingQC 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
CentralAir 0 1 FALSE 1 Y: 1, N: 0
Electrical 0 1 FALSE 1 SBr: 1, Fus: 0, Fus: 0, Fus: 0
KitchenQual 0 1 FALSE 1 Gd: 1, Ex: 0, Fa: 0, TA: 0
Functional 0 1 FALSE 1 Typ: 1, Maj: 0, Maj: 0, Min: 0
FireplaceQu 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
GarageType 0 1 FALSE 1 Att: 1, 2Ty: 0, Bas: 0, Bui: 0
GarageFinish 0 1 FALSE 1 RFn: 1, Fin: 0, Unf: 0
GarageQual 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
GarageCond 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
PavedDrive 0 1 FALSE 1 Y: 1, N: 0, P: 0
PoolQC 0 1 FALSE 1 Fa: 1, Ex: 0, Gd: 0
Fence 0 1 FALSE 1 MnP: 1, GdP: 0, GdW: 0, MnW: 0
MiscFeature 0 1 FALSE 1 Ten: 1, Gar: 0, Oth: 0, She: 0
SaleType 0 1 FALSE 1 WD: 1, COD: 0, Con: 0, Con: 0
SaleCondition 0 1 FALSE 1 Nor: 1, Abn: 0, Adj: 0, All: 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1 1387 NA 1387 1387 1387 1387 1387
MSSubClass 0 1 60 NA 60 60 60 60 60
LotFrontage 0 1 80 NA 80 80 80 80 80
LotArea 0 1 16692 NA 16692 16692 16692 16692 16692
OverallQual 0 1 7 NA 7 7 7 7 7
OverallCond 0 1 5 NA 5 5 5 5 5
YearBuilt 0 1 1978 NA 1978 1978 1978 1978 1978
YearRemodAdd 0 1 1978 NA 1978 1978 1978 1978 1978
MasVnrArea 0 1 184 NA 184 184 184 184 184
BsmtFinSF1 0 1 790 NA 790 790 790 790 790
BsmtFinSF2 0 1 469 NA 469 469 469 469 469
BsmtUnfSF 0 1 133 NA 133 133 133 133 133
TotalBsmtSF 0 1 1392 NA 1392 1392 1392 1392 1392
X1stFlrSF 0 1 1392 NA 1392 1392 1392 1392 1392
X2ndFlrSF 0 1 1392 NA 1392 1392 1392 1392 1392
LowQualFinSF 0 1 0 NA 0 0 0 0 0
GrLivArea 0 1 2784 NA 2784 2784 2784 2784 2784
BsmtFullBath 0 1 1 NA 1 1 1 1 1
BsmtHalfBath 0 1 0 NA 0 0 0 0 0
FullBath 0 1 3 NA 3 3 3 3 3
HalfBath 0 1 1 NA 1 1 1 1 1
BedroomAbvGr 0 1 5 NA 5 5 5 5 5
KitchenAbvGr 0 1 1 NA 1 1 1 1 1
TotRmsAbvGrd 0 1 12 NA 12 12 12 12 12
Fireplaces 0 1 2 NA 2 2 2 2 2
GarageYrBlt 0 1 1978 NA 1978 1978 1978 1978 1978
GarageCars 0 1 2 NA 2 2 2 2 2
GarageArea 0 1 564 NA 564 564 564 564 564
WoodDeckSF 0 1 0 NA 0 0 0 0 0
OpenPorchSF 0 1 112 NA 112 112 112 112 112
EnclosedPorch 0 1 0 NA 0 0 0 0 0
X3SsnPorch 0 1 0 NA 0 0 0 0 0
ScreenPorch 0 1 440 NA 440 440 440 440 440
PoolArea 0 1 519 NA 519 519 519 519 519
MiscVal 0 1 2000 NA 2000 2000 2000 2000 2000
MoSold 0 1 7 NA 7 7 7 7 7
YrSold 0 1 2006 NA 2006 2006 2006 2006 2006
SalePrice 0 1 250000 NA 250000 250000 250000 250000 250000

Koreksi Kesalahan pada Data

# Summarise the original data again to look for invalid category labels
# before correcting them.
skim_without_charts(data_house)
Data summary
Name data_house
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000

Kemudian kita koreksi kesalahan pada data

# Map the misspelled street labels "Pavr" and "Pavd" to "Pave"; every other
# value passes through unchanged.
dt_res2 <- data_house %>%
  mutate(
    Street = case_when(
      Street %in% c("Pavr", "Pavd") ~ "Pave",
      TRUE ~ Street
    )
  )
# Tabulate the corrected column to check which labels remain.
count(dt_res2, Street)
##   Street    n
## 1   Grvl    6
## 2   Pave 1454

Koreksi Ketidakkonsistenan Data

# Show the recorded garage type of every house whose garage area is zero.
data_house %>%
  select(GarageArea, GarageType) %>%
  filter(GarageArea == 0)
##    GarageArea GarageType
## 1           0       <NA>
## 2           0       <NA>
## 3           0       <NA>
## 4           0       <NA>
## 5           0       <NA>
## 6           0       <NA>
## 7           0       <NA>
## 8           0       <NA>
## 9           0       <NA>
## 10          0       <NA>
## 11          0       <NA>
## 12          0       <NA>
## 13          0       <NA>
## 14          0       <NA>
## 15          0       <NA>
## 16          0       <NA>
## 17          0       <NA>
## 18          0       <NA>
## 19          0       <NA>
## 20          0       <NA>
## 21          0       <NA>
## 22          0       <NA>
## 23          0       <NA>
## 24          0       <NA>
## 25          0       <NA>
## 26          0       <NA>
## 27          0       <NA>
## 28          0       <NA>
## 29          0       <NA>
## 30          0       <NA>
## 31          0       <NA>
## 32          0       <NA>
## 33          0       <NA>
## 34          0       <NA>
## 35          0       <NA>
## 36          0       <NA>
## 37          0       <NA>
## 38          0       <NA>
## 39          0       <NA>
## 40          0       <NA>
## 41          0       <NA>
## 42          0       <NA>
## 43          0       <NA>
## 44          0       <NA>
## 45          0       <NA>
## 46          0       <NA>
## 47          0       <NA>
## 48          0       <NA>
## 49          0       <NA>
## 50          0       <NA>
## 51          0       <NA>
## 52          0       <NA>
## 53          0       <NA>
## 54          0       <NA>
## 55          0       <NA>
## 56          0       <NA>
## 57          0       <NA>
## 58          0       <NA>
## 59          0       <NA>
## 60          0       <NA>
## 61          0       <NA>
## 62          0       <NA>
## 63          0       <NA>
## 64          0       <NA>
## 65          0       <NA>
## 66          0       <NA>
## 67          0       <NA>
## 68          0       <NA>
## 69          0       <NA>
## 70          0       <NA>
## 71          0       <NA>
## 72          0       <NA>
## 73          0       <NA>
## 74          0       <NA>
## 75          0       <NA>
## 76          0       <NA>
## 77          0       <NA>
## 78          0       <NA>
## 79          0       <NA>
## 80          0       <NA>
## 81          0       <NA>
# A house without a garage (GarageArea == 0) must not carry a garage type, so
# set GarageType to NA for those rows and keep it as-is everywhere else.
# Inside mutate() the column is assigned with `=`; writing `==` there would
# only add a stray logical comparison column and leave GarageType untouched.
dt_res3 <- data_house %>%
  mutate(GarageType = case_when(GarageArea == 0 ~ NA, TRUE ~ GarageType))
# Verify the correction on the affected rows.
dt_res3 %>%
  filter(GarageArea == 0) %>%
  select(GarageArea, GarageType)
##    GarageArea GarageType
## 1           0       <NA>
## 2           0       <NA>
## 3           0       <NA>
## 4           0       <NA>
## 5           0       <NA>
## 6           0       <NA>
## 7           0       <NA>
## 8           0       <NA>
## 9           0       <NA>
## 10          0       <NA>
## 11          0       <NA>
## 12          0       <NA>
## 13          0       <NA>
## 14          0       <NA>
## 15          0       <NA>
## 16          0       <NA>
## 17          0       <NA>
## 18          0       <NA>
## 19          0       <NA>
## 20          0       <NA>
## 21          0       <NA>
## 22          0       <NA>
## 23          0       <NA>
## 24          0       <NA>
## 25          0       <NA>
## 26          0       <NA>
## 27          0       <NA>
## 28          0       <NA>
## 29          0       <NA>
## 30          0       <NA>
## 31          0       <NA>
## 32          0       <NA>
## 33          0       <NA>
## 34          0       <NA>
## 35          0       <NA>
## 36          0       <NA>
## 37          0       <NA>
## 38          0       <NA>
## 39          0       <NA>
## 40          0       <NA>
## 41          0       <NA>
## 42          0       <NA>
## 43          0       <NA>
## 44          0       <NA>
## 45          0       <NA>
## 46          0       <NA>
## 47          0       <NA>
## 48          0       <NA>
## 49          0       <NA>
## 50          0       <NA>
## 51          0       <NA>
## 52          0       <NA>
## 53          0       <NA>
## 54          0       <NA>
## 55          0       <NA>
## 56          0       <NA>
## 57          0       <NA>
## 58          0       <NA>
## 59          0       <NA>
## 60          0       <NA>
## 61          0       <NA>
## 62          0       <NA>
## 63          0       <NA>
## 64          0       <NA>
## 65          0       <NA>
## 66          0       <NA>
## 67          0       <NA>
## 68          0       <NA>
## 69          0       <NA>
## 70          0       <NA>
## 71          0       <NA>
## 72          0       <NA>
## 73          0       <NA>
## 74          0       <NA>
## 75          0       <NA>
## 76          0       <NA>
## 77          0       <NA>
## 78          0       <NA>
## 79          0       <NA>
## 80          0       <NA>
## 81          0       <NA>

Data Transformation

# Histogram of the raw sale prices; expand = c(0, 0) removes the padding
# around the y axis so the bars sit directly on the x axis.
gghistogram(
  data = data_house,
  x = "SalePrice",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

# Add the natural logarithm of SalePrice as a new column and plot its
# distribution.
dt_res4 <- data_house %>%
  mutate(log_SalePrice = log(SalePrice))
gghistogram(
  data = dt_res4,
  x = "log_SalePrice",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Data Normalization

# Histogram of the raw sale prices, shown as the baseline before
# normalisation.
gghistogram(
  data = data_house,
  x = "SalePrice",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Menggunakan Transformasi Z

# Standardise SalePrice to z-scores (centred on the mean, divided by the
# standard deviation). scale() returns a one-column matrix with extra
# attributes; as.numeric() strips them so SalePrice_std is stored as an
# ordinary numeric column instead of a matrix column.
dt_res5 <- data_house %>%
  mutate(
    SalePrice_std = as.numeric(scale(SalePrice, center = TRUE, scale = TRUE))
  )
gghistogram(data = dt_res5, x = "SalePrice_std", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Dengan Min-Max Scaling

# Histogram of the raw sale prices, shown as the baseline before min-max
# scaling.
gghistogram(
  data = data_house,
  x = "SalePrice",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

#' Rescale a numeric vector to the [0, 1] interval (min-max scaling).
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when the minimum and maximum
#'   are determined? Defaults to `FALSE`, in which case a single `NA` in `x`
#'   makes the whole result `NA`. Missing entries stay `NA` in the output
#'   either way.
#' @return A numeric vector of the same length as `x`. When all values of `x`
#'   are equal the range is zero and the result is `NaN`.
minMax <- function(x, na.rm = FALSE) {
  # Nothing to rescale; also avoids the min()/max() warnings on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  bounds <- range(x, na.rm = na.rm)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}
# Add the min-max scaled sale price as a new column and plot its
# distribution.
dt_res6 <- data_house %>%
  mutate(SalePrice_mm = minMax(SalePrice))
gghistogram(
  data = dt_res6,
  x = "SalePrice_mm",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.