DATA - WRANGLING

Install Package

# Install only the packages that are not available yet. Calling
# install.packages() unconditionally re-downloads everything on each run and
# stops with an error when no CRAN mirror is configured, even if the packages
# are already present.
required_pkgs <- c("tidyverse", "skimr", "ggpubr")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(ggpubr)

Data Import

# Load the house-price CSV into data_house, reading text columns as factors,
# then print one line per column with its type and first values.
data_house <- read.csv(
  file = "C:/Users/afwa/Downloads/house_price1.csv",
  stringsAsFactors = TRUE
)
glimpse(data_house)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd   <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual      <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond      <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure  <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC     <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd, …
## $ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, N…
## $ MiscFeature   <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, …
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Didapatkan tampilan informasi singkat mengenai dataset house_price1 yang dimasukkan ke dalam variabel data_house.

Data Cleaning

# Print a per-column summary of data_house (missing-value counts, completeness
# rate, and distribution statistics) to decide what needs cleaning.
skim_without_charts(data_house)
Data summary
Name data_house
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000
# Remove the Alley column, then drop every row that still contains an NA, and
# summarise what is left.
# NOTE(review): the summary printed after this step reports a single remaining
# row, so na.omit() discards almost the whole data set here.
dt_res1 <- na.omit(select(data_house, -Alley))
skim_without_charts(dt_res1)
Data summary
Name dt_res1
Number of rows 1
Number of columns 80
_______________________
Column type frequency:
factor 42
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1 FALSE 1 RL: 1, C (: 0, FV: 0, RH: 0
Street 0 1 FALSE 1 Pav: 1, Grv: 0
LotShape 0 1 FALSE 1 IR1: 1, IR2: 0, IR3: 0, Reg: 0
LandContour 0 1 FALSE 1 Lvl: 1, Bnk: 0, HLS: 0, Low: 0
Utilities 0 1 FALSE 1 All: 1, NoS: 0
LotConfig 0 1 FALSE 1 Ins: 1, Cor: 0, Cul: 0, FR2: 0
LandSlope 0 1 FALSE 1 Gtl: 1, Mod: 0, Sev: 0
Neighborhood 0 1 FALSE 1 NWA: 1, Blm: 0, Blu: 0, BrD: 0
Condition1 0 1 FALSE 1 RRA: 1, Art: 0, Fee: 0, Nor: 0
Condition2 0 1 FALSE 1 Nor: 1, Art: 0, Fee: 0, Pos: 0
BldgType 0 1 FALSE 1 1Fa: 1, 2fm: 0, Dup: 0, Twn: 0
HouseStyle 0 1 FALSE 1 2St: 1, 1.5: 0, 1.5: 0, 1St: 0
RoofStyle 0 1 FALSE 1 Gab: 1, Fla: 0, Gam: 0, Hip: 0
RoofMatl 0 1 FALSE 1 Com: 1, Cly: 0, Mem: 0, Met: 0
Exterior1st 0 1 FALSE 1 Ply: 1, Asb: 0, Asp: 0, Brk: 0
Exterior2nd 0 1 FALSE 1 Ply: 1, Asb: 0, Asp: 0, Brk: 0
MasVnrType 0 1 FALSE 1 Brk: 1, Brk: 0, Non: 0, Sto: 0
ExterQual 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
ExterCond 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
Foundation 0 1 FALSE 1 CBl: 1, Brk: 0, PCo: 0, Sla: 0
BsmtQual 0 1 FALSE 1 Gd: 1, Ex: 0, Fa: 0, TA: 0
BsmtCond 0 1 FALSE 1 TA: 1, Fa: 0, Gd: 0, Po: 0
BsmtExposure 0 1 FALSE 1 No: 1, Av: 0, Gd: 0, Mn: 0
BsmtFinType1 0 1 FALSE 1 BLQ: 1, ALQ: 0, GLQ: 0, LwQ: 0
BsmtFinType2 0 1 FALSE 1 LwQ: 1, ALQ: 0, BLQ: 0, GLQ: 0
Heating 0 1 FALSE 1 Gas: 1, Flo: 0, Gas: 0, Gra: 0
HeatingQC 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
CentralAir 0 1 FALSE 1 Y: 1, N: 0
Electrical 0 1 FALSE 1 SBr: 1, Fus: 0, Fus: 0, Fus: 0
KitchenQual 0 1 FALSE 1 Gd: 1, Ex: 0, Fa: 0, TA: 0
Functional 0 1 FALSE 1 Typ: 1, Maj: 0, Maj: 0, Min: 0
FireplaceQu 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
GarageType 0 1 FALSE 1 Att: 1, 2Ty: 0, Bas: 0, Bui: 0
GarageFinish 0 1 FALSE 1 RFn: 1, Fin: 0, Unf: 0
GarageQual 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
GarageCond 0 1 FALSE 1 TA: 1, Ex: 0, Fa: 0, Gd: 0
PavedDrive 0 1 FALSE 1 Y: 1, N: 0, P: 0
PoolQC 0 1 FALSE 1 Fa: 1, Ex: 0, Gd: 0
Fence 0 1 FALSE 1 MnP: 1, GdP: 0, GdW: 0, MnW: 0
MiscFeature 0 1 FALSE 1 Ten: 1, Gar: 0, Oth: 0, She: 0
SaleType 0 1 FALSE 1 WD: 1, COD: 0, Con: 0, Con: 0
SaleCondition 0 1 FALSE 1 Nor: 1, Abn: 0, Adj: 0, All: 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1 1387 NA 1387 1387 1387 1387 1387
MSSubClass 0 1 60 NA 60 60 60 60 60
LotFrontage 0 1 80 NA 80 80 80 80 80
LotArea 0 1 16692 NA 16692 16692 16692 16692 16692
OverallQual 0 1 7 NA 7 7 7 7 7
OverallCond 0 1 5 NA 5 5 5 5 5
YearBuilt 0 1 1978 NA 1978 1978 1978 1978 1978
YearRemodAdd 0 1 1978 NA 1978 1978 1978 1978 1978
MasVnrArea 0 1 184 NA 184 184 184 184 184
BsmtFinSF1 0 1 790 NA 790 790 790 790 790
BsmtFinSF2 0 1 469 NA 469 469 469 469 469
BsmtUnfSF 0 1 133 NA 133 133 133 133 133
TotalBsmtSF 0 1 1392 NA 1392 1392 1392 1392 1392
X1stFlrSF 0 1 1392 NA 1392 1392 1392 1392 1392
X2ndFlrSF 0 1 1392 NA 1392 1392 1392 1392 1392
LowQualFinSF 0 1 0 NA 0 0 0 0 0
GrLivArea 0 1 2784 NA 2784 2784 2784 2784 2784
BsmtFullBath 0 1 1 NA 1 1 1 1 1
BsmtHalfBath 0 1 0 NA 0 0 0 0 0
FullBath 0 1 3 NA 3 3 3 3 3
HalfBath 0 1 1 NA 1 1 1 1 1
BedroomAbvGr 0 1 5 NA 5 5 5 5 5
KitchenAbvGr 0 1 1 NA 1 1 1 1 1
TotRmsAbvGrd 0 1 12 NA 12 12 12 12 12
Fireplaces 0 1 2 NA 2 2 2 2 2
GarageYrBlt 0 1 1978 NA 1978 1978 1978 1978 1978
GarageCars 0 1 2 NA 2 2 2 2 2
GarageArea 0 1 564 NA 564 564 564 564 564
WoodDeckSF 0 1 0 NA 0 0 0 0 0
OpenPorchSF 0 1 112 NA 112 112 112 112 112
EnclosedPorch 0 1 0 NA 0 0 0 0 0
X3SsnPorch 0 1 0 NA 0 0 0 0 0
ScreenPorch 0 1 440 NA 440 440 440 440 440
PoolArea 0 1 519 NA 519 519 519 519 519
MiscVal 0 1 2000 NA 2000 2000 2000 2000 2000
MoSold 0 1 7 NA 7 7 7 7 7
YrSold 0 1 2006 NA 2006 2006 2006 2006 2006
SalePrice 0 1 250000 NA 250000 250000 250000 250000 250000

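Dilakukan penghapusan data. Pertama, melalui skim_without_charts didapatkan ringkasan setiap variabel (faktor dan numerik) tanpa grafik dari variabel “data_house”, termasuk jumlah missing value pada tiap kolom. Selanjutnya dilakukan penghapusan kolom Alley dan penghapusan semua baris yang mengandung missing value, lalu didapatkan informasi seperti jumlah kolom dan baris serta frekuensi dari data tersebut tanpa grafik melalui skim_without_charts dari variabel “dt_res1”. Perlu dicatat bahwa hasilnya hanya menyisakan 1 baris dari 1.460 baris, karena kolom lain seperti PoolQC (1.453 NA), MiscFeature (1.406 NA), dan Fence (1.179 NA) juga hampir seluruhnya kosong sehingga hampir setiap baris memiliki setidaknya satu missing value.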

Koreksi Kesalahan pada Data

# Print the per-column summary of the unmodified data_house again, as the
# starting point for correcting value errors.
skim_without_charts(data_house)
Data summary
Name data_house
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000
# Recode the misspelled Street labels "Pavr" and "Pavd" to "Pave"; every other
# value is kept as is. Then tabulate how often each Street value occurs.
dt_res2 <- data_house %>%
  mutate(
    Street = case_when(
      Street %in% c("Pavr", "Pavd") ~ "Pave",
      .default = Street
    )
  )
count(dt_res2, Street)
##   Street    n
## 1   Grvl    6
## 2   Pave 1454

Dilakukan koreksi kesalahan yang dimulai dengan mengganti “Pavr” dan “Pavd” menjadi “Pave” menggunakan fungsi case_when. Kemudian, menggunakan fungsi count untuk menghitung jumlah masing-masing nilai pada kolom “Street” yang telah diubah.

Koreksi Ketidakkonsistenan Data

# List garage area and garage type for the rows whose garage area is zero.
data_house %>%
  select(GarageArea, GarageType) %>%
  filter(GarageArea == 0)
##    GarageArea GarageType
## 1           0       <NA>
## 2           0       <NA>
## 3           0       <NA>
## 4           0       <NA>
## 5           0       <NA>
## 6           0       <NA>
## 7           0       <NA>
## 8           0       <NA>
## 9           0       <NA>
## 10          0       <NA>
## 11          0       <NA>
## 12          0       <NA>
## 13          0       <NA>
## 14          0       <NA>
## 15          0       <NA>
## 16          0       <NA>
## 17          0       <NA>
## 18          0       <NA>
## 19          0       <NA>
## 20          0       <NA>
## 21          0       <NA>
## 22          0       <NA>
## 23          0       <NA>
## 24          0       <NA>
## 25          0       <NA>
## 26          0       <NA>
## 27          0       <NA>
## 28          0       <NA>
## 29          0       <NA>
## 30          0       <NA>
## 31          0       <NA>
## 32          0       <NA>
## 33          0       <NA>
## 34          0       <NA>
## 35          0       <NA>
## 36          0       <NA>
## 37          0       <NA>
## 38          0       <NA>
## 39          0       <NA>
## 40          0       <NA>
## 41          0       <NA>
## 42          0       <NA>
## 43          0       <NA>
## 44          0       <NA>
## 45          0       <NA>
## 46          0       <NA>
## 47          0       <NA>
## 48          0       <NA>
## 49          0       <NA>
## 50          0       <NA>
## 51          0       <NA>
## 52          0       <NA>
## 53          0       <NA>
## 54          0       <NA>
## 55          0       <NA>
## 56          0       <NA>
## 57          0       <NA>
## 58          0       <NA>
## 59          0       <NA>
## 60          0       <NA>
## 61          0       <NA>
## 62          0       <NA>
## 63          0       <NA>
## 64          0       <NA>
## 65          0       <NA>
## 66          0       <NA>
## 67          0       <NA>
## 68          0       <NA>
## 69          0       <NA>
## 70          0       <NA>
## 71          0       <NA>
## 72          0       <NA>
## 73          0       <NA>
## 74          0       <NA>
## 75          0       <NA>
## 76          0       <NA>
## 77          0       <NA>
## 78          0       <NA>
## 79          0       <NA>
## 80          0       <NA>
## 81          0       <NA>
# Force GarageType to NA wherever GarageArea is 0 and keep it unchanged
# elsewhere, then list the zero-area rows again to check the result.
dt_res3 <- data_house %>%
  mutate(
    GarageType = case_when(
      GarageArea == 0 ~ NA,
      .default = GarageType
    )
  )
dt_res3 %>%
  select(GarageArea, GarageType) %>%
  filter(GarageArea == 0)
##    GarageArea GarageType
## 1           0       <NA>
## 2           0       <NA>
## 3           0       <NA>
## 4           0       <NA>
## 5           0       <NA>
## 6           0       <NA>
## 7           0       <NA>
## 8           0       <NA>
## 9           0       <NA>
## 10          0       <NA>
## 11          0       <NA>
## 12          0       <NA>
## 13          0       <NA>
## 14          0       <NA>
## 15          0       <NA>
## 16          0       <NA>
## 17          0       <NA>
## 18          0       <NA>
## 19          0       <NA>
## 20          0       <NA>
## 21          0       <NA>
## 22          0       <NA>
## 23          0       <NA>
## 24          0       <NA>
## 25          0       <NA>
## 26          0       <NA>
## 27          0       <NA>
## 28          0       <NA>
## 29          0       <NA>
## 30          0       <NA>
## 31          0       <NA>
## 32          0       <NA>
## 33          0       <NA>
## 34          0       <NA>
## 35          0       <NA>
## 36          0       <NA>
## 37          0       <NA>
## 38          0       <NA>
## 39          0       <NA>
## 40          0       <NA>
## 41          0       <NA>
## 42          0       <NA>
## 43          0       <NA>
## 44          0       <NA>
## 45          0       <NA>
## 46          0       <NA>
## 47          0       <NA>
## 48          0       <NA>
## 49          0       <NA>
## 50          0       <NA>
## 51          0       <NA>
## 52          0       <NA>
## 53          0       <NA>
## 54          0       <NA>
## 55          0       <NA>
## 56          0       <NA>
## 57          0       <NA>
## 58          0       <NA>
## 59          0       <NA>
## 60          0       <NA>
## 61          0       <NA>
## 62          0       <NA>
## 63          0       <NA>
## 64          0       <NA>
## 65          0       <NA>
## 66          0       <NA>
## 67          0       <NA>
## 68          0       <NA>
## 69          0       <NA>
## 70          0       <NA>
## 71          0       <NA>
## 72          0       <NA>
## 73          0       <NA>
## 74          0       <NA>
## 75          0       <NA>
## 76          0       <NA>
## 77          0       <NA>
## 78          0       <NA>
## 79          0       <NA>
## 80          0       <NA>
## 81          0       <NA>

Dilakukan filter pada data_house untuk memeriksa ketidakkonsistenan data, yaitu dengan memilih baris yang nilai GarageArea-nya 0 dan hanya menampilkan variabel GarageArea dan GarageType. Lalu case_when digunakan untuk mengubah nilai GarageType menjadi NA apabila GarageArea = 0, sedangkan nilai pada baris lainnya tidak diubah. Kemudian hasilnya dimasukkan ke dalam variabel dt_res3.

Data Transformation

# Histogram of the raw SalePrice values; expand = c(0, 0) removes the padding
# on the y axis so the bars sit directly on the x axis.
gghistogram(data_house, x = "SalePrice", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

# Add the natural logarithm of SalePrice as log_SalePrice and plot its
# histogram.
dt_res4 <- mutate(data_house, log_SalePrice = log(SalePrice))

gghistogram(dt_res4, x = "log_SalePrice", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Dilakukan transformasi data untuk melihat distribusi harga rumah dalam dataset dengan melihat histogram dari data = data_house. Selanjutnya dilakukan transformasi logaritma ke log_SalePrice yang memungkinkan kita untuk mengurangi efek dari outlier dan membuat distribusi data lebih mendekati normal sehingga lebih mudah dianalisis. Histogram pertama memiliki skewness positif yang cukup tinggi, yang berarti distribusinya memiliki ekor panjang di sisi kanan: sebagian besar rumah berharga di bawah rata-rata, sedangkan sejumlah kecil rumah yang sangat mahal menarik rata-rata (180.921) ke atas median (163.000). Histogram kedua digunakan untuk melihat distribusi logaritma harga rumah dengan data = dt_res4 dan hasilnya memiliki skewness yang lebih rendah dan menunjukkan distribusi yang lebih simetris.

Data Normalization

# Histogram of the raw SalePrice values, shown as the reference before
# normalization; expand = c(0, 0) removes the padding on the y axis.
gghistogram(data_house, x = "SalePrice", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Pada bagian normalisasi data, dihasilkan histogram dari variabel “SalePrice” dalam dataset “data_house”.

#Dengan menggunakan Transformasi Z

# Standardise SalePrice to z-scores (subtract the mean, divide by the standard
# deviation) and plot the histogram of the result.
# scale() returns a one-column matrix carrying centering/scaling attributes;
# as.numeric() stores it as a plain numeric column instead of a matrix column.
dt_res5 <- data_house %>%
  mutate(
    SalePrice_std = as.numeric(scale(SalePrice, center = TRUE, scale = TRUE))
  )

gghistogram(data = dt_res5, x = "SalePrice_std", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Dilakukan normalisasi data dengan menggunakan Transformasi Z pada variabel “SalePrice” dalam dataset “data_house”. Pertama, digunakan fungsi “mutate” untuk menambahkan kolom baru dengan nama “SalePrice_std”. Pada kolom tersebut, nilai SalePrice diubah menjadi nilai Z-Score, yaitu setiap nilai dikurangi nilai rata-rata variabel lalu hasilnya dibagi dengan standar deviasi variabel tersebut: $z = (x - \bar{x}) / s$. Perhitungan ini dilakukan oleh fungsi “scale”, yang melakukan transformasi Z-Score Scaling. Kemudian dibuat histogram dari variabel “SalePrice_std” yang telah dinormalisasi.

# Using min-max scaling
# Reference histogram of the original SalePrice before rescaling.
gghistogram(
  data = data_house,
  x = "SalePrice",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

# Rescale a numeric vector linearly onto [0, 1] (min-max scaling).
#
# x      numeric vector to rescale.
# na.rm  if TRUE, missing values are ignored when finding the range and stay
#        NA in the result; the default FALSE keeps the original behaviour
#        (any NA in x makes the whole result NA).
#
# Returns a numeric vector of the same length as x. When all non-missing
# values are identical the range is zero and the ratio would be 0/0 = NaN,
# so zeros are returned for those entries instead.
minMax <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  if (!is.na(span) && span == 0) {
    # x - lo is 0 for every non-missing value and NA for missing ones.
    return(x - lo)
  }
  (x - lo) / span
}

# Add SalePrice_mm, the result of applying minMax() to SalePrice.
dt_res6 <- data_house %>%
  mutate(SalePrice_mm = minMax(SalePrice))

# Histogram of the min-max scaled sale price (SalePrice_mm), with the y-axis
# expansion set to zero.
gghistogram(
  data = dt_res6,
  x = "SalePrice_mm",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

Dilakukan normalisasi data dengan menggunakan Min-Max Scaling. Yang pertama dihasilkan histogram dari variabel “SalePrice”. Selanjutnya fungsi “minMax” didefinisikan untuk menghitung nilai Min-Max Scaling dari setiap nilai dalam variabel. Setelah itu, fungsi “mutate” digunakan untuk membuat variabel baru “SalePrice_mm” yang merupakan hasil Min-Max Scaling dari variabel “SalePrice”. Terakhir, menggunakan variabel “SalePrice_mm” dibuatkan sebuah histogram yang menunjukkan bahwa rentang nilai SalePrice_mm adalah 0 hingga 1, dengan distribusi yang sama seperti SalePrice.

VISUALISASI DATA DENGAN EKSPLORASI DATA ANALISIS

Install Package

install.packages("tidyverse")
install.packages("DataExplorer")
install.packages("skimr")
library(tidyverse)
library(DataExplorer)
library(skimr)

Data Import

# Load the house-price CSV (character columns are read as factors), then
# print a compact column-by-column overview.
data_house <- read.csv(
  file = "C:/Users/afwa/Downloads/house_price1.csv",
  stringsAsFactors = TRUE
)
glimpse(data_house)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd   <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual      <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond      <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure  <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC     <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd, …
## $ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, N…
## $ MiscFeature   <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, …
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Didapatkan tampilan informasi singkat mengenai dataset house_price1 yang dimasukkan ke dalam variabel data_house.

Memeriksa Gambaran Umum Data

# Overview plot of column types and missingness; size = 2.5 is forwarded to
# the label geometry.
plot_intro(
  data = data_house,
  geom_label_args = list(size = 2.5)
)

# Per-column text summary without the inline charts.
skim_without_charts(data_house)
Data summary
Name data_house
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000

Fungsi plot_intro digunakan untuk memeriksa gambaran umum data pada variabel “data_house”. Hasilnya dapat diketahui bahwa terdapat 53.1% kolom diskrit dan 46.9% kolom kontinu, serta terdapat 5.9% observasi yang hilang/kosong. Selanjutnya melalui skim_without_charts didapatkan ringkasan data (peubah faktor maupun numerik) tanpa grafik dari variabel “data_house”.

Menangani Missing Value

# Build the cleaned dataset:
#   1. drop the Id column;
#   2. in the five factor columns listed below, turn missing values into an
#      explicit "Ukn" level (fct_na_value_to_level() replaces
#      fct_explicit_na(), which is deprecated since forcats 1.0.0);
#   3. remove every row that still contains a missing value.
data_house1 <- data_house %>%
  select(-Id) %>%
  mutate(
    across(
      c(Alley, FireplaceQu, PoolQC, Fence, MiscFeature),
      function(col) forcats::fct_na_value_to_level(col, level = "Ukn")
    )
  ) %>%
  na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Alley = forcats::fct_explicit_na(Alley, na_level = "Ukn")`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.

Di atas merupakan sintaks untuk mengganti (replace) missing value, khususnya jika data berupa factor atau string. Kemudian na.omit berfungsi untuk menghapus semua baris yang masih mengandung missing value.

# Overview plot of column types and missingness for the cleaned data.
plot_intro(data_house1)

Kemudian fungsi plot_intro digunakan untuk memeriksa gambaran umum data pada variabel “data_house1”. Hasilnya dapat diketahui bahwa terdapat 53.8% kolom diskrit dan 46.2% kolom kontinu, serta terdapat 0% kolom yang hilang/kosong, yang berarti seluruh data sudah lengkap.

# Per-column text summary of the cleaned data, without the inline charts.
skim_without_charts(data = data_house1)
Data summary
Name data_house1
Number of rows 1094
Number of columns 80
_______________________
Column type frequency:
factor 43
numeric 37
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1 FALSE 5 RL: 850, RM: 173, FV: 54, RH: 9
Street 0 1 FALSE 2 Pav: 1090, Grv: 4
Alley 0 1 FALSE 3 Ukn: 1017, Grv: 41, Pav: 36
LotShape 0 1 FALSE 4 Reg: 760, IR1: 301, IR2: 26, IR3: 7
LandContour 0 1 FALSE 4 Lvl: 991, Bnk: 45, HLS: 44, Low: 14
Utilities 0 1 FALSE 1 All: 1094, NoS: 0
LotConfig 0 1 FALSE 5 Ins: 830, Cor: 187, Cul: 44, FR2: 29
LandSlope 0 1 FALSE 3 Gtl: 1045, Mod: 44, Sev: 5
Neighborhood 0 1 FALSE 25 NAm: 173, Col: 122, Old: 96, Som: 75
Condition1 0 1 FALSE 9 Nor: 950, Fee: 52, Art: 42, RRA: 24
Condition2 0 1 FALSE 6 Nor: 1082, Fee: 5, Art: 2, Pos: 2
BldgType 0 1 FALSE 5 1Fa: 925, Twn: 90, Twn: 35, Dup: 24
HouseStyle 0 1 FALSE 8 1St: 540, 2St: 346, 1.5: 117, SLv: 43
RoofStyle 0 1 FALSE 5 Gab: 843, Hip: 230, Gam: 10, Man: 6
RoofMatl 0 1 FALSE 7 Com: 1078, WdS: 6, Tar: 5, WdS: 2
Exterior1st 0 1 FALSE 14 Vin: 421, Met: 172, HdB: 151, Wd : 149
Exterior2nd 0 1 FALSE 16 Vin: 412, Met: 169, Wd : 145, HdB: 138
MasVnrType 0 1 FALSE 4 Non: 639, Brk: 327, Sto: 119, Brk: 9
ExterQual 0 1 FALSE 4 TA: 646, Gd: 395, Ex: 46, Fa: 7
ExterCond 0 1 FALSE 4 TA: 973, Gd: 104, Fa: 15, Ex: 2
Foundation 0 1 FALSE 5 PCo: 518, CBl: 446, Brk: 122, Sto: 6
BsmtQual 0 1 FALSE 4 TA: 486, Gd: 463, Ex: 113, Fa: 32
BsmtCond 0 1 FALSE 4 TA: 1006, Gd: 51, Fa: 36, Po: 1
BsmtExposure 0 1 FALSE 4 No: 734, Av: 174, Gd: 97, Mn: 89
BsmtFinType1 0 1 FALSE 6 Unf: 343, GLQ: 323, ALQ: 162, BLQ: 105
BsmtFinType2 0 1 FALSE 6 Unf: 972, Rec: 37, LwQ: 35, BLQ: 25
Heating 0 1 FALSE 4 Gas: 1075, Gas: 16, Gra: 2, Oth: 1
HeatingQC 0 1 FALSE 5 Ex: 594, TA: 298, Gd: 174, Fa: 27
CentralAir 0 1 FALSE 2 Y: 1036, N: 58
Electrical 0 1 FALSE 5 SBr: 1009, Fus: 67, Fus: 15, Fus: 2
KitchenQual 0 1 FALSE 4 TA: 528, Gd: 454, Ex: 91, Fa: 21
Functional 0 1 FALSE 6 Typ: 1024, Min: 25, Min: 21, Maj: 10
FireplaceQu 0 1 FALSE 6 Ukn: 511, Gd: 315, TA: 212, Fa: 24
GarageType 0 1 FALSE 6 Att: 680, Det: 325, Bui: 63, Bas: 15
GarageFinish 0 1 FALSE 3 Unf: 485, RFn: 333, Fin: 276
GarageQual 0 1 FALSE 5 TA: 1031, Fa: 46, Gd: 11, Ex: 3
GarageCond 0 1 FALSE 5 TA: 1050, Fa: 31, Po: 6, Gd: 5
PavedDrive 0 1 FALSE 3 Y: 1023, N: 48, P: 23
PoolQC 0 1 FALSE 4 Ukn: 1088, Ex: 2, Fa: 2, Gd: 2
Fence 0 1 FALSE 5 Ukn: 882, MnP: 117, GdP: 46, GdW: 39
MiscFeature 0 1 FALSE 4 Ukn: 1059, She: 33, Oth: 1, Ten: 1
SaleType 0 1 FALSE 9 WD: 928, New: 116, COD: 31, Con: 5
SaleCondition 0 1 FALSE 6 Nor: 880, Par: 119, Abn: 70, Fam: 18

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
MSSubClass 0 1 56.13 41.98 20 20.00 50.0 70.00 190
LotFrontage 0 1 70.76 24.51 21 60.00 70.0 80.00 313
LotArea 0 1 10132.35 8212.25 1300 7606.75 9444.5 11387.25 215245
OverallQual 0 1 6.25 1.37 2 5.00 6.0 7.00 10
OverallCond 0 1 5.58 1.07 2 5.00 5.0 6.00 9
YearBuilt 0 1 1972.41 31.19 1880 1953.00 1975.0 2003.00 2010
YearRemodAdd 0 1 1985.92 20.93 1950 1967.00 1995.0 2005.00 2010
MasVnrArea 0 1 109.86 190.67 0 0.00 0.0 171.75 1600
BsmtFinSF1 0 1 448.19 468.73 0 0.00 384.5 712.75 5644
BsmtFinSF2 0 1 45.25 159.08 0 0.00 0.0 0.00 1474
BsmtUnfSF 0 1 606.12 445.83 0 270.00 525.0 846.00 2336
TotalBsmtSF 0 1 1099.56 415.85 105 816.00 1023.0 1345.50 6110
X1stFlrSF 0 1 1173.81 387.68 438 894.00 1097.0 1413.50 4692
X2ndFlrSF 0 1 356.54 439.26 0 0.00 0.0 729.00 2065
LowQualFinSF 0 1 4.68 42.10 0 0.00 0.0 0.00 572
GrLivArea 0 1 1535.03 526.12 438 1164.00 1480.0 1779.00 5642
BsmtFullBath 0 1 0.42 0.51 0 0.00 0.0 1.00 2
BsmtHalfBath 0 1 0.06 0.24 0 0.00 0.0 0.00 2
FullBath 0 1 1.58 0.55 0 1.00 2.0 2.00 3
HalfBath 0 1 0.39 0.50 0 0.00 0.0 1.00 2
BedroomAbvGr 0 1 2.86 0.76 0 2.00 3.0 3.00 6
KitchenAbvGr 0 1 1.03 0.19 1 1.00 1.0 1.00 3
TotRmsAbvGrd 0 1 6.57 1.58 3 5.00 6.0 7.00 12
Fireplaces 0 1 0.61 0.63 0 0.00 1.0 1.00 3
GarageYrBlt 0 1 1978.57 25.93 1900 1960.00 1982.0 2003.00 2010
GarageCars 0 1 1.88 0.66 1 1.00 2.0 2.00 4
GarageArea 0 1 503.76 192.26 160 360.00 484.0 602.50 1418
WoodDeckSF 0 1 94.34 122.62 0 0.00 0.0 169.75 857
OpenPorchSF 0 1 46.95 64.82 0 0.00 28.0 68.00 547
EnclosedPorch 0 1 22.05 61.57 0 0.00 0.0 0.00 552
X3SsnPorch 0 1 3.27 29.66 0 0.00 0.0 0.00 508
ScreenPorch 0 1 16.50 58.46 0 0.00 0.0 0.00 480
PoolArea 0 1 3.01 40.71 0 0.00 0.0 0.00 648
MiscVal 0 1 23.55 167.14 0 0.00 0.0 0.00 2500
MoSold 0 1 6.34 2.69 1 5.00 6.0 8.00 12
YrSold 0 1 2007.79 1.33 2006 2007.00 2008.0 2009.00 2010
SalePrice 0 1 187033.26 83165.33 35311 132500.00 165750.0 221000.00 755000

Melalui skim_without_charts didapatkan ringkasan data (peubah faktor maupun numerik) tanpa grafik dari variabel “data_house1”.

# Utilities has only one observed level left after the missing-value
# handling, so it carries no information; remove it.
data_house1 <- select(data_house1, -Utilities)

Setelah dilihat kembali ternyata ada kolom yang hanya memiliki satu kategori saja yaitu kolom Utilities. Sehingga dilakukan penghapusan.

Memeriksa Sebaran Data

# Histograms of the continuous columns, laid out on 3 x 3 grids.
plot_histogram(
  data = data_house1,
  geom_histogram_args = list(fill = "steelblue"),
  ggtheme = theme_bw(),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Membuat histogram dari data rumah dalam variabel “data_house1”. Histogram akan dibuat dalam bentuk grid dengan 3 baris dan 3 kolom. Dan hasil histogram dapat digunakan untuk melihat pola distribusi data, seperti bentuk kurva, jumlah data yang terkonsentrasi di sekitar nilai tertentu, dan data yang tersebar di seluruh jangkauan nilai.

# Bar charts of the discrete columns, one row of panels at a time.
plot_bar(
  data = data_house1,
  nrow = 1,
  ggtheme = theme_bw()
)

Selanjutnya dibuat bar chart dari data rumah dalam variabel “data_house1” yang akan ditampilkan dalam satu baris. Dari bar chart tersebut, kita dapat melihat perbedaan jumlah atau proporsi antara kategori yang berbeda.

Memeriksa Korelasi Peubah

# Scatterplots of every numeric column against SalePrice.
# select(where(is.numeric)) replaces the superseded select_if() and matches
# the where() selection used elsewhere in this script.
plot_scatterplot(
  data = data_house1 %>% select(where(is.numeric)),
  by = "SalePrice",
  geom_point_args = list(color = "steelblue"),
  ggtheme = theme_bw()
)

Selanjutnya membuat scatterplot dari data rumah yang hanya memilih kolom numerik dan mengelompokkannya berdasarkan kolom “SalePrice”. Scatterplot akan menunjukkan hubungan antara dua variabel numerik dalam bentuk titik-titik dan kita dapat melihat hubungan antara dua variabel numerik.

# Spearman correlation matrix of all numeric columns.
# select(where(is.numeric)) replaces the superseded select_if().
cor_mat <- data_house1 %>%
  select(where(is.numeric)) %>%
  cor(method = "spearman")

# Blank out the diagonal and the upper triangle so that each pair of
# variables is kept exactly once.
cor_mat[upper.tri(cor_mat, diag = TRUE)] <- NA

# Reshape to long format (Var1, Var2, corr) and drop the blanked-out cells.
cor_df <- cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(
    cols = -Var1,
    names_to = "Var2",
    values_to = "corr"
  ) %>%
  na.omit()

# Pairs with an absolute correlation above 0.6, strongest first.
cor_df %>%
  filter(abs(corr) > 0.6) %>%
  arrange(desc(abs(corr)))
## # A tibble: 31 × 3
##    Var1         Var2          corr
##    <chr>        <chr>        <dbl>
##  1 GarageYrBlt  YearBuilt    0.895
##  2 X1stFlrSF    TotalBsmtSF  0.877
##  3 GarageArea   GarageCars   0.841
##  4 TotRmsAbvGrd GrLivArea    0.829
##  5 SalePrice    OverallQual  0.823
##  6 GarageYrBlt  YearRemodAdd 0.747
##  7 YearRemodAdd YearBuilt    0.738
##  8 SalePrice    GrLivArea    0.731
##  9 SalePrice    GarageCars   0.681
## 10 SalePrice    FullBath     0.671
## # … with 21 more rows
# Remaining pairs: absolute correlation of at most 0.6.
cor_df %>%
  filter(abs(corr) <= 0.6)
## # A tibble: 635 × 3
##    Var1        Var2            corr
##    <chr>       <chr>          <dbl>
##  1 LotFrontage MSSubClass  -0.313  
##  2 LotArea     MSSubClass  -0.255  
##  3 OverallQual MSSubClass   0.0992 
##  4 OverallQual LotFrontage  0.238  
##  5 OverallQual LotArea      0.283  
##  6 OverallCond MSSubClass  -0.0763 
##  7 OverallCond LotFrontage -0.0693 
##  8 OverallCond LotArea     -0.0873 
##  9 OverallCond OverallQual -0.264  
## 10 YearBuilt   MSSubClass  -0.00468
## # … with 625 more rows
# Names of all factor columns, followed by SalePrice.
cat_var_names <- names(
  select(data_house1, where(is.factor), SalePrice)
)
cat_var_names
##  [1] "MSZoning"      "Street"        "Alley"         "LotShape"     
##  [5] "LandContour"   "LotConfig"     "LandSlope"     "Neighborhood" 
##  [9] "Condition1"    "Condition2"    "BldgType"      "HouseStyle"   
## [13] "RoofStyle"     "RoofMatl"      "Exterior1st"   "Exterior2nd"  
## [17] "MasVnrType"    "ExterQual"     "ExterCond"     "Foundation"   
## [21] "BsmtQual"      "BsmtCond"      "BsmtExposure"  "BsmtFinType1" 
## [25] "BsmtFinType2"  "Heating"       "HeatingQC"     "CentralAir"   
## [29] "Electrical"    "KitchenQual"   "Functional"    "FireplaceQu"  
## [33] "GarageType"    "GarageFinish"  "GarageQual"    "GarageCond"   
## [37] "PavedDrive"    "PoolQC"        "Fence"         "MiscFeature"  
## [41] "SaleType"      "SaleCondition" "SalePrice"

Kode di atas digunakan untuk membuat matriks korelasi Spearman antar variabel numerik dalam data “data_house1”, lalu menampilkan pasangan variabel dengan nilai mutlak koefisien korelasi lebih besar dari 0,6 (serta, secara terpisah, pasangan dengan nilai mutlak tidak lebih dari 0,6) dalam bentuk data frame. Kemudian, kode juga mencari nama variabel kategorikal dalam data “data_house1” (ditambah “SalePrice”) yang kemudian disimpan dalam variabel “cat_var_names”.

# Draw SalePrice boxplots grouped by each categorical variable in turn.
# The subset of columns is the same for every iteration, so it is computed
# once up front.
box_data <- data_house1 %>%
  select(where(is.factor), SalePrice)

# Exclude SalePrice by name rather than by position (previously [-43]), so
# the loop keeps working if the number of factor columns changes.
for (i in setdiff(cat_var_names, "SalePrice")) {
  plot_boxplot(
    data = box_data,
    by = i,
    geom_boxplot_args = list(fill = "steelblue"),
    ggtheme = theme_bw()
  )
}

Ditampilkan beberapa boxplot dari setiap variabel kategorikal dalam data “data_house1” kecuali variabel “SalePrice”. Setiap boxplot menunjukkan distribusi harga jual (SalePrice) untuk setiap kategori dalam variabel kategorikal yang diberikan. Analisis hasilnya bergantung pada variabel kategorikal yang digunakan. Kita dapat membandingkan distribusi harga jual di antara setiap kelompok dalam variabel kategorikal.

REGRESI LINEAR BERGANDA

Install Package

install.packages("rsample")
install.packages("DataExplorer")
install.packages("sjPlot")
install.packages("openxlsx")
install.packages("lmtest")
install.packages("fBasics")
install.packages("mlr3measures")

Memanggil Package

library(rsample)
library(DataExplorer)
library(sjPlot)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(openxlsx)

Import Data

# Load the insurance CSV (character columns are read as factors) and show
# the first rows.
data_insurance <- read.csv(
  file = "C:/Users/afwa/Downloads/insurance.csv",
  stringsAsFactors = TRUE
)
head(data_insurance)
##   age    sex    bmi children smoker    region  expenses
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622

Data Exploration

# 1. Inspect the distribution of the continuous variables
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Empat histogram di atas menampilkan frekuensi dari empat variabel data age, bmi, children dan expenses. Dapat dilihat frekuensi variabel dengan memperhatikan angka-angka pada histogram.

# Transform the response: replace expenses by its natural logarithm.
# NOTE: the column is overwritten in place, so running this statement a
# second time would apply log() again.
data_insurance[["expenses"]] <- log(data_insurance[["expenses"]])

# Histograms again, now with expenses on the log scale.
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histogram di atas menampilkan histogram dari variabel expenses yang telah diubah skalanya dengan transformasi data. Transformasi data dilakukan untuk tujuan mengubah skala pengukuran data asli menjadi bentuk lain sehingga dapat memenuhi syarat regresi linier.

# Distribution of the categorical variables
plot_bar(
  data = data_insurance,
  nrow = 3,
  ncol = 3
)

Plot bar di atas menampilkan frekuensi sebaran sex, smoker dan region dari data. Plot ini menampilkan jumlah dari data pada setiap sebaran.

# 2. Inspect the relationship between the numeric predictors and expenses
plot_scatterplot(
  data = data_insurance[, c("expenses", "age", "bmi", "children")],
  by = "expenses",
  geom_point_args = list(color = "steelblue")
)

Selanjutnya dilakukan pemeriksaan korelasi antara variabel. Secara sederhana, pemeriksaan ini adalah pemeriksaan linearitas secara visual, yaitu untuk memeriksa apakah terdapat hubungan antara variabel independen dengan variabel dependen. Hal ini dapat dilihat dari sebaran data: jika sebaran data terlihat berhubungan satu sama lain maka data tersebut linear, sedangkan jika sebaran data tersebut tak terarah maka dapat disebut data tersebut tidak linear. Terlihat dari ketiga variabel bebas yang diperiksa terhadap variabel terikat di atas, bahwa sebaran datanya berkorelasi satu sama lain sehingga dapat disebut sebagai data yang linear. Karena pemeriksaan tersebut telah berhasil, maka dapat dilakukan regresi linear berganda.

Model Regresi Linear

# Multiple linear regression of expenses (log-transformed earlier) on all
# remaining columns, followed by the coefficient table.
regresi <- lm(
  formula = expenses ~ .,
  data = data_insurance
)
summary(regresi)
## 
## Call:
## lm(formula = expenses ~ ., data = data_insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07186 -0.19835 -0.04917  0.06598  2.16636 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0305581  0.0723960  97.112  < 2e-16 ***
## age              0.0345816  0.0008721  39.655  < 2e-16 ***
## sexmale         -0.0754164  0.0244012  -3.091 0.002038 ** 
## bmi              0.0133748  0.0020960   6.381 2.42e-10 ***
## children         0.1018568  0.0100995  10.085  < 2e-16 ***
## smokeryes        1.5543228  0.0302795  51.333  < 2e-16 ***
## regionnorthwest -0.0637876  0.0349057  -1.827 0.067860 .  
## regionsoutheast -0.1571967  0.0350828  -4.481 8.08e-06 ***
## regionsouthwest -0.1289522  0.0350271  -3.681 0.000241 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared:  0.7679, Adjusted R-squared:  0.7666 
## F-statistic: 549.8 on 8 and 1329 DF,  p-value: < 2.2e-16
# Coefficient plot: estimates sorted by size and passed through exp().
plot_model(
  regresi,
  type = "est",
  transform = "exp",
  sort.est = TRUE
)

# Predicted values of the response, one plot per model term.
plot_model(
  model = regresi,
  type = "pred"
)
## $age

## 
## $sex

## 
## $bmi

## 
## $children

## 
## $smoker

## 
## $region

## Model Checking

# Diagnostic plots for the fitted regression.
plot_model(
  regresi,
  type = "diag"
)
## [[1]]

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

# Extract the model residuals, then run the first normality test
# (Shapiro-Wilk).
res <- residuals(regresi)
shapiro.test(x = res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.8373, p-value < 2.2e-16
# Jarque-Bera normality test on the residuals.
fBasics::jarqueberaTest(x = res)
## 
## Title:
##  Jarque - Bera Normalality Test
## 
## Test Results:
##   STATISTIC:
##     X-squared: 1673.7604
##   P VALUE:
##     Asymptotic p Value: < 2.2e-16 
## 
## Description:
##  Fri Feb 24 21:27:53 2023 by user: afwa
# Kolmogorov-Smirnov normality test on the residuals.
# ksnormTest() compares its input with the standard normal distribution
# ("pnorm" with default mean 0 and sd 1), so the residuals are standardised
# first; on the raw residuals the test would also reject simply because
# their spread differs from 1. The stray empty argument was removed as well.
fBasics::ksnormTest(as.numeric(scale(res)))
## Warning in ks.test.default(x, "pnorm", alternative = "two.sided"): ties should
## not be present for the Kolmogorov-Smirnov test
## Warning in ks.test.default(x, "pnorm", alternative = "less"): ties should not
## be present for the Kolmogorov-Smirnov test
## Warning in ks.test.default(x, "pnorm", alternative = "greater"): ties should
## not be present for the Kolmogorov-Smirnov test
## 
## Title:
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## Test Results:
##   STATISTIC:
##     D: 0.2831
##   P VALUE:
##     Alternative Two-Sided: < 2.2e-16 
##     Alternative      Less: < 2.2e-16 
##     Alternative   Greater: < 2.2e-16 
## 
## Description:
##  Fri Feb 24 21:27:53 2023 by user: afwa
# Anderson-Darling normality test on the residuals.
print(fBasics::adTest(x = res))
## 
## Title:
##  Anderson - Darling Normality Test
## 
## Test Results:
##   STATISTIC:
##     A: 74.8074
##   P VALUE:
##     < 2.2e-16 
## 
## Description:
##  Fri Feb 24 21:27:53 2023 by user: afwa
# Test for homogeneity of the error variance (Breusch-Pagan test).
# studentize is spelled out as FALSE: the shorthand F is an ordinary
# variable that can be reassigned, whereas FALSE is a reserved word.
lmtest::bptest(
  expenses ~ .,
  data = data_insurance,
  studentize = FALSE
)
## 
##  Breusch-Pagan test
## 
## data:  expenses ~ .
## BP = 243.98, df = 8, p-value < 2.2e-16
# Linear regression prediction

# Split the data into training (80%) and testing sets; the seed fixes the
# random split so it can be reproduced.
set.seed(123)
data_split <- initial_split(
  data = data_insurance,
  prop = 0.8
)

train1 <- training(data_split)
test1 <- testing(data_split)

# Refit the same model specification on the training rows only.
regresi2 <- lm(
  formula = expenses ~ .,
  data = train1
)

# Predict the testing rows
prediksi <- predict(
  regresi2,
  newdata = test1
)
head(prediksi)
##       14       15       21       22       27       33 
## 9.347747 9.893421 9.580387 8.495743 9.490087 8.474506
# Evaluate the predictions

# RMSE (on the log scale, since expenses was log-transformed above)
mlr3measures::rmse(
  truth = test1$expenses,
  response = prediksi
)
## [1] 0.4415859
# MAPE (on the log scale, since expenses was log-transformed above)
mlr3measures::mape(
  truth = test1$expenses,
  response = prediksi
)
## [1] 0.03175772
# Spearman rank correlation between predicted and observed values
mlr3measures::srho(
  truth = test1$expenses,
  response = prediksi
)
## [1] 0.9039972

Regresi linear berganda yang telah dilakukan di atas dapat dilihat kesimpulannya dengan melihat Spearman correlation-nya. Hasil yang didapat adalah 0.9039972, yang mendekati 1. Korelasi ini dihitung antara nilai prediksi dan nilai aktual expenses pada data testing; semakin mendekati 1, semakin kuat kesesuaian antara hasil prediksi model dan nilai sebenarnya.