install.packages("tidyverse")
install.packages("skimr")
install.packages("ggpubr")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(ggpubr)
# Read the house-price CSV; stringsAsFactors = TRUE loads text columns as factors.
data_house <- read.csv(
  file = "C:/Users/afwa/Downloads/house_price1.csv",
  stringsAsFactors = TRUE
)
# Print a compact per-column overview (type and leading values).
glimpse(data_house)
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1 <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2 <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd, …
## $ GarageType <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, N…
## $ MiscFeature <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, …
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
Didapatkan tampilan informasi singkat mengenai dataset house_price1 yang dimasukkan ke dalam variabel data_house.
# Summarise every column of data_house (missing counts, completeness and
# per-type statistics) without the inline histogram column.
skim_without_charts(data_house)
| Name | data_house |
| Number of rows | 1460 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| factor | 43 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| MSZoning | 0 | 1.00 | FALSE | 5 | RL: 1151, RM: 218, FV: 65, RH: 16 |
| Street | 0 | 1.00 | FALSE | 2 | Pav: 1454, Grv: 6 |
| Alley | 1369 | 0.06 | FALSE | 2 | Grv: 50, Pav: 41 |
| LotShape | 0 | 1.00 | FALSE | 4 | Reg: 925, IR1: 484, IR2: 41, IR3: 10 |
| LandContour | 0 | 1.00 | FALSE | 4 | Lvl: 1311, Bnk: 63, HLS: 50, Low: 36 |
| Utilities | 0 | 1.00 | FALSE | 2 | All: 1459, NoS: 1 |
| LotConfig | 0 | 1.00 | FALSE | 5 | Ins: 1052, Cor: 263, Cul: 94, FR2: 47 |
| LandSlope | 0 | 1.00 | FALSE | 3 | Gtl: 1382, Mod: 65, Sev: 13 |
| Neighborhood | 0 | 1.00 | FALSE | 25 | NAm: 225, Col: 150, Old: 113, Edw: 100 |
| Condition1 | 0 | 1.00 | FALSE | 9 | Nor: 1260, Fee: 81, Art: 48, RRA: 26 |
| Condition2 | 0 | 1.00 | FALSE | 8 | Nor: 1445, Fee: 6, Art: 2, Pos: 2 |
| BldgType | 0 | 1.00 | FALSE | 5 | 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43 |
| HouseStyle | 0 | 1.00 | FALSE | 8 | 1St: 726, 2St: 445, 1.5: 154, SLv: 65 |
| RoofStyle | 0 | 1.00 | FALSE | 6 | Gab: 1141, Hip: 286, Fla: 13, Gam: 11 |
| RoofMatl | 0 | 1.00 | FALSE | 8 | Com: 1434, Tar: 11, WdS: 6, WdS: 5 |
| Exterior1st | 0 | 1.00 | FALSE | 15 | Vin: 515, HdB: 222, Met: 220, Wd : 206 |
| Exterior2nd | 0 | 1.00 | FALSE | 16 | Vin: 504, Met: 214, HdB: 207, Wd : 197 |
| MasVnrType | 8 | 0.99 | FALSE | 4 | Non: 864, Brk: 445, Sto: 128, Brk: 15 |
| ExterQual | 0 | 1.00 | FALSE | 4 | TA: 906, Gd: 488, Ex: 52, Fa: 14 |
| ExterCond | 0 | 1.00 | FALSE | 5 | TA: 1282, Gd: 146, Fa: 28, Ex: 3 |
| Foundation | 0 | 1.00 | FALSE | 6 | PCo: 647, CBl: 634, Brk: 146, Sla: 24 |
| BsmtQual | 37 | 0.97 | FALSE | 4 | TA: 649, Gd: 618, Ex: 121, Fa: 35 |
| BsmtCond | 37 | 0.97 | FALSE | 4 | TA: 1311, Gd: 65, Fa: 45, Po: 2 |
| BsmtExposure | 38 | 0.97 | FALSE | 4 | No: 953, Av: 221, Gd: 134, Mn: 114 |
| BsmtFinType1 | 37 | 0.97 | FALSE | 6 | Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148 |
| BsmtFinType2 | 38 | 0.97 | FALSE | 6 | Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33 |
| Heating | 0 | 1.00 | FALSE | 6 | Gas: 1428, Gas: 18, Gra: 7, Wal: 4 |
| HeatingQC | 0 | 1.00 | FALSE | 5 | Ex: 741, TA: 428, Gd: 241, Fa: 49 |
| CentralAir | 0 | 1.00 | FALSE | 2 | Y: 1365, N: 95 |
| Electrical | 1 | 1.00 | FALSE | 5 | SBr: 1334, Fus: 94, Fus: 27, Fus: 3 |
| KitchenQual | 0 | 1.00 | FALSE | 4 | TA: 735, Gd: 586, Ex: 100, Fa: 39 |
| Functional | 0 | 1.00 | FALSE | 7 | Typ: 1360, Min: 34, Min: 31, Mod: 15 |
| FireplaceQu | 690 | 0.53 | FALSE | 5 | Gd: 380, TA: 313, Fa: 33, Ex: 24 |
| GarageType | 81 | 0.94 | FALSE | 6 | Att: 870, Det: 387, Bui: 88, Bas: 19 |
| GarageFinish | 81 | 0.94 | FALSE | 3 | Unf: 605, RFn: 422, Fin: 352 |
| GarageQual | 81 | 0.94 | FALSE | 5 | TA: 1311, Fa: 48, Gd: 14, Ex: 3 |
| GarageCond | 81 | 0.94 | FALSE | 5 | TA: 1326, Fa: 35, Gd: 9, Po: 7 |
| PavedDrive | 0 | 1.00 | FALSE | 3 | Y: 1340, N: 90, P: 30 |
| PoolQC | 1453 | 0.00 | FALSE | 3 | Gd: 3, Ex: 2, Fa: 2 |
| Fence | 1179 | 0.19 | FALSE | 4 | MnP: 157, GdP: 59, GdW: 54, MnW: 11 |
| MiscFeature | 1406 | 0.04 | FALSE | 4 | She: 49, Gar: 2, Oth: 2, Ten: 1 |
| SaleType | 0 | 1.00 | FALSE | 9 | WD: 1267, New: 122, COD: 43, Con: 9 |
| SaleCondition | 0 | 1.00 | FALSE | 6 | Nor: 1198, Par: 125, Abn: 101, Fam: 20 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 |
| MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 |
| LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 |
| LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 |
| OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 |
| OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 |
| YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 |
| MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 |
| BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 |
| BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 |
| BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 |
| TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 |
| X1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 |
| X2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 |
| LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 |
| GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 |
| BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 |
| FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 |
| BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 |
| TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 |
| Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 |
| GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 |
| GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 |
| GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 |
| WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 |
| OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 |
| EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 |
| X3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 |
| ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 |
| PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 |
| MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 |
| MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 |
| YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 |
| SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 |
# Remove the Alley column, then keep only the rows that have no missing value
# in any of the remaining columns.
# NOTE(review): the summary printed below reports a single surviving row,
# because sparse columns such as PoolQC are missing on almost every row;
# confirm that this listwise deletion is really what is intended.
dt_res1 <- na.omit(select(data_house, -Alley))
skim_without_charts(dt_res1)
| Name | dt_res1 |
| Number of rows | 1 |
| Number of columns | 80 |
| _______________________ | |
| Column type frequency: | |
| factor | 42 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| MSZoning | 0 | 1 | FALSE | 1 | RL: 1, C (: 0, FV: 0, RH: 0 |
| Street | 0 | 1 | FALSE | 1 | Pav: 1, Grv: 0 |
| LotShape | 0 | 1 | FALSE | 1 | IR1: 1, IR2: 0, IR3: 0, Reg: 0 |
| LandContour | 0 | 1 | FALSE | 1 | Lvl: 1, Bnk: 0, HLS: 0, Low: 0 |
| Utilities | 0 | 1 | FALSE | 1 | All: 1, NoS: 0 |
| LotConfig | 0 | 1 | FALSE | 1 | Ins: 1, Cor: 0, Cul: 0, FR2: 0 |
| LandSlope | 0 | 1 | FALSE | 1 | Gtl: 1, Mod: 0, Sev: 0 |
| Neighborhood | 0 | 1 | FALSE | 1 | NWA: 1, Blm: 0, Blu: 0, BrD: 0 |
| Condition1 | 0 | 1 | FALSE | 1 | RRA: 1, Art: 0, Fee: 0, Nor: 0 |
| Condition2 | 0 | 1 | FALSE | 1 | Nor: 1, Art: 0, Fee: 0, Pos: 0 |
| BldgType | 0 | 1 | FALSE | 1 | 1Fa: 1, 2fm: 0, Dup: 0, Twn: 0 |
| HouseStyle | 0 | 1 | FALSE | 1 | 2St: 1, 1.5: 0, 1.5: 0, 1St: 0 |
| RoofStyle | 0 | 1 | FALSE | 1 | Gab: 1, Fla: 0, Gam: 0, Hip: 0 |
| RoofMatl | 0 | 1 | FALSE | 1 | Com: 1, Cly: 0, Mem: 0, Met: 0 |
| Exterior1st | 0 | 1 | FALSE | 1 | Ply: 1, Asb: 0, Asp: 0, Brk: 0 |
| Exterior2nd | 0 | 1 | FALSE | 1 | Ply: 1, Asb: 0, Asp: 0, Brk: 0 |
| MasVnrType | 0 | 1 | FALSE | 1 | Brk: 1, Brk: 0, Non: 0, Sto: 0 |
| ExterQual | 0 | 1 | FALSE | 1 | TA: 1, Ex: 0, Fa: 0, Gd: 0 |
| ExterCond | 0 | 1 | FALSE | 1 | TA: 1, Ex: 0, Fa: 0, Gd: 0 |
| Foundation | 0 | 1 | FALSE | 1 | CBl: 1, Brk: 0, PCo: 0, Sla: 0 |
| BsmtQual | 0 | 1 | FALSE | 1 | Gd: 1, Ex: 0, Fa: 0, TA: 0 |
| BsmtCond | 0 | 1 | FALSE | 1 | TA: 1, Fa: 0, Gd: 0, Po: 0 |
| BsmtExposure | 0 | 1 | FALSE | 1 | No: 1, Av: 0, Gd: 0, Mn: 0 |
| BsmtFinType1 | 0 | 1 | FALSE | 1 | BLQ: 1, ALQ: 0, GLQ: 0, LwQ: 0 |
| BsmtFinType2 | 0 | 1 | FALSE | 1 | LwQ: 1, ALQ: 0, BLQ: 0, GLQ: 0 |
| Heating | 0 | 1 | FALSE | 1 | Gas: 1, Flo: 0, Gas: 0, Gra: 0 |
| HeatingQC | 0 | 1 | FALSE | 1 | TA: 1, Ex: 0, Fa: 0, Gd: 0 |
| CentralAir | 0 | 1 | FALSE | 1 | Y: 1, N: 0 |
| Electrical | 0 | 1 | FALSE | 1 | SBr: 1, Fus: 0, Fus: 0, Fus: 0 |
| KitchenQual | 0 | 1 | FALSE | 1 | Gd: 1, Ex: 0, Fa: 0, TA: 0 |
| Functional | 0 | 1 | FALSE | 1 | Typ: 1, Maj: 0, Maj: 0, Min: 0 |
| FireplaceQu | 0 | 1 | FALSE | 1 | TA: 1, Ex: 0, Fa: 0, Gd: 0 |
| GarageType | 0 | 1 | FALSE | 1 | Att: 1, 2Ty: 0, Bas: 0, Bui: 0 |
| GarageFinish | 0 | 1 | FALSE | 1 | RFn: 1, Fin: 0, Unf: 0 |
| GarageQual | 0 | 1 | FALSE | 1 | TA: 1, Ex: 0, Fa: 0, Gd: 0 |
| GarageCond | 0 | 1 | FALSE | 1 | TA: 1, Ex: 0, Fa: 0, Gd: 0 |
| PavedDrive | 0 | 1 | FALSE | 1 | Y: 1, N: 0, P: 0 |
| PoolQC | 0 | 1 | FALSE | 1 | Fa: 1, Ex: 0, Gd: 0 |
| Fence | 0 | 1 | FALSE | 1 | MnP: 1, GdP: 0, GdW: 0, MnW: 0 |
| MiscFeature | 0 | 1 | FALSE | 1 | Ten: 1, Gar: 0, Oth: 0, She: 0 |
| SaleType | 0 | 1 | FALSE | 1 | WD: 1, COD: 0, Con: 0, Con: 0 |
| SaleCondition | 0 | 1 | FALSE | 1 | Nor: 1, Abn: 0, Adj: 0, All: 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1 | 1387 | NA | 1387 | 1387 | 1387 | 1387 | 1387 |
| MSSubClass | 0 | 1 | 60 | NA | 60 | 60 | 60 | 60 | 60 |
| LotFrontage | 0 | 1 | 80 | NA | 80 | 80 | 80 | 80 | 80 |
| LotArea | 0 | 1 | 16692 | NA | 16692 | 16692 | 16692 | 16692 | 16692 |
| OverallQual | 0 | 1 | 7 | NA | 7 | 7 | 7 | 7 | 7 |
| OverallCond | 0 | 1 | 5 | NA | 5 | 5 | 5 | 5 | 5 |
| YearBuilt | 0 | 1 | 1978 | NA | 1978 | 1978 | 1978 | 1978 | 1978 |
| YearRemodAdd | 0 | 1 | 1978 | NA | 1978 | 1978 | 1978 | 1978 | 1978 |
| MasVnrArea | 0 | 1 | 184 | NA | 184 | 184 | 184 | 184 | 184 |
| BsmtFinSF1 | 0 | 1 | 790 | NA | 790 | 790 | 790 | 790 | 790 |
| BsmtFinSF2 | 0 | 1 | 469 | NA | 469 | 469 | 469 | 469 | 469 |
| BsmtUnfSF | 0 | 1 | 133 | NA | 133 | 133 | 133 | 133 | 133 |
| TotalBsmtSF | 0 | 1 | 1392 | NA | 1392 | 1392 | 1392 | 1392 | 1392 |
| X1stFlrSF | 0 | 1 | 1392 | NA | 1392 | 1392 | 1392 | 1392 | 1392 |
| X2ndFlrSF | 0 | 1 | 1392 | NA | 1392 | 1392 | 1392 | 1392 | 1392 |
| LowQualFinSF | 0 | 1 | 0 | NA | 0 | 0 | 0 | 0 | 0 |
| GrLivArea | 0 | 1 | 2784 | NA | 2784 | 2784 | 2784 | 2784 | 2784 |
| BsmtFullBath | 0 | 1 | 1 | NA | 1 | 1 | 1 | 1 | 1 |
| BsmtHalfBath | 0 | 1 | 0 | NA | 0 | 0 | 0 | 0 | 0 |
| FullBath | 0 | 1 | 3 | NA | 3 | 3 | 3 | 3 | 3 |
| HalfBath | 0 | 1 | 1 | NA | 1 | 1 | 1 | 1 | 1 |
| BedroomAbvGr | 0 | 1 | 5 | NA | 5 | 5 | 5 | 5 | 5 |
| KitchenAbvGr | 0 | 1 | 1 | NA | 1 | 1 | 1 | 1 | 1 |
| TotRmsAbvGrd | 0 | 1 | 12 | NA | 12 | 12 | 12 | 12 | 12 |
| Fireplaces | 0 | 1 | 2 | NA | 2 | 2 | 2 | 2 | 2 |
| GarageYrBlt | 0 | 1 | 1978 | NA | 1978 | 1978 | 1978 | 1978 | 1978 |
| GarageCars | 0 | 1 | 2 | NA | 2 | 2 | 2 | 2 | 2 |
| GarageArea | 0 | 1 | 564 | NA | 564 | 564 | 564 | 564 | 564 |
| WoodDeckSF | 0 | 1 | 0 | NA | 0 | 0 | 0 | 0 | 0 |
| OpenPorchSF | 0 | 1 | 112 | NA | 112 | 112 | 112 | 112 | 112 |
| EnclosedPorch | 0 | 1 | 0 | NA | 0 | 0 | 0 | 0 | 0 |
| X3SsnPorch | 0 | 1 | 0 | NA | 0 | 0 | 0 | 0 | 0 |
| ScreenPorch | 0 | 1 | 440 | NA | 440 | 440 | 440 | 440 | 440 |
| PoolArea | 0 | 1 | 519 | NA | 519 | 519 | 519 | 519 | 519 |
| MiscVal | 0 | 1 | 2000 | NA | 2000 | 2000 | 2000 | 2000 | 2000 |
| MoSold | 0 | 1 | 7 | NA | 7 | 7 | 7 | 7 | 7 |
| YrSold | 0 | 1 | 2006 | NA | 2006 | 2006 | 2006 | 2006 | 2006 |
| SalePrice | 0 | 1 | 250000 | NA | 250000 | 250000 | 250000 | 250000 | 250000 |
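Dilakukan penghapusan data. Pertama, melalui skim_without_charts didapatkan ringkasan data (variabel bertipe factor dan numerik) tanpa grafik dari variabel “data_house”. Selanjutnya dilakukan penghapusan kolom Alley dan penghapusan baris yang mengandung missing value, lalu didapatkan informasi seperti jumlah kolom dan baris serta frekuensi tipe kolom dari data tersebut tanpa grafik melalui skim_without_charts dari variabel “dt_res1”. Perlu diperhatikan bahwa setelah penghapusan tersebut hanya tersisa 1 baris, karena kolom seperti PoolQC, MiscFeature, dan Fence kosong pada hampir semua baris.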
# Summarise the unmodified data_house again (same call as earlier in this
# script), without the inline histogram column.
skim_without_charts(data_house)
| Name | data_house |
| Number of rows | 1460 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| factor | 43 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| MSZoning | 0 | 1.00 | FALSE | 5 | RL: 1151, RM: 218, FV: 65, RH: 16 |
| Street | 0 | 1.00 | FALSE | 2 | Pav: 1454, Grv: 6 |
| Alley | 1369 | 0.06 | FALSE | 2 | Grv: 50, Pav: 41 |
| LotShape | 0 | 1.00 | FALSE | 4 | Reg: 925, IR1: 484, IR2: 41, IR3: 10 |
| LandContour | 0 | 1.00 | FALSE | 4 | Lvl: 1311, Bnk: 63, HLS: 50, Low: 36 |
| Utilities | 0 | 1.00 | FALSE | 2 | All: 1459, NoS: 1 |
| LotConfig | 0 | 1.00 | FALSE | 5 | Ins: 1052, Cor: 263, Cul: 94, FR2: 47 |
| LandSlope | 0 | 1.00 | FALSE | 3 | Gtl: 1382, Mod: 65, Sev: 13 |
| Neighborhood | 0 | 1.00 | FALSE | 25 | NAm: 225, Col: 150, Old: 113, Edw: 100 |
| Condition1 | 0 | 1.00 | FALSE | 9 | Nor: 1260, Fee: 81, Art: 48, RRA: 26 |
| Condition2 | 0 | 1.00 | FALSE | 8 | Nor: 1445, Fee: 6, Art: 2, Pos: 2 |
| BldgType | 0 | 1.00 | FALSE | 5 | 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43 |
| HouseStyle | 0 | 1.00 | FALSE | 8 | 1St: 726, 2St: 445, 1.5: 154, SLv: 65 |
| RoofStyle | 0 | 1.00 | FALSE | 6 | Gab: 1141, Hip: 286, Fla: 13, Gam: 11 |
| RoofMatl | 0 | 1.00 | FALSE | 8 | Com: 1434, Tar: 11, WdS: 6, WdS: 5 |
| Exterior1st | 0 | 1.00 | FALSE | 15 | Vin: 515, HdB: 222, Met: 220, Wd : 206 |
| Exterior2nd | 0 | 1.00 | FALSE | 16 | Vin: 504, Met: 214, HdB: 207, Wd : 197 |
| MasVnrType | 8 | 0.99 | FALSE | 4 | Non: 864, Brk: 445, Sto: 128, Brk: 15 |
| ExterQual | 0 | 1.00 | FALSE | 4 | TA: 906, Gd: 488, Ex: 52, Fa: 14 |
| ExterCond | 0 | 1.00 | FALSE | 5 | TA: 1282, Gd: 146, Fa: 28, Ex: 3 |
| Foundation | 0 | 1.00 | FALSE | 6 | PCo: 647, CBl: 634, Brk: 146, Sla: 24 |
| BsmtQual | 37 | 0.97 | FALSE | 4 | TA: 649, Gd: 618, Ex: 121, Fa: 35 |
| BsmtCond | 37 | 0.97 | FALSE | 4 | TA: 1311, Gd: 65, Fa: 45, Po: 2 |
| BsmtExposure | 38 | 0.97 | FALSE | 4 | No: 953, Av: 221, Gd: 134, Mn: 114 |
| BsmtFinType1 | 37 | 0.97 | FALSE | 6 | Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148 |
| BsmtFinType2 | 38 | 0.97 | FALSE | 6 | Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33 |
| Heating | 0 | 1.00 | FALSE | 6 | Gas: 1428, Gas: 18, Gra: 7, Wal: 4 |
| HeatingQC | 0 | 1.00 | FALSE | 5 | Ex: 741, TA: 428, Gd: 241, Fa: 49 |
| CentralAir | 0 | 1.00 | FALSE | 2 | Y: 1365, N: 95 |
| Electrical | 1 | 1.00 | FALSE | 5 | SBr: 1334, Fus: 94, Fus: 27, Fus: 3 |
| KitchenQual | 0 | 1.00 | FALSE | 4 | TA: 735, Gd: 586, Ex: 100, Fa: 39 |
| Functional | 0 | 1.00 | FALSE | 7 | Typ: 1360, Min: 34, Min: 31, Mod: 15 |
| FireplaceQu | 690 | 0.53 | FALSE | 5 | Gd: 380, TA: 313, Fa: 33, Ex: 24 |
| GarageType | 81 | 0.94 | FALSE | 6 | Att: 870, Det: 387, Bui: 88, Bas: 19 |
| GarageFinish | 81 | 0.94 | FALSE | 3 | Unf: 605, RFn: 422, Fin: 352 |
| GarageQual | 81 | 0.94 | FALSE | 5 | TA: 1311, Fa: 48, Gd: 14, Ex: 3 |
| GarageCond | 81 | 0.94 | FALSE | 5 | TA: 1326, Fa: 35, Gd: 9, Po: 7 |
| PavedDrive | 0 | 1.00 | FALSE | 3 | Y: 1340, N: 90, P: 30 |
| PoolQC | 1453 | 0.00 | FALSE | 3 | Gd: 3, Ex: 2, Fa: 2 |
| Fence | 1179 | 0.19 | FALSE | 4 | MnP: 157, GdP: 59, GdW: 54, MnW: 11 |
| MiscFeature | 1406 | 0.04 | FALSE | 4 | She: 49, Gar: 2, Oth: 2, Ten: 1 |
| SaleType | 0 | 1.00 | FALSE | 9 | WD: 1267, New: 122, COD: 43, Con: 9 |
| SaleCondition | 0 | 1.00 | FALSE | 6 | Nor: 1198, Par: 125, Abn: 101, Fam: 20 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 |
| MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 |
| LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 |
| LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 |
| OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 |
| OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 |
| YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 |
| MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 |
| BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 |
| BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 |
| BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 |
| TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 |
| X1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 |
| X2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 |
| LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 |
| GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 |
| BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 |
| FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 |
| BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 |
| TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 |
| Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 |
| GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 |
| GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 |
| GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 |
| WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 |
| OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 |
| EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 |
| X3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 |
| ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 |
| PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 |
| MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 |
| MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 |
| YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 |
| SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 |
# Normalise the misspelled Street labels "Pavr" and "Pavd" to "Pave"; every
# other value is kept as it is.
dt_res2 <- mutate(
  data_house,
  Street = case_when(
    Street %in% c("Pavr", "Pavd") ~ "Pave",
    .default = Street
  )
)
# Tabulate how often each Street value occurs after the correction.
count(dt_res2, Street)
## Street n
## 1 Grvl 6
## 2 Pave 1454
Dilakukan koreksi kesalahan yang dimulai dengan mengganti “Pavr” dan “Pavd” menjadi “Pave” menggunakan fungsi case_when. Kemudian, menggunakan fungsi count untuk menghitung jumlah masing-masing nilai pada kolom “Street” yang telah diubah.
# List GarageArea and GarageType for the houses whose garage area is zero.
select(
  filter(data_house, GarageArea == 0),
  GarageArea, GarageType
)
## GarageArea GarageType
## 1 0 <NA>
## 2 0 <NA>
## 3 0 <NA>
## 4 0 <NA>
## 5 0 <NA>
## 6 0 <NA>
## 7 0 <NA>
## 8 0 <NA>
## 9 0 <NA>
## 10 0 <NA>
## 11 0 <NA>
## 12 0 <NA>
## 13 0 <NA>
## 14 0 <NA>
## 15 0 <NA>
## 16 0 <NA>
## 17 0 <NA>
## 18 0 <NA>
## 19 0 <NA>
## 20 0 <NA>
## 21 0 <NA>
## 22 0 <NA>
## 23 0 <NA>
## 24 0 <NA>
## 25 0 <NA>
## 26 0 <NA>
## 27 0 <NA>
## 28 0 <NA>
## 29 0 <NA>
## 30 0 <NA>
## 31 0 <NA>
## 32 0 <NA>
## 33 0 <NA>
## 34 0 <NA>
## 35 0 <NA>
## 36 0 <NA>
## 37 0 <NA>
## 38 0 <NA>
## 39 0 <NA>
## 40 0 <NA>
## 41 0 <NA>
## 42 0 <NA>
## 43 0 <NA>
## 44 0 <NA>
## 45 0 <NA>
## 46 0 <NA>
## 47 0 <NA>
## 48 0 <NA>
## 49 0 <NA>
## 50 0 <NA>
## 51 0 <NA>
## 52 0 <NA>
## 53 0 <NA>
## 54 0 <NA>
## 55 0 <NA>
## 56 0 <NA>
## 57 0 <NA>
## 58 0 <NA>
## 59 0 <NA>
## 60 0 <NA>
## 61 0 <NA>
## 62 0 <NA>
## 63 0 <NA>
## 64 0 <NA>
## 65 0 <NA>
## 66 0 <NA>
## 67 0 <NA>
## 68 0 <NA>
## 69 0 <NA>
## 70 0 <NA>
## 71 0 <NA>
## 72 0 <NA>
## 73 0 <NA>
## 74 0 <NA>
## 75 0 <NA>
## 76 0 <NA>
## 77 0 <NA>
## 78 0 <NA>
## 79 0 <NA>
## 80 0 <NA>
## 81 0 <NA>
# Force GarageType to NA wherever GarageArea is 0; other rows keep their value.
# NOTE(review): the listing printed above already shows GarageType as NA for
# every GarageArea == 0 row, so this step appears to leave this data unchanged.
dt_res3 <- mutate(
  data_house,
  GarageType = case_when(
    GarageArea == 0 ~ NA,
    .default = GarageType
  )
)
# Re-check the same rows in the corrected data.
select(
  filter(dt_res3, GarageArea == 0),
  GarageArea, GarageType
)
## GarageArea GarageType
## 1 0 <NA>
## 2 0 <NA>
## 3 0 <NA>
## 4 0 <NA>
## 5 0 <NA>
## 6 0 <NA>
## 7 0 <NA>
## 8 0 <NA>
## 9 0 <NA>
## 10 0 <NA>
## 11 0 <NA>
## 12 0 <NA>
## 13 0 <NA>
## 14 0 <NA>
## 15 0 <NA>
## 16 0 <NA>
## 17 0 <NA>
## 18 0 <NA>
## 19 0 <NA>
## 20 0 <NA>
## 21 0 <NA>
## 22 0 <NA>
## 23 0 <NA>
## 24 0 <NA>
## 25 0 <NA>
## 26 0 <NA>
## 27 0 <NA>
## 28 0 <NA>
## 29 0 <NA>
## 30 0 <NA>
## 31 0 <NA>
## 32 0 <NA>
## 33 0 <NA>
## 34 0 <NA>
## 35 0 <NA>
## 36 0 <NA>
## 37 0 <NA>
## 38 0 <NA>
## 39 0 <NA>
## 40 0 <NA>
## 41 0 <NA>
## 42 0 <NA>
## 43 0 <NA>
## 44 0 <NA>
## 45 0 <NA>
## 46 0 <NA>
## 47 0 <NA>
## 48 0 <NA>
## 49 0 <NA>
## 50 0 <NA>
## 51 0 <NA>
## 52 0 <NA>
## 53 0 <NA>
## 54 0 <NA>
## 55 0 <NA>
## 56 0 <NA>
## 57 0 <NA>
## 58 0 <NA>
## 59 0 <NA>
## 60 0 <NA>
## 61 0 <NA>
## 62 0 <NA>
## 63 0 <NA>
## 64 0 <NA>
## 65 0 <NA>
## 66 0 <NA>
## 67 0 <NA>
## 68 0 <NA>
## 69 0 <NA>
## 70 0 <NA>
## 71 0 <NA>
## 72 0 <NA>
## 73 0 <NA>
## 74 0 <NA>
## 75 0 <NA>
## 76 0 <NA>
## 77 0 <NA>
## 78 0 <NA>
## 79 0 <NA>
## 80 0 <NA>
## 81 0 <NA>
Dilakukan filter pada data_house untuk memeriksa ketidakkonsistenan data, yaitu dengan memilih baris yang nilai GarageArea-nya 0 dan hanya menampilkan variabel GarageArea dan GarageType. Lalu case_when digunakan untuk mengubah nilai GarageType menjadi NA apabila GarageArea = 0. Kemudian hasilnya dimasukkan ke dalam variabel dt_res3.
# Histogram of the raw SalePrice values; expand = c(0, 0) removes the padding
# of the y axis.
gghistogram(data_house, x = "SalePrice", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
# Add the natural log of SalePrice as a new column, then plot its histogram.
dt_res4 <- mutate(data_house, log_SalePrice = log(SalePrice))
gghistogram(dt_res4, x = "log_SalePrice", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
Dilakukan transformasi data untuk melihat distribusi harga rumah dalam
dataset dengan melihat histogram dari data = data_house. Yang kedua
adanya transformasi data ke log_SalePrice yang memungkinkan kita untuk
mengurangi efek dari outlier dan membuat distribusi data lebih mendekati
normal sehingga lebih mudah dianalisis. Histogram pertama memiliki
skewness positif yang cukup tinggi, yang berarti distribusinya memiliki
ekor panjang di sisi kanan: sebagian kecil rumah berharga jauh lebih
tinggi daripada kebanyakan rumah lainnya. Histogram kedua digunakan untuk
melihat distribusi logaritma harga rumah dengan data = dt_res4 dan
hasilnya memiliki skewness yang lebih rendah dan menunjukkan distribusi
yang lebih simetris.
# Baseline histogram of SalePrice before any normalisation is applied.
gghistogram(data_house, x = "SalePrice", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
Pada bagian normalisasi data, dihasilkan histogram dari variabel “SalePrice” dalam dataset “data_house”.
# Z-score standardisation of SalePrice: (value - mean) / standard deviation.
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes, so the result is flattened with as.numeric() to
# store a plain numeric column in the data frame.
dt_res5 <- data_house %>%
  mutate(
    SalePrice_std = as.numeric(scale(SalePrice, center = TRUE, scale = TRUE))
  )
# Histogram of the standardised prices.
gghistogram(data = dt_res5, x = "SalePrice_std", fill = "steelblue") +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
Dilakukan normalisasi data dengan menggunakan Transformasi Z pada
variabel “SalePrice” dalam dataset “data_house”. Pertama, digunakan
fungsi “mutate” untuk menambahkan kolom baru dengan nama
“SalePrice_std”. Pada kolom tersebut, nilai SalePrice diubah menjadi
nilai Z-Score (nilai rata-rata dikurangi dari setiap nilai dalam
variabel dan hasilnya dibagi dengan standar deviasi dari variabel). Lalu
fungsi “scale” digunakan untuk melakukan transformasi Z-Score Scaling.
Kemudian dibuat histogram dari variabel “SalePrice_std” yang telah
dinormalisasi.
# Using min-max scaling
# Histogram of the original sale price, shown for comparison with the
# scaled version.
gghistogram(
  data = data_house,
  x = "SalePrice",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
# Rescale a numeric vector linearly so its minimum maps to 0 and its
# maximum maps to 1.
#
# Arguments:
#   x      numeric vector to rescale.
#   na.rm  if TRUE, missing values are ignored when computing the range
#          (they stay NA in the result). The default FALSE keeps the
#          original behaviour: any NA in x makes the whole result NA.
#
# Returns a numeric vector of the same length as x. A vector whose values
# are all equal is mapped to 0 instead of NaN (0 / 0), and an empty input
# returns an empty numeric vector without warnings.
minMax <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Zero range: avoid 0 / 0; x - min is 0 (NA entries remain NA).
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Add the min-max scaled sale price as SalePrice_mm, then plot its
# distribution.
dt_res6 <- data_house %>%
  mutate(SalePrice_mm = minMax(SalePrice))
gghistogram(
  data = dt_res6,
  x = "SalePrice_mm",
  fill = "steelblue"
) +
  scale_y_continuous(expand = c(0, 0))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
Dilakukan normalisasi data dengan menggunakan Min-Max Scaling. Yang pertama dihasilkan histogram dari variabel “SalePrice”. Selanjutnya fungsi “minMax” didefinisikan untuk menghitung nilai Min-Max Scaling dari setiap nilai dalam variabel. Setelah itu, fungsi “mutate” digunakan untuk membuat variabel baru “SalePrice_mm” yang merupakan hasil Min-Max Scaling dari variabel “SalePrice”. Terakhir, menggunakan variabel “SalePrice_mm” dibuatkan sebuah histogram yang menunjukkan bahwa rentang nilai SalePrice_mm adalah 0 hingga 1, dengan distribusi yang sama seperti SalePrice.
install.packages("tidyverse")
install.packages("DataExplorer")
install.packages("skimr")
library(tidyverse)
library(DataExplorer)
library(skimr)
# Load the house-price data (character columns are read as factors) and
# show a compact overview of its columns.
data_house <- read.csv(
  file = "C:/Users/afwa/Downloads/house_price1.csv",
  stringsAsFactors = TRUE
)
glimpse(data_house)
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1 <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2 <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd, …
## $ GarageType <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, N…
## $ MiscFeature <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, …
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
Didapatkan tampilan informasi singkat mengenai dataset house_price1 yang dimasukkan ke dalam variabel data_house.
# Overview plot of data_house, with smaller label text.
plot_intro(
  data = data_house,
  geom_label_args = list(size = 2.5)
)
# Per-column summary statistics without the inline histograms.
skim_without_charts(data = data_house)
| Name | data_house |
| Number of rows | 1460 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| factor | 43 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| MSZoning | 0 | 1.00 | FALSE | 5 | RL: 1151, RM: 218, FV: 65, RH: 16 |
| Street | 0 | 1.00 | FALSE | 2 | Pav: 1454, Grv: 6 |
| Alley | 1369 | 0.06 | FALSE | 2 | Grv: 50, Pav: 41 |
| LotShape | 0 | 1.00 | FALSE | 4 | Reg: 925, IR1: 484, IR2: 41, IR3: 10 |
| LandContour | 0 | 1.00 | FALSE | 4 | Lvl: 1311, Bnk: 63, HLS: 50, Low: 36 |
| Utilities | 0 | 1.00 | FALSE | 2 | All: 1459, NoS: 1 |
| LotConfig | 0 | 1.00 | FALSE | 5 | Ins: 1052, Cor: 263, Cul: 94, FR2: 47 |
| LandSlope | 0 | 1.00 | FALSE | 3 | Gtl: 1382, Mod: 65, Sev: 13 |
| Neighborhood | 0 | 1.00 | FALSE | 25 | NAm: 225, Col: 150, Old: 113, Edw: 100 |
| Condition1 | 0 | 1.00 | FALSE | 9 | Nor: 1260, Fee: 81, Art: 48, RRA: 26 |
| Condition2 | 0 | 1.00 | FALSE | 8 | Nor: 1445, Fee: 6, Art: 2, Pos: 2 |
| BldgType | 0 | 1.00 | FALSE | 5 | 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43 |
| HouseStyle | 0 | 1.00 | FALSE | 8 | 1St: 726, 2St: 445, 1.5: 154, SLv: 65 |
| RoofStyle | 0 | 1.00 | FALSE | 6 | Gab: 1141, Hip: 286, Fla: 13, Gam: 11 |
| RoofMatl | 0 | 1.00 | FALSE | 8 | Com: 1434, Tar: 11, WdS: 6, WdS: 5 |
| Exterior1st | 0 | 1.00 | FALSE | 15 | Vin: 515, HdB: 222, Met: 220, Wd : 206 |
| Exterior2nd | 0 | 1.00 | FALSE | 16 | Vin: 504, Met: 214, HdB: 207, Wd : 197 |
| MasVnrType | 8 | 0.99 | FALSE | 4 | Non: 864, Brk: 445, Sto: 128, Brk: 15 |
| ExterQual | 0 | 1.00 | FALSE | 4 | TA: 906, Gd: 488, Ex: 52, Fa: 14 |
| ExterCond | 0 | 1.00 | FALSE | 5 | TA: 1282, Gd: 146, Fa: 28, Ex: 3 |
| Foundation | 0 | 1.00 | FALSE | 6 | PCo: 647, CBl: 634, Brk: 146, Sla: 24 |
| BsmtQual | 37 | 0.97 | FALSE | 4 | TA: 649, Gd: 618, Ex: 121, Fa: 35 |
| BsmtCond | 37 | 0.97 | FALSE | 4 | TA: 1311, Gd: 65, Fa: 45, Po: 2 |
| BsmtExposure | 38 | 0.97 | FALSE | 4 | No: 953, Av: 221, Gd: 134, Mn: 114 |
| BsmtFinType1 | 37 | 0.97 | FALSE | 6 | Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148 |
| BsmtFinType2 | 38 | 0.97 | FALSE | 6 | Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33 |
| Heating | 0 | 1.00 | FALSE | 6 | Gas: 1428, Gas: 18, Gra: 7, Wal: 4 |
| HeatingQC | 0 | 1.00 | FALSE | 5 | Ex: 741, TA: 428, Gd: 241, Fa: 49 |
| CentralAir | 0 | 1.00 | FALSE | 2 | Y: 1365, N: 95 |
| Electrical | 1 | 1.00 | FALSE | 5 | SBr: 1334, Fus: 94, Fus: 27, Fus: 3 |
| KitchenQual | 0 | 1.00 | FALSE | 4 | TA: 735, Gd: 586, Ex: 100, Fa: 39 |
| Functional | 0 | 1.00 | FALSE | 7 | Typ: 1360, Min: 34, Min: 31, Mod: 15 |
| FireplaceQu | 690 | 0.53 | FALSE | 5 | Gd: 380, TA: 313, Fa: 33, Ex: 24 |
| GarageType | 81 | 0.94 | FALSE | 6 | Att: 870, Det: 387, Bui: 88, Bas: 19 |
| GarageFinish | 81 | 0.94 | FALSE | 3 | Unf: 605, RFn: 422, Fin: 352 |
| GarageQual | 81 | 0.94 | FALSE | 5 | TA: 1311, Fa: 48, Gd: 14, Ex: 3 |
| GarageCond | 81 | 0.94 | FALSE | 5 | TA: 1326, Fa: 35, Gd: 9, Po: 7 |
| PavedDrive | 0 | 1.00 | FALSE | 3 | Y: 1340, N: 90, P: 30 |
| PoolQC | 1453 | 0.00 | FALSE | 3 | Gd: 3, Ex: 2, Fa: 2 |
| Fence | 1179 | 0.19 | FALSE | 4 | MnP: 157, GdP: 59, GdW: 54, MnW: 11 |
| MiscFeature | 1406 | 0.04 | FALSE | 4 | She: 49, Gar: 2, Oth: 2, Ten: 1 |
| SaleType | 0 | 1.00 | FALSE | 9 | WD: 1267, New: 122, COD: 43, Con: 9 |
| SaleCondition | 0 | 1.00 | FALSE | 6 | Nor: 1198, Par: 125, Abn: 101, Fam: 20 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 |
| MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 |
| LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 |
| LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 |
| OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 |
| OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 |
| YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 |
| MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 |
| BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 |
| BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 |
| BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 |
| TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 |
| X1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 |
| X2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 |
| LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 |
| GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 |
| BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 |
| FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 |
| BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 |
| TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 |
| Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 |
| GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 |
| GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 |
| GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 |
| WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 |
| OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 |
| EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 |
| X3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 |
| ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 |
| PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 |
| MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 |
| MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 |
| YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 |
| SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 |
Fungsi plot_intro digunakan untuk memeriksa gambaran umum data pada variabel “data_house”. Hasilnya dapat diketahui bahwa terdapat 53.1% kolom diskrit dan 46.9% kolom kontinu, serta terdapat 5.9% observasi yang hilang/kosong. Selanjutnya melalui skim_without_charts didapatkan ringkasan statistik tanpa grafik untuk kolom factor dan numerik dari variabel “data_house”.
# Build the cleaned data set:
#   1. drop the row identifier Id;
#   2. turn missing values of five factor columns into an explicit "Ukn"
#      level so that those rows are not discarded;
#   3. remove every row that still contains an NA in any other column.
# fct_na_value_to_level() replaces fct_explicit_na(), which is deprecated
# since forcats 1.0.0.
data_house1 <- data_house %>%
  select(-Id) %>%
  mutate(
    across(
      c(Alley, FireplaceQu, PoolQC, Fence, MiscFeature),
      ~ forcats::fct_na_value_to_level(.x, level = "Ukn")
    )
  ) %>%
  na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Alley = forcats::fct_explicit_na(Alley, na_level = "Ukn")`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
Di atas merupakan sintaks untuk mengganti missing value, khususnya jika data berupa factor atau string. Kemudian na.omit berfungsi untuk menghapus semua baris yang masih mengandung missing value.
# Overview plot of the cleaned data set data_house1.
plot_intro(data = data_house1)
Kemudian fungsi plot_intro digunakan untuk memeriksa gambaran
umum data pada variabel “data_house1”. Hasilnya dapat diketahui bahwa
terdapat 53.8% kolom diskrit dan 46.2% kolom kontinu, serta terdapat 0%
kolom yang hilang/kosong, yang berarti seluruh data sudah lengkap.
# Per-column summary statistics of data_house1, without inline histograms.
skim_without_charts(data_house1)
| Name | data_house1 |
| Number of rows | 1094 |
| Number of columns | 80 |
| _______________________ | |
| Column type frequency: | |
| factor | 43 |
| numeric | 37 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| MSZoning | 0 | 1 | FALSE | 5 | RL: 850, RM: 173, FV: 54, RH: 9 |
| Street | 0 | 1 | FALSE | 2 | Pav: 1090, Grv: 4 |
| Alley | 0 | 1 | FALSE | 3 | Ukn: 1017, Grv: 41, Pav: 36 |
| LotShape | 0 | 1 | FALSE | 4 | Reg: 760, IR1: 301, IR2: 26, IR3: 7 |
| LandContour | 0 | 1 | FALSE | 4 | Lvl: 991, Bnk: 45, HLS: 44, Low: 14 |
| Utilities | 0 | 1 | FALSE | 1 | All: 1094, NoS: 0 |
| LotConfig | 0 | 1 | FALSE | 5 | Ins: 830, Cor: 187, Cul: 44, FR2: 29 |
| LandSlope | 0 | 1 | FALSE | 3 | Gtl: 1045, Mod: 44, Sev: 5 |
| Neighborhood | 0 | 1 | FALSE | 25 | NAm: 173, Col: 122, Old: 96, Som: 75 |
| Condition1 | 0 | 1 | FALSE | 9 | Nor: 950, Fee: 52, Art: 42, RRA: 24 |
| Condition2 | 0 | 1 | FALSE | 6 | Nor: 1082, Fee: 5, Art: 2, Pos: 2 |
| BldgType | 0 | 1 | FALSE | 5 | 1Fa: 925, Twn: 90, Twn: 35, Dup: 24 |
| HouseStyle | 0 | 1 | FALSE | 8 | 1St: 540, 2St: 346, 1.5: 117, SLv: 43 |
| RoofStyle | 0 | 1 | FALSE | 5 | Gab: 843, Hip: 230, Gam: 10, Man: 6 |
| RoofMatl | 0 | 1 | FALSE | 7 | Com: 1078, WdS: 6, Tar: 5, WdS: 2 |
| Exterior1st | 0 | 1 | FALSE | 14 | Vin: 421, Met: 172, HdB: 151, Wd : 149 |
| Exterior2nd | 0 | 1 | FALSE | 16 | Vin: 412, Met: 169, Wd : 145, HdB: 138 |
| MasVnrType | 0 | 1 | FALSE | 4 | Non: 639, Brk: 327, Sto: 119, Brk: 9 |
| ExterQual | 0 | 1 | FALSE | 4 | TA: 646, Gd: 395, Ex: 46, Fa: 7 |
| ExterCond | 0 | 1 | FALSE | 4 | TA: 973, Gd: 104, Fa: 15, Ex: 2 |
| Foundation | 0 | 1 | FALSE | 5 | PCo: 518, CBl: 446, Brk: 122, Sto: 6 |
| BsmtQual | 0 | 1 | FALSE | 4 | TA: 486, Gd: 463, Ex: 113, Fa: 32 |
| BsmtCond | 0 | 1 | FALSE | 4 | TA: 1006, Gd: 51, Fa: 36, Po: 1 |
| BsmtExposure | 0 | 1 | FALSE | 4 | No: 734, Av: 174, Gd: 97, Mn: 89 |
| BsmtFinType1 | 0 | 1 | FALSE | 6 | Unf: 343, GLQ: 323, ALQ: 162, BLQ: 105 |
| BsmtFinType2 | 0 | 1 | FALSE | 6 | Unf: 972, Rec: 37, LwQ: 35, BLQ: 25 |
| Heating | 0 | 1 | FALSE | 4 | Gas: 1075, Gas: 16, Gra: 2, Oth: 1 |
| HeatingQC | 0 | 1 | FALSE | 5 | Ex: 594, TA: 298, Gd: 174, Fa: 27 |
| CentralAir | 0 | 1 | FALSE | 2 | Y: 1036, N: 58 |
| Electrical | 0 | 1 | FALSE | 5 | SBr: 1009, Fus: 67, Fus: 15, Fus: 2 |
| KitchenQual | 0 | 1 | FALSE | 4 | TA: 528, Gd: 454, Ex: 91, Fa: 21 |
| Functional | 0 | 1 | FALSE | 6 | Typ: 1024, Min: 25, Min: 21, Maj: 10 |
| FireplaceQu | 0 | 1 | FALSE | 6 | Ukn: 511, Gd: 315, TA: 212, Fa: 24 |
| GarageType | 0 | 1 | FALSE | 6 | Att: 680, Det: 325, Bui: 63, Bas: 15 |
| GarageFinish | 0 | 1 | FALSE | 3 | Unf: 485, RFn: 333, Fin: 276 |
| GarageQual | 0 | 1 | FALSE | 5 | TA: 1031, Fa: 46, Gd: 11, Ex: 3 |
| GarageCond | 0 | 1 | FALSE | 5 | TA: 1050, Fa: 31, Po: 6, Gd: 5 |
| PavedDrive | 0 | 1 | FALSE | 3 | Y: 1023, N: 48, P: 23 |
| PoolQC | 0 | 1 | FALSE | 4 | Ukn: 1088, Ex: 2, Fa: 2, Gd: 2 |
| Fence | 0 | 1 | FALSE | 5 | Ukn: 882, MnP: 117, GdP: 46, GdW: 39 |
| MiscFeature | 0 | 1 | FALSE | 4 | Ukn: 1059, She: 33, Oth: 1, Ten: 1 |
| SaleType | 0 | 1 | FALSE | 9 | WD: 928, New: 116, COD: 31, Con: 5 |
| SaleCondition | 0 | 1 | FALSE | 6 | Nor: 880, Par: 119, Abn: 70, Fam: 18 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| MSSubClass | 0 | 1 | 56.13 | 41.98 | 20 | 20.00 | 50.0 | 70.00 | 190 |
| LotFrontage | 0 | 1 | 70.76 | 24.51 | 21 | 60.00 | 70.0 | 80.00 | 313 |
| LotArea | 0 | 1 | 10132.35 | 8212.25 | 1300 | 7606.75 | 9444.5 | 11387.25 | 215245 |
| OverallQual | 0 | 1 | 6.25 | 1.37 | 2 | 5.00 | 6.0 | 7.00 | 10 |
| OverallCond | 0 | 1 | 5.58 | 1.07 | 2 | 5.00 | 5.0 | 6.00 | 9 |
| YearBuilt | 0 | 1 | 1972.41 | 31.19 | 1880 | 1953.00 | 1975.0 | 2003.00 | 2010 |
| YearRemodAdd | 0 | 1 | 1985.92 | 20.93 | 1950 | 1967.00 | 1995.0 | 2005.00 | 2010 |
| MasVnrArea | 0 | 1 | 109.86 | 190.67 | 0 | 0.00 | 0.0 | 171.75 | 1600 |
| BsmtFinSF1 | 0 | 1 | 448.19 | 468.73 | 0 | 0.00 | 384.5 | 712.75 | 5644 |
| BsmtFinSF2 | 0 | 1 | 45.25 | 159.08 | 0 | 0.00 | 0.0 | 0.00 | 1474 |
| BsmtUnfSF | 0 | 1 | 606.12 | 445.83 | 0 | 270.00 | 525.0 | 846.00 | 2336 |
| TotalBsmtSF | 0 | 1 | 1099.56 | 415.85 | 105 | 816.00 | 1023.0 | 1345.50 | 6110 |
| X1stFlrSF | 0 | 1 | 1173.81 | 387.68 | 438 | 894.00 | 1097.0 | 1413.50 | 4692 |
| X2ndFlrSF | 0 | 1 | 356.54 | 439.26 | 0 | 0.00 | 0.0 | 729.00 | 2065 |
| LowQualFinSF | 0 | 1 | 4.68 | 42.10 | 0 | 0.00 | 0.0 | 0.00 | 572 |
| GrLivArea | 0 | 1 | 1535.03 | 526.12 | 438 | 1164.00 | 1480.0 | 1779.00 | 5642 |
| BsmtFullBath | 0 | 1 | 0.42 | 0.51 | 0 | 0.00 | 0.0 | 1.00 | 2 |
| BsmtHalfBath | 0 | 1 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 |
| FullBath | 0 | 1 | 1.58 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 |
| HalfBath | 0 | 1 | 0.39 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 |
| BedroomAbvGr | 0 | 1 | 2.86 | 0.76 | 0 | 2.00 | 3.0 | 3.00 | 6 |
| KitchenAbvGr | 0 | 1 | 1.03 | 0.19 | 1 | 1.00 | 1.0 | 1.00 | 3 |
| TotRmsAbvGrd | 0 | 1 | 6.57 | 1.58 | 3 | 5.00 | 6.0 | 7.00 | 12 |
| Fireplaces | 0 | 1 | 0.61 | 0.63 | 0 | 0.00 | 1.0 | 1.00 | 3 |
| GarageYrBlt | 0 | 1 | 1978.57 | 25.93 | 1900 | 1960.00 | 1982.0 | 2003.00 | 2010 |
| GarageCars | 0 | 1 | 1.88 | 0.66 | 1 | 1.00 | 2.0 | 2.00 | 4 |
| GarageArea | 0 | 1 | 503.76 | 192.26 | 160 | 360.00 | 484.0 | 602.50 | 1418 |
| WoodDeckSF | 0 | 1 | 94.34 | 122.62 | 0 | 0.00 | 0.0 | 169.75 | 857 |
| OpenPorchSF | 0 | 1 | 46.95 | 64.82 | 0 | 0.00 | 28.0 | 68.00 | 547 |
| EnclosedPorch | 0 | 1 | 22.05 | 61.57 | 0 | 0.00 | 0.0 | 0.00 | 552 |
| X3SsnPorch | 0 | 1 | 3.27 | 29.66 | 0 | 0.00 | 0.0 | 0.00 | 508 |
| ScreenPorch | 0 | 1 | 16.50 | 58.46 | 0 | 0.00 | 0.0 | 0.00 | 480 |
| PoolArea | 0 | 1 | 3.01 | 40.71 | 0 | 0.00 | 0.0 | 0.00 | 648 |
| MiscVal | 0 | 1 | 23.55 | 167.14 | 0 | 0.00 | 0.0 | 0.00 | 2500 |
| MoSold | 0 | 1 | 6.34 | 2.69 | 1 | 5.00 | 6.0 | 8.00 | 12 |
| YrSold | 0 | 1 | 2007.79 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 |
| SalePrice | 0 | 1 | 187033.26 | 83165.33 | 35311 | 132500.00 | 165750.0 | 221000.00 | 755000 |
Melalui skim_without_charts didapatkan ringkasan statistik tanpa grafik untuk kolom factor dan numerik dari variabel “data_house1”.
# Remove the Utilities column from data_house1.
data_house1 <- select(data_house1, -Utilities)
Setelah dilihat kembali ternyata ada kolom yang hanya memiliki satu kategori saja yaitu kolom Utilities. Sehingga dilakukan penghapusan.
# Histograms of data_house1, laid out on 3 x 3 pages.
plot_histogram(
  data = data_house1,
  geom_histogram_args = list(fill = "steelblue"),
  ggtheme = theme_bw(),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Membuat histogram dari data rumah dalam variabel “data_house1”.
Histogram akan dibuat dalam bentuk grid dengan 3 baris dan 3 kolom. Dan
hasil histogram dapat digunakan untuk melihat pola distribusi data,
seperti bentuk kurva, jumlah data yang terkonsentrasi di sekitar nilai
tertentu, dan data yang tersebar di seluruh jangkauan nilai.
# Bar charts of data_house1, one row of panels per page.
plot_bar(
  data = data_house1,
  ggtheme = theme_bw(),
  nrow = 1
)
Selanjutnya dibuat bar chart dari data rumah dalam variabel
“data_house1” yang akan ditampilkan dalam satu baris. Dari bar chart
tersebut, kita dapat melihat perbedaan jumlah atau proporsi antara
kategori yang berbeda.
# Scatterplots of every numeric column against SalePrice.
# select(where(is.numeric)) replaces the superseded select_if() and matches
# the where() selection style used elsewhere in this script.
plot_scatterplot(
  data = data_house1 %>% select(where(is.numeric)),
  by = "SalePrice",
  geom_point_args = list(color = "steelblue"),
  ggtheme = theme_bw()
)
Selanjutnya membuat scatterplot dari data rumah yang hanya memilih kolom
numerik dan mengelompokkannya berdasarkan kolom “SalePrice”. Scatterplot
akan menunjukkan hubungan antara dua variabel numerik dalam bentuk
titik-titik dan kita dapat melihat hubungan antara dua variabel
numerik.
# Spearman correlation matrix of the numeric columns.
# select(where(is.numeric)) replaces the superseded select_if().
cor_mat <- data_house1 %>%
  select(where(is.numeric)) %>%
  cor(method = "spearman")
# Blank out the diagonal and upper triangle so each pair is kept once.
cor_mat[upper.tri(cor_mat, diag = TRUE)] <- NA
# Reshape to one row per variable pair (Var1, Var2, corr); the blanked
# entries are removed by na.omit().
cor_df <- cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(
    cols = -Var1,
    names_to = "Var2",
    values_to = "corr"
  ) %>%
  na.omit()
# Pairs with absolute correlation above 0.6, strongest first.
cor_df %>%
  filter(abs(corr) > 0.6) %>%
  arrange(desc(abs(corr)))
## # A tibble: 31 × 3
## Var1 Var2 corr
## <chr> <chr> <dbl>
## 1 GarageYrBlt YearBuilt 0.895
## 2 X1stFlrSF TotalBsmtSF 0.877
## 3 GarageArea GarageCars 0.841
## 4 TotRmsAbvGrd GrLivArea 0.829
## 5 SalePrice OverallQual 0.823
## 6 GarageYrBlt YearRemodAdd 0.747
## 7 YearRemodAdd YearBuilt 0.738
## 8 SalePrice GrLivArea 0.731
## 9 SalePrice GarageCars 0.681
## 10 SalePrice FullBath 0.671
## # … with 21 more rows
# Remaining pairs: absolute correlation of at most 0.6.
filter(cor_df, abs(corr) <= 0.6)
## # A tibble: 635 × 3
## Var1 Var2 corr
## <chr> <chr> <dbl>
## 1 LotFrontage MSSubClass -0.313
## 2 LotArea MSSubClass -0.255
## 3 OverallQual MSSubClass 0.0992
## 4 OverallQual LotFrontage 0.238
## 5 OverallQual LotArea 0.283
## 6 OverallCond MSSubClass -0.0763
## 7 OverallCond LotFrontage -0.0693
## 8 OverallCond LotArea -0.0873
## 9 OverallCond OverallQual -0.264
## 10 YearBuilt MSSubClass -0.00468
## # … with 625 more rows
# Names of all factor columns of data_house1, followed by SalePrice.
cat_var_names <- names(
  select(data_house1, where(is.factor), SalePrice)
)
cat_var_names
## [1] "MSZoning" "Street" "Alley" "LotShape"
## [5] "LandContour" "LotConfig" "LandSlope" "Neighborhood"
## [9] "Condition1" "Condition2" "BldgType" "HouseStyle"
## [13] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
## [17] "MasVnrType" "ExterQual" "ExterCond" "Foundation"
## [21] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1"
## [25] "BsmtFinType2" "Heating" "HeatingQC" "CentralAir"
## [29] "Electrical" "KitchenQual" "Functional" "FireplaceQu"
## [33] "GarageType" "GarageFinish" "GarageQual" "GarageCond"
## [37] "PavedDrive" "PoolQC" "Fence" "MiscFeature"
## [41] "SaleType" "SaleCondition" "SalePrice"
Kode di atas digunakan untuk membuat matriks korelasi Spearman antar variabel numerik dalam data “data_house1” dan mengekstrak pasangan variabel dengan nilai absolut koefisien korelasi lebih besar dari 0,6 (serta pasangan sisanya yang tidak melebihi 0,6) dalam bentuk data frame. Kemudian, kode juga mencari nama variabel kategorikal dalam data “data_house1” (ditambah “SalePrice”) yang kemudian disimpan dalam variabel “cat_var_names”.
# Draw one SalePrice boxplot per categorical variable.
# The factor columns plus SalePrice are selected once, outside the loop,
# because the subset does not change between iterations.
boxplot_data <- data_house1 %>%
  select(where(is.factor), SalePrice)
# Exclude SalePrice by name rather than by a hard-coded position, so the
# loop stays correct if the number of factor columns changes.
for (cat_var in setdiff(cat_var_names, "SalePrice")) {
  plot_boxplot(
    data = boxplot_data,
    by = cat_var,
    geom_boxplot_args = list(fill = "steelblue"),
    ggtheme = theme_bw()
  )
}
Ditampilkan boxplot untuk setiap variabel kategorikal dalam data “data_house1” (semua elemen “cat_var_names” kecuali “SalePrice”). Setiap boxplot menunjukkan distribusi harga jual (SalePrice) untuk setiap nilai kategori dalam variabel yang diberikan. Analisis hasilnya bergantung pada variabel kategorikal yang digunakan. Kita dapat melihat distribusi harga jual di antara setiap kelompok dalam variabel kategorikal.
install.packages("rsample")
install.packages("DataExplorer")
install.packages("sjPlot")
install.packages("openxlsx")
install.packages("lmtest")
install.packages("fBasics")
install.packages("mlr3measures")
library(rsample)
library(DataExplorer)
library(sjPlot)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(openxlsx)
# Load the insurance data (character columns are read as factors) and
# show its first rows.
data_insurance <- read.csv(
  file = "C:/Users/afwa/Downloads/insurance.csv",
  stringsAsFactors = TRUE
)
head(data_insurance)
## age sex bmi children smoker region expenses
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
# 1. Inspect the data distribution
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Empat histogram di atas menampilkan frekuensi dari empat variabel data age, bmi, children dan expenses. Dapat dilihat frekuensi variabel dengan memperhatikan angka-angka pada histogram.
# Response transformation
# The expenses column is overwritten in place with its natural logarithm,
# so running this line a second time would apply the log twice.
data_insurance[["expenses"]] <- log(data_insurance[["expenses"]])
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram di atas menampilkan histogram dari variabel expenses yang telah
diubah skalanya dengan transformasi logaritma. Transformasi data dilakukan
untuk tujuan mengubah skala pengukuran data asli menjadi bentuk lain
sehingga dapat memenuhi syarat regresi linier.
# Distribution of the categorical variables
plot_bar(
  data = data_insurance,
  nrow = 3,
  ncol = 3
)
Plot bar di atas menampilkan frekuensi sebaran sex, smoker dan region
dari data. Plot ini menampilkan jumlah dari data pada setiap
sebaran.
# 2. Inspect the relationship between variables
# Scatterplots of age, bmi and children against expenses.
plot_scatterplot(
  data = data_insurance[, c("expenses", "age", "bmi", "children")],
  by = "expenses",
  geom_point_args = list(color = "steelblue")
)
Selanjutnya dilakukan pemeriksaan korelasi antara variabel. Secara
sederhana, uji ini adalah uji linearitas yaitu pengujian untuk memeriksa
apakah terdapat hubungan antara variabel independen dengan variabel
dependen. Hal ini dapat dilihat dari sebaran data, jika sebaran data
terlihat berhubungan satu sama lain maka data tersebut linear, sedangkan
jika sebaran data tersebut tak terarah maka dapat disebut data tersebut
tidak linear. Terlihat dari ketiga variabel bebas yang diuji pada
variabel terikat di atas, bahwa sebaran datanya berkorelasi satu sama
lain sehingga dapat disebut sebagai data yang linear. Karena uji
tersebut telah berhasil, maka dapat dilakukan uji regresi linear
berganda.
# Multiple linear regression of expenses on all other columns of
# data_insurance, followed by the model summary.
regresi <- lm(
  formula = expenses ~ .,
  data = data_insurance
)
summary(regresi)
##
## Call:
## lm(formula = expenses ~ ., data = data_insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07186 -0.19835 -0.04917 0.06598 2.16636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0305581 0.0723960 97.112 < 2e-16 ***
## age 0.0345816 0.0008721 39.655 < 2e-16 ***
## sexmale -0.0754164 0.0244012 -3.091 0.002038 **
## bmi 0.0133748 0.0020960 6.381 2.42e-10 ***
## children 0.1018568 0.0100995 10.085 < 2e-16 ***
## smokeryes 1.5543228 0.0302795 51.333 < 2e-16 ***
## regionnorthwest -0.0637876 0.0349057 -1.827 0.067860 .
## regionsoutheast -0.1571967 0.0350828 -4.481 8.08e-06 ***
## regionsouthwest -0.1289522 0.0350271 -3.681 0.000241 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.7666
## F-statistic: 549.8 on 8 and 1329 DF, p-value: < 2.2e-16
# Coefficient plot: estimates sorted and shown on the exponentiated scale.
plot_model(
  regresi,
  type = "est",
  sort.est = TRUE,
  transform = "exp"
)
# Predicted-value plots, one per model term.
plot_model(
  model = regresi,
  type = "pred"
)
## $age
##
## $sex
##
## $bmi
##
## $children
##
## $smoker
##
## $region
## Model Checking
# Diagnostic plots for the fitted model.
plot_model(
  regresi,
  type = "diag"
)
## [[1]]
##
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
# Extract the model residuals for the assumption checks that follow.
res <- resid(regresi)
# Normality test (Shapiro-Wilk)
shapiro.test(res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.8373, p-value < 2.2e-16
# Jarque-Bera normality test on the residuals.
fBasics::jarqueberaTest(res)
##
## Title:
## Jarque - Bera Normalality Test
##
## Test Results:
## STATISTIC:
## X-squared: 1673.7604
## P VALUE:
## Asymptotic p Value: < 2.2e-16
##
## Description:
## Fri Feb 24 21:27:53 2023 by user: afwa
# Kolmogorov-Smirnov normality test on the residuals.
# NOTE(review): the warnings emitted by this call show ks.test(x, "pnorm"),
# i.e. a comparison against pnorm with its default parameters, while `res`
# is not standardized beforehand -- confirm that this is the intended
# reference distribution.
fBasics::ksnormTest(res)
## Warning in ks.test.default(x, "pnorm", alternative = "two.sided"): ties should
## not be present for the Kolmogorov-Smirnov test
## Warning in ks.test.default(x, "pnorm", alternative = "less"): ties should not
## be present for the Kolmogorov-Smirnov test
## Warning in ks.test.default(x, "pnorm", alternative = "greater"): ties should
## not be present for the Kolmogorov-Smirnov test
##
## Title:
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## Test Results:
## STATISTIC:
## D: 0.2831
## P VALUE:
## Alternative Two-Sided: < 2.2e-16
## Alternative Less: < 2.2e-16
## Alternative Greater: < 2.2e-16
##
## Description:
## Fri Feb 24 21:27:53 2023 by user: afwa
# Anderson-Darling normality test on the residuals.
print(fBasics::adTest(res))
##
## Title:
## Anderson - Darling Normality Test
##
## Test Results:
## STATISTIC:
## A: 74.8074
## P VALUE:
## < 2.2e-16
##
## Description:
## Fri Feb 24 21:27:53 2023 by user: afwa
# Homogeneity-of-variance test (Breusch-Pagan, non-studentized).
# FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
lmtest::bptest(
  expenses ~ .,
  data = data_insurance,
  studentize = FALSE
)
##
## Breusch-Pagan test
##
## data: expenses ~ .
## BP = 243.98, df = 8, p-value < 2.2e-16
# Linear-regression prediction
# Split the data into training (80%) and testing sets; the seed makes the
# split reproducible.
set.seed(123)
data_split <- initial_split(
  data = data_insurance,
  prop = 0.8
)
train1 <- training(data_split)
test1 <- testing(data_split)
# Refit the model on the training set only.
regresi2 <- lm(
  formula = expenses ~ .,
  data = train1
)
# Predict the testing set
prediksi <- predict(
  regresi2,
  newdata = test1
)
head(prediksi)
## 14 15 21 22 27 33
## 9.347747 9.893421 9.580387 8.495743 9.490087 8.474506
# Evaluate the predictions
# RMSE (expenses was log-transformed above, so this is on the log scale)
mlr3measures::rmse(
  truth = test1$expenses,
  response = prediksi
)
## [1] 0.4415859
# MAPE (mean absolute percentage error)
mlr3measures::mape(
  truth = test1$expenses,
  response = prediksi
)
## [1] 0.03175772
# Spearman correlation between predicted and actual values
mlr3measures::srho(
  truth = test1$expenses,
  response = prediksi
)
## [1] 0.9039972
Regresi linear berganda yang telah dilakukan di atas dapat dilihat kesimpulannya dengan melihat Spearman correlation-nya. Hasil yang didapat adalah 0.9039972, yang mendekati 1. Nilai ini adalah korelasi antara nilai prediksi dan nilai aktual expenses pada data testing; semakin mendekati 1, semakin sesuai urutan nilai prediksi dengan nilai aktualnya.