0. Read Data

# Load the raw house data; the relative path is resolved against the current
# working directory.
data <- read.csv(file = "kc_house_data.csv")

1. Explorasi Data

str(data)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7129300520 6414100192 5631500400 2487200875 1954400510 ...
##  $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
range(data$bedrooms)
## [1]  0 33
range(data$bathrooms)
## [1] 0 8
range(data$floors)
## [1] 1.0 3.5
range(data$condition)
## [1] 1 5
range(data$grade)
## [1]  1 13

2. Seleksi Data Berdasarkan Bisnis + Correlation

price: harga; m2_living: luas rumah dalam m2; m2_lot: luas tanah dalam m2; bedrooms: jumlah kamar tidur; bathrooms: jumlah kamar mandi; floors: jumlah lantai; condition: kondisi rumah (1 = buruk sampai 5 = baru); view: skor kualitas pemandangan (0 = tidak ada, makin tinggi makin bagus); waterfront: apakah rumah menghadap tepi perairan (0 = tidak, 1 = ya); grade: grade rumah (1 sampai 13)

# Keep the business-relevant columns and express both areas in square metres.
# 1 ft = 0.3048 m, so 1 sq ft = 0.3048^2 = 0.09290304 m2. Multiplying an area
# by 0.3048 (the length factor) overstates it by roughly 3.28x.
# NOTE: rescaling a predictor only rescales that predictor's own coefficient
# and standard error, so m2_living / m2_lot estimates produced with the old
# factor must be regenerated; R-squared, residuals and the remaining
# coefficients do not change.
data_selected <- data %>%
  select(price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, condition,
         waterfront, view, grade) %>%
  mutate(
    # as.integer() truncates toward zero (e.g. 2.25 bathrooms -> 2,
    # 1.5 floors -> 1), so fractional counts are dropped on purpose here.
    bathrooms = as.integer(bathrooms),
    floors = as.integer(floors),
    m2_living = 0.3048^2 * sqft_living,
    m2_lot = 0.3048^2 * sqft_lot
  ) %>%
  select(price, m2_living, m2_lot, bedrooms, bathrooms, floors, condition,
         waterfront, view, grade)

Korelasi data

# Correlation heat map of all selected columns, with the coefficient printed
# in each cell. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
ggcorr(data_selected, label = TRUE)

Outlier Check

boxplot(data_selected$bedrooms)

boxplot(data_selected$bathrooms)

boxplot(data_selected$m2_living)

boxplot(data_selected$m2_lot)

boxplot(data_selected$floors)

boxplot(data_selected$condition)

boxplot(data_selected$waterfront)

boxplot(data_selected$view)

3. Modelling

Model Full

# Full model: price regressed on every other column of data_selected.
model_full <- lm(formula = price ~ ., data = data_selected)
summary(model_full)
## 
## Call:
## lm(formula = price ~ ., data = data_selected)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1231443  -125596   -17532    94899  4558095 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept) -665307.8183   17191.3671 -38.700 <0.0000000000000002 ***
## m2_living       594.5217      11.0632  53.739 <0.0000000000000002 ***
## m2_lot           -1.1718       0.1267  -9.247 <0.0000000000000002 ***
## bedrooms     -38380.0003    2134.0597 -17.985 <0.0000000000000002 ***
## bathrooms     29075.7846    3221.7008   9.025 <0.0000000000000002 ***
## floors       -47120.4673    3530.2883 -13.347 <0.0000000000000002 ***
## condition     51524.9237    2530.6973  20.360 <0.0000000000000002 ***
## waterfront   581878.1782   19776.3159  29.423 <0.0000000000000002 ***
## view          61576.6924    2341.1371  26.302 <0.0000000000000002 ***
## grade        102659.4243    2223.5880  46.168 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 230000 on 21603 degrees of freedom
## Multiple R-squared:  0.6078, Adjusted R-squared:  0.6077 
## F-statistic:  3720 on 9 and 21603 DF,  p-value: < 0.00000000000000022

Interpretasi koefisien: - Price = -665307.8183 + (594.5217 * m2_living) + (-1.1718 * m2_lot) + (-38380.0003 * bedrooms) + (29075.7846 * bathrooms) + (-47120.4673 * floors) + (51524.9237 * condition) + (581878.1782 * waterfront) + (61576.6924 * view) + (102659.4243 * grade)

Adj R-Square: 0.6077 (model menjelaskan sekitar 61% variasi price)

Model None

# Intercept-only baseline: no predictors, so every row gets the same fitted
# value (the mean of price).
model_none <- lm(formula = price ~ 1, data = data_selected)
summary(model_none)
## 
## Call:
## lm(formula = price ~ 1, data = data_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465088 -218138  -90088  104912 7159912 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   540088       2497   216.3 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 367100 on 21612 degrees of freedom

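Adj R-Square: 0 (model tanpa prediktor hanya memprediksi rata-rata price, sehingga tidak ada variasi yang dijelaskan; summary() tidak menampilkan R-squared untuk model ini)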

4. Feature Selection

# Backward stepwise selection starting from the full model; step() compares
# candidate deletions by AIC and stops when no deletion lowers it.
model_backward <- step(object = model_full, direction = "backward")
## Start:  AIC=533662.3
## price ~ m2_living + m2_lot + bedrooms + bathrooms + floors + 
##     condition + waterfront + view + grade
## 
##              Df       Sum of Sq              RSS    AIC
## <none>                          1142336728298222 533662
## - bathrooms   1   4306977080413 1146643705378635 533742
## - m2_lot      1   4521106021818 1146857834320040 533746
## - floors      1   9420607646769 1151757335944991 533838
## - bedrooms    1  17103181270161 1159439909568383 533981
## - condition   1  21919689506038 1164256417804259 534071
## - view        1  36581294998829 1178918023297051 534342
## - waterfront  1  45777643537833 1188114371836054 534510
## - grade       1 112711720747852 1255048449046074 535694
## - m2_living   1 152705668498510 1295042396796732 536372

5. Error

# In-sample predictions: the model is scored on the same rows it was fitted
# on, so the error metrics computed from `pred` describe training error.
pred <- predict(object = model_backward, newdata = data_selected)

RMSE (Root Mean Square Error)

RMSE(pred, data_selected$price)
## [1] 229900.3

MAE (Mean Absolute Error)

MAE(pred, data_selected$price)
## [1] 152322.6
range(data_selected$price)
## [1]   75000 7700000

MAPE (Mean Absolute Percentage Error)

MAPE(pred, data_selected$price)
## [1] 0.3192653

6. Asumsi

1. Residual Menyebar Normal (Normality)

H0: Residual menyebar normal; H1: Residual tidak menyebar normal

jika p-value < alpha (0.05) maka tolak h0

residual dinyatakan normal ketika p-value > 0.05 (asumsi terpenuhi)

Histogram

hist(model_backward$residuals, breaks=100)

Shapiro Test

Tidak bisa karena sample size lebih dari 5000

#shapiro.test(model_backward$residuals) # Tidak bisa karena sample size 3 sampe 5000

Lillie Test

lillie.test(model_backward$residuals)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  model_backward$residuals
## D = 0.093559, p-value < 0.00000000000000022

QQ Plot

plot(model_backward, which=2)

2. Residual Tidak Berpola (Homoscedasticity)

h0: Model Homoscedasticity h1: Model Heteroscedasticity

Model dinyatakan homoscedastic bila p-value > alpha (0.05); semakin mendekati 1 semakin bagus.

# Residuals-vs-fitted scatter for a visual homoscedasticity check.
# The x axis is the fitted value, not the observed price: since
# residual = price - fitted, OLS residuals are positively correlated with the
# observed response by construction, which shows up as a trend even when the
# model is adequate. Against fitted values they should scatter evenly around 0.
plot(fitted(model_backward), resid(model_backward),
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")

bptest(model_backward)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_backward
## BP = 2560.5, df = 9, p-value < 0.00000000000000022

3. Tiap X tidak memiliki hubungan (Multicollinearity)

nilai vif yang bagus di bawah 10

vif(model_backward)
##  m2_living     m2_lot   bedrooms  bathrooms     floors  condition 
##   3.920194   1.046210   1.610096   2.290920   1.551479   1.108446 
## waterfront       view      grade 
##   1.196494   1.315486   2.792147

4. Linearity

h0: korelasi = 0 h1: korelasi != 0

membandingkan 2 variabel untuk menguji apakah kedua variable tersebut memiliki korelasi linear

# Correlation test of every column of `data` against one target column.
#
# Runs cor.test() with its default settings between data[[target]] and each
# column of `data` (the target column itself included) and collects the
# p-values.
#
# Arguments:
#   data   - data frame (or tibble) whose columns are all numeric.
#   target - name of the column every column is tested against.
#
# Returns a data frame with one row per column of `data`:
#   x       - column name,
#   y       - the target name,
#   p_value - p-value reported by cor.test().
# A data frame with zero columns yields a zero-row result. A `target` that is
# not a column of a non-empty `data` raises an error.
cor.test.all <- function(data, target) {
  columns <- names(data)

  # seq-style iteration over an empty set must do nothing; `1:length(x)` would
  # iterate over c(1, 0) and fail, so the empty case is handled up front.
  if (length(columns) == 0L) {
    return(data.frame(x = character(0),
                      y = character(0),
                      p_value = numeric(0),
                      stringsAsFactors = FALSE))
  }

  if (!target %in% columns) {
    stop("target column not found in data: ", target, call. = FALSE)
  }

  # `[[` always yields a plain vector, for data frames and tibbles alike.
  reference <- data[[target]]

  # vapply builds the whole p-value vector at once (no rbind inside a loop)
  # and guarantees one numeric value per column.
  p_values <- vapply(
    columns,
    function(column) cor.test(reference, data[[column]])$p.value,
    numeric(1),
    USE.NAMES = FALSE
  )

  data.frame(x = columns,
             y = target,
             p_value = p_values,
             stringsAsFactors = FALSE)
}
p_value <- cor.test.all(data_selected, "price")
p_value
##             x     y
## 1       price price
## 2   m2_living price
## 3      m2_lot price
## 4    bedrooms price
## 5   bathrooms price
## 6      floors price
## 7   condition price
## 8  waterfront price
## 9        view price
## 10      grade price
##                                                                                                                                                                                                                                                                                       p_value
## 1  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 2  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 3  0.0000000000000000000000000000000000000007972504510431172068015271378975312839109888680414442704441488599349726832457853352001369166562990060172916306768797767290379852056503295898437500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 4  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 5  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 6  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003812102
## 7  0.0000000893565406245876391372784559863351461217462201602756977081298828125000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 8  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 9  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 10 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

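p-value kurang dari alpha (0.05) untuk semua variabel, sehingga H0 ditolak: setiap prediktor memiliki korelasi linear yang signifikan dengan price.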

7. Kesimpulan