# Read the house data CSV into a data frame, then print a compact overview
# of its columns and their types.
data <- read.csv(file = "kc_house_data.csv")
str(object = data)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7129300520 6414100192 5631500400 2487200875 1954400510 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
# Check the minimum and maximum of the count / rating columns before
# modelling.
range(data[["bedrooms"]])
## [1] 0 33
range(data[["bathrooms"]])
## [1] 0 8
range(data[["floors"]])
## [1] 1.0 3.5
range(data[["condition"]])
## [1] 1 5
range(data[["grade"]])
## [1] 1 13
Keterangan variabel:
- price: harga rumah
- m2_living: luas rumah dalam m2
- m2_lot: luas tanah dalam m2
- bedrooms: jumlah kamar tidur
- bathrooms: jumlah kamar mandi
- floors: jumlah lantai
- condition: kondisi rumah, skala 1 (buruk) sampai 5 (baru)
- view: skor kualitas pemandangan dari rumah (0 - 4)
- waterfront: apakah rumah menghadap ke perairan (1 = ya, 0 = tidak)
- grade: grade rumah
# Build the modelling data set: price plus nine predictors.
#
# - bathrooms and floors are truncated to whole numbers by as.integer()
#   (e.g. 2.25 bathrooms becomes 2, 3.5 floors becomes 3).
# - Areas are converted from square feet to square metres. One foot is
#   0.3048 m, so one square foot is 0.3048^2 m2; multiplying an area by the
#   linear factor 0.3048 overstates it by a factor of about 3.28.
#
# NOTE: model output that was knitted with the old linear factor shows
# coefficients for m2_living and m2_lot on that old scale; they change by a
# constant factor when the analysis is re-run. Fit statistics (R-squared,
# t values, p-values) are unaffected by the rescaling.
data_selected <- data %>%
  mutate(
    bathrooms = as.integer(bathrooms),
    floors = as.integer(floors),
    m2_living = sqft_living * 0.3048^2,
    m2_lot = sqft_lot * 0.3048^2
  ) %>%
  select(
    price, m2_living, m2_lot, bedrooms, bathrooms,
    floors, condition, waterfront, view, grade
  )
# Correlation plot of every selected variable with the coefficients printed.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
ggcorr(data_selected, label = TRUE)

# One box plot per predictor to look for outliers. Each plot is titled with
# its column name so the figures can be told apart.
for (column in c("bedrooms", "bathrooms", "m2_living", "m2_lot",
                 "floors", "condition", "waterfront", "view")) {
  boxplot(data_selected[[column]], main = column)
}
# Fit a linear model with price as the response and every other column of
# data_selected as a predictor, then report the fit.
model_full <- lm(formula = price ~ ., data = data_selected)
summary(model_full)
##
## Call:
## lm(formula = price ~ ., data = data_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1231443 -125596 -17532 94899 4558095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -665307.8183 17191.3671 -38.700 <0.0000000000000002 ***
## m2_living 594.5217 11.0632 53.739 <0.0000000000000002 ***
## m2_lot -1.1718 0.1267 -9.247 <0.0000000000000002 ***
## bedrooms -38380.0003 2134.0597 -17.985 <0.0000000000000002 ***
## bathrooms 29075.7846 3221.7008 9.025 <0.0000000000000002 ***
## floors -47120.4673 3530.2883 -13.347 <0.0000000000000002 ***
## condition 51524.9237 2530.6973 20.360 <0.0000000000000002 ***
## waterfront 581878.1782 19776.3159 29.423 <0.0000000000000002 ***
## view 61576.6924 2341.1371 26.302 <0.0000000000000002 ***
## grade 102659.4243 2223.5880 46.168 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 230000 on 21603 degrees of freedom
## Multiple R-squared: 0.6078, Adjusted R-squared: 0.6077
## F-statistic: 3720 on 9 and 21603 DF, p-value: < 0.00000000000000022
Interpretasi koefisien: - Price = -665307.8183 + (594.5217 * m2_living) + (-1.1718 * m2_lot) + (-38380.0003 * bedrooms) + (29075.7846 * bathrooms) + (-47120.4673 * floors) + (51524.9237 * condition) + (581878.1782 * waterfront) + (61576.6924 * view) + (102659.4243 * grade)
Adj R-Square: 0.6077, artinya sekitar 61% variasi harga dapat dijelaskan oleh model (semakin mendekati 1 semakin baik).
# Intercept-only model: no predictors, only a constant term for price.
model_none <- lm(formula = price ~ 1, data = data_selected)
summary(model_none)
##
## Call:
## lm(formula = price ~ 1, data = data_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -465088 -218138 -90088 104912 7159912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 540088 2497 216.3 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 367100 on 21612 degrees of freedom
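Adj R-Square: 0 — model tanpa prediktor hanya menebak rata-rata harga (540088), sehingga tidak menjelaskan variasi harga sama sekali; model ini berfungsi sebagai pembanding (baseline).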
# Stepwise selection in the backward direction, starting from the full model.
model_backward <- step(object = model_full, direction = "backward")
## Start: AIC=533662.3
## price ~ m2_living + m2_lot + bedrooms + bathrooms + floors +
## condition + waterfront + view + grade
##
## Df Sum of Sq RSS AIC
## <none> 1142336728298222 533662
## - bathrooms 1 4306977080413 1146643705378635 533742
## - m2_lot 1 4521106021818 1146857834320040 533746
## - floors 1 9420607646769 1151757335944991 533838
## - bedrooms 1 17103181270161 1159439909568383 533981
## - condition 1 21919689506038 1164256417804259 534071
## - view 1 36581294998829 1178918023297051 534342
## - waterfront 1 45777643537833 1188114371836054 534510
## - grade 1 112711720747852 1255048449046074 535694
## - m2_living 1 152705668498510 1295042396796732 536372
# In-sample error metrics: the predictions are made on the same rows the
# model was fitted on.
pred <- predict(model_backward, newdata = data_selected)
RMSE(pred, data_selected[["price"]])
## [1] 229900.3
MAE(pred, data_selected[["price"]])
## [1] 152322.6
# Price range, for judging the size of the errors above.
range(data_selected[["price"]])
## [1] 75000 7700000
MAPE(pred, data_selected[["price"]])
## [1] 0.3192653
Uji normalitas residual. H0: residual menyebar normal; H1: residual tidak menyebar normal.
Jika p-value < alpha (0.05) maka tolak H0.
Residual dinyatakan normal ketika p-value > 0.05 (asumsi terpenuhi).
# Histogram of the residuals of model_backward (breaks=100 asks for about
# 100 cells), as a visual check of the normality assumption.
hist(model_backward$residuals, breaks=100)
Uji Shapiro-Wilk (shapiro.test) tidak bisa digunakan karena ukuran sampel (21613) lebih dari 5000, sehingga digunakan uji Lilliefors.
#shapiro.test(model_backward$residuals) # not usable here: sample size must be between 3 and 5000
# Lilliefors (Kolmogorov-Smirnov) normality test on the residuals instead.
lillie.test(model_backward$residuals)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: model_backward$residuals
## D = 0.093559, p-value < 0.00000000000000022
# Normal Q-Q plot of the residuals (panel 2 of the lm diagnostic plots).
plot(model_backward, which=2)
Uji homoskedastisitas. H0: model homoskedastis (ragam residual konstan); H1: model heteroskedastis.
Model dinyatakan homoskedastis bila p-value > alpha (0.05); semakin mendekati 1, semakin lemah bukti adanya heteroskedastisitas.
# Residuals-vs-fitted scatter for the constant-variance check. The x axis is
# the fitted value, not the observed price: least-squares residuals are
# correlated with the observed response by construction, so plotting against
# price shows a trend even when the model assumptions hold.
plot(
  model_backward$fitted.values,
  model_backward$residuals,
  xlab = "Fitted price",
  ylab = "Residual"
)
# Reference line at zero residual.
abline(h = 0, col = "red")
# Formal test of the constant-variance assumption on the same model.
bptest(model_backward)
##
## studentized Breusch-Pagan test
##
## data: model_backward
## BP = 2560.5, df = 9, p-value < 0.00000000000000022
Uji multikolinearitas: nilai VIF yang baik berada di bawah 10.
# Variance inflation factor of each predictor in model_backward, used to
# screen for multicollinearity.
vif(model_backward)
## m2_living m2_lot bedrooms bathrooms floors condition
## 3.920194 1.046210 1.610096 2.290920 1.551479 1.108446
## waterfront view grade
## 1.196494 1.315486 2.792147
H0: korelasi = 0; H1: korelasi != 0.
Uji ini membandingkan dua variabel untuk menguji apakah keduanya memiliki korelasi linear.
# Correlation tests between one target column and every column of a data
# frame.
#
# Runs cor.test() with its default settings between `data[[target]]` and each
# column of `data` (the target column itself included) and collects the
# p-values.
#
# Args:
#   data:   data frame whose columns are all numeric.
#   target: name (string) of the column every column is tested against.
#
# Returns:
#   A data frame with one row per column of `data`, in column order, holding
#   `x` (the column name), `y` (the target name) and `p_value` (numeric
#   p-value of the test). Zero rows when `data` has no columns.
#
# NOTE(review): the dotted name reads like an S3 method of the cor.test()
# generic for a class "all"; it is kept unchanged because callers use it.
cor.test.all <- function(data, target) {
  col_names <- names(data)
  # `[[` extracts a plain vector from both data.frames and tibbles.
  target_values <- data[[target]]

  # One p-value per column; vapply() handles zero columns and guarantees a
  # numeric result, so nothing is grown with rbind() inside a loop.
  p_values <- vapply(
    col_names,
    function(col_name) cor.test(target_values, data[[col_name]])$p.value,
    numeric(1),
    USE.NAMES = FALSE
  )

  data.frame(
    x = col_names,
    y = rep(target, length(col_names)),
    p_value = p_values
  )
}
# Test every column of data_selected against price and show the p-values.
p_value <- cor.test.all(data = data_selected, target = "price")
print(p_value)
## x y
## 1 price price
## 2 m2_living price
## 3 m2_lot price
## 4 bedrooms price
## 5 bathrooms price
## 6 floors price
## 7 condition price
## 8 waterfront price
## 9 view price
## 10 grade price
## p_value
## 1 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 2 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 3 0.0000000000000000000000000000000000000007972504510431172068015271378975312839109888680414442704441488599349726832457853352001369166562990060172916306768797767290379852056503295898437500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 4 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 5 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 6 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003812102
## 7 0.0000000893565406245876391372784559863351461217462201602756977081298828125000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 8 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 9 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 10 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Semua p-value kurang dari alpha (0.05), sehingga H0 ditolak: setiap variabel berkorelasi linear secara signifikan dengan price.