Sebagai seorang penjual properti, kita ingin membuat model yang mana dapat memprediksi harga properti berdasarkan beberapa informasi yang ada pada data.
Tentukan variabel:
priceprice1. Read data house_data.csv
house <- read.csv("data_input/house_data.csv")
head(house)2. Cek struktur data
glimpse(house)#> Rows: 21,613
#> Columns: 9
#> $ price <int> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 2…
#> $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3…
#> $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50…
#> $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890,…
#> $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6…
#> $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0…
#> $ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, …
#> $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003…
💡 Hasil pemeriksaan struktur data:
3. Cleansing Data
house <- house %>%
mutate(waterfront= as.factor(waterfront))
glimpse(house)#> Rows: 21,613
#> Columns: 9
#> $ price <int> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 2…
#> $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3…
#> $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50…
#> $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890,…
#> $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6…
#> $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0…
#> $ waterfront <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, …
#> $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003…
#cek missing values
anyNA(house)#> [1] FALSE
colnames(house)#> [1] "price" "bedrooms" "bathrooms" "sqft_living" "sqft_lot"
#> [6] "floors" "waterfront" "grade" "yr_built"
3. EDA
#persebaran data
boxplot(house$price)boxplot(house$bedrooms)boxplot(house$bathrooms)boxplot(house$sqft_living)boxplot(house$sqft_lot)boxplot(house$floors)boxplot(house$grade)#korelasi
library(GGally)
ggcorr(house, label = TRUE)z_scores <- scale(house$price)
# Define a threshold for identifying outliers (e.g., Z-score > 3 or < -3)
threshold <- 3
# Find the indices of outliers
outlier_indices <- which(abs(z_scores) > threshold)
# Remove outliers from the dataset
house_clean <- house[-outlier_indices, ]Buatlah 3 model berdasarkan feature selection yg telah dipelajari 1. model all predictor 2. model selection based on correlation (korelasi > 0.5) 3. model selection hasil stepwise (backward/forward/both)
#model all
model_all_outlier <- lm(formula = price ~ .,
data = house_clean)
summary(model_all_outlier)#>
#> Call:
#> lm(formula = price ~ ., data = house_clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -890531 -100499 -10458 82349 946406
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5879303.86726 91787.01380 64.05 <0.0000000000000002 ***
#> bedrooms -20660.64847 1536.84376 -13.44 <0.0000000000000002 ***
#> bathrooms 38163.82656 2588.65762 14.74 <0.0000000000000002 ***
#> sqft_living 110.79518 2.56564 43.18 <0.0000000000000002 ***
#> sqft_lot -0.06337 0.02804 -2.26 0.0238 *
#> floors 30996.08844 2561.42850 12.10 <0.0000000000000002 ***
#> waterfront1 285199.19405 16907.58393 16.87 <0.0000000000000002 ***
#> grade 119204.81801 1624.67964 73.37 <0.0000000000000002 ***
#> yr_built -3328.78003 48.41999 -68.75 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 161200 on 21198 degrees of freedom
#> Multiple R-squared: 0.6135, Adjusted R-squared: 0.6134
#> F-statistic: 4206 on 8 and 21198 DF, p-value: < 0.00000000000000022
z_scores <- apply(house[, c(-1, -7, -9)], 2, function(x) (x - mean(x)) / sd(x))
# Find the row indices of outliers for each column
outlier_indices <- lapply(z_scores, function(z) which(abs(z) > 3))
# Combine all outlier indices into a single vector
all_outlier_indices <- unlist(outlier_indices)
# Remove duplicate indices and sort in ascending order
all_outlier_indices <- unique(sort(all_outlier_indices))
# Remove rows containing outliers
house_clean <- house[-all_outlier_indices, ]#model tanpa outlier
model_no_outlier <- lm(formula = price ~ .,
data = house_clean)
summary(model_no_outlier)#>
#> Call:
#> lm(formula = price ~ ., data = house_clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1384199 -112978 -10082 91070 4251868
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 6999032.44443 121579.51472 57.568 < 0.0000000000000002 ***
#> bedrooms -41481.76630 2040.78553 -20.326 < 0.0000000000000002 ***
#> bathrooms 51700.62096 3437.66566 15.039 < 0.0000000000000002 ***
#> sqft_living 177.91146 3.29033 54.071 < 0.0000000000000002 ***
#> sqft_lot -0.23948 0.03679 -6.509 0.0000000000773 ***
#> floors 17279.72003 3426.93917 5.042 0.0000004636433 ***
#> waterfront1 721806.23726 17406.99876 41.466 < 0.0000000000000002 ***
#> grade 128817.33683 2149.99400 59.915 < 0.0000000000000002 ***
#> yr_built -3963.69815 64.05123 -61.883 < 0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 218900 on 21603 degrees of freedom
#> Multiple R-squared: 0.6446, Adjusted R-squared: 0.6444
#> F-statistic: 4897 on 8 and 21603 DF, p-value: < 0.00000000000000022
#model selection
model_selection <- lm(formula = price ~ bathrooms+sqft_living+grade,
data = house_clean)
summary(model_selection)#>
#> Call:
#> lm(formula = price ~ bathrooms + sqft_living + grade, data = house_clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1008734 -136362 -23119 100665 4801949
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -597698.186 13260.844 -45.07 <0.0000000000000002 ***
#> bathrooms -38104.020 3439.929 -11.08 <0.0000000000000002 ***
#> sqft_living 203.318 3.331 61.04 <0.0000000000000002 ***
#> grade 103892.483 2286.421 45.44 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 249800 on 21608 degrees of freedom
#> Multiple R-squared: 0.5371, Adjusted R-squared: 0.5371
#> F-statistic: 8358 on 3 and 21608 DF, p-value: < 0.00000000000000022
#model backward
model_none <- lm(formula = price ~ 1,
data = house_clean)#model backward
model_forward <- step(object = model_none,
direction = "forward",
scope = list(upper = model_no_outlier))#> Start: AIC=553850.4
#> price ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + sqft_living 1 1435543462199431 1477272051343703 539180
#> + grade 1 1297544228160404 1615271285382730 541110
#> + bathrooms 1 803193589873378 2109621923669756 546880
#> + bedrooms 1 276931907824552 2635883605718581 551693
#> + waterfront 1 206673885071968 2706141628471166 552262
#> + floors 1 192039291239764 2720776222303370 552378
#> + sqft_lot 1 23412415355950 2889403098187183 553678
#> + yr_built 1 8490935359504 2904324578183630 553789
#> <none> 2912815513543134 553850
#>
#> Step: AIC=539179.5
#> price ~ sqft_living
#>
#> Df Sum of Sq RSS AIC
#> + grade 1 121323888272031 1355948163071672 537329
#> + waterfront 1 110238323711359 1367033727632344 537505
#> + yr_built 1 92856783315154 1384415268028548 537778
#> + bedrooms 1 40634300992415 1436637750351288 538579
#> + sqft_lot 1 3011442425746 1474260608917956 539137
#> + floors 1 229657253482 1477042394090220 539178
#> + bathrooms 1 147571014568 1477124480329135 539179
#> <none> 1477272051343703 539180
#>
#> Step: AIC=537329.4
#> price ~ sqft_living + grade
#>
#> Df Sum of Sq RSS AIC
#> + yr_built 1 199234982439309 1156713180632362 533897
#> + waterfront 1 108953744540985 1246994418530686 535521
#> + bedrooms 1 22139980304855 1333808182766817 536976
#> + floors 1 9625202644511 1346322960427160 537177
#> + bathrooms 1 7656177487159 1348291985584513 537209
#> + sqft_lot 1 2020229610976 1353927933460696 537299
#> <none> 1355948163071672 537329
#>
#> Step: AIC=533896.9
#> price ~ sqft_living + grade + yr_built
#>
#> Df Sum of Sq RSS AIC
#> + waterfront 1 90115617168644 1066597563463718 532146
#> + bedrooms 1 20119434673168 1136593745959194 533520
#> + bathrooms 1 8621589417559 1148091591214803 533737
#> + floors 1 4393666034176 1152319514598186 533817
#> + sqft_lot 1 1713671373253 1154999509259109 533867
#> <none> 1156713180632362 533897
#>
#> Step: AIC=532146
#> price ~ sqft_living + grade + yr_built + waterfront
#>
#> Df Sum of Sq RSS AIC
#> + bedrooms 1 13900518144799 1052697045318919 531864
#> + bathrooms 1 8471839173550 1058125724290168 531976
#> + floors 1 4059612166418 1062537951297300 532066
#> + sqft_lot 1 1826211333038 1064771352130680 532111
#> <none> 1066597563463718 532146
#>
#> Step: AIC=531864.5
#> price ~ sqft_living + grade + yr_built + waterfront + bedrooms
#>
#> Df Sum of Sq RSS AIC
#> + bathrooms 1 13947450604938 1038749594713981 531578
#> + floors 1 4174633184642 1048522412134278 531781
#> + sqft_lot 1 2871066176301 1049825979142618 531807
#> <none> 1052697045318919 531864
#>
#> Step: AIC=531578.2
#> price ~ sqft_living + grade + yr_built + waterfront + bedrooms +
#> bathrooms
#>
#> Df Sum of Sq RSS AIC
#> + sqft_lot 1 2243281039487 1036506313674494 531533
#> + floors 1 1431319580706 1037318275133275 531550
#> <none> 1038749594713981 531578
#>
#> Step: AIC=531533.5
#> price ~ sqft_living + grade + yr_built + waterfront + bedrooms +
#> bathrooms + sqft_lot
#>
#> Df Sum of Sq RSS AIC
#> + floors 1 1218449882192 1035287863792302 531510
#> <none> 1036506313674494 531533
#>
#> Step: AIC=531510.1
#> price ~ sqft_living + grade + yr_built + waterfront + bedrooms +
#> bathrooms + sqft_lot + floors
summary(model_forward)#>
#> Call:
#> lm(formula = price ~ sqft_living + grade + yr_built + waterfront +
#> bedrooms + bathrooms + sqft_lot + floors, data = house_clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1384199 -112978 -10082 91070 4251868
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 6999032.44443 121579.51472 57.568 < 0.0000000000000002 ***
#> sqft_living 177.91146 3.29033 54.071 < 0.0000000000000002 ***
#> grade 128817.33683 2149.99400 59.915 < 0.0000000000000002 ***
#> yr_built -3963.69815 64.05123 -61.883 < 0.0000000000000002 ***
#> waterfront1 721806.23726 17406.99876 41.466 < 0.0000000000000002 ***
#> bedrooms -41481.76630 2040.78553 -20.326 < 0.0000000000000002 ***
#> bathrooms 51700.62096 3437.66566 15.039 < 0.0000000000000002 ***
#> sqft_lot -0.23948 0.03679 -6.509 0.0000000000773 ***
#> floors 17279.72003 3426.93917 5.042 0.0000004636433 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 218900 on 21603 degrees of freedom
#> Multiple R-squared: 0.6446, Adjusted R-squared: 0.6444
#> F-statistic: 4897 on 8 and 21603 DF, p-value: < 0.00000000000000022
Berdasarkan RMSE model regresi manakah yang terbaik?
house_clean$predict_all <- predict(object = model_no_outlier, newdata = house_clean)
house_clean$predict_selection <- predict(object = model_selection, newdata = house_clean)
house_clean$predict_forward <- predict(object = model_forward, newdata = house_clean)
head(house_clean)library(MLmetrics)
RMSE(y_pred = house_clean$predict_all,
y_true = house_clean$price)#> [1] 218868.4
RMSE(y_pred = house_clean$predict_selection,
y_true = house_clean$price)#> [1] 249772.4
RMSE(y_pred = house_clean$predict_forward,
y_true = house_clean$price)#> [1] 218868.4
Kesimpulan : Model yang memberikan error paling kecil dalam memprediksi price adalah model selection dan model forward, dengan nilai RMSE sama sebesar 218868.4
summary(model_no_outlier)#>
#> Call:
#> lm(formula = price ~ ., data = house_clean)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1384199 -112978 -10082 91070 4251868
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 6999032.44443 121579.51472 57.568 < 0.0000000000000002 ***
#> bedrooms -41481.76630 2040.78553 -20.326 < 0.0000000000000002 ***
#> bathrooms 51700.62096 3437.66566 15.039 < 0.0000000000000002 ***
#> sqft_living 177.91146 3.29033 54.071 < 0.0000000000000002 ***
#> sqft_lot -0.23948 0.03679 -6.509 0.0000000000773 ***
#> floors 17279.72003 3426.93917 5.042 0.0000004636433 ***
#> waterfront1 721806.23726 17406.99876 41.466 < 0.0000000000000002 ***
#> grade 128817.33683 2149.99400 59.915 < 0.0000000000000002 ***
#> yr_built -3963.69815 64.05123 -61.883 < 0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 218900 on 21603 degrees of freedom
#> Multiple R-squared: 0.6446, Adjusted R-squared: 0.6444
#> F-statistic: 4897 on 8 and 21603 DF, p-value: < 0.00000000000000022
1. Interpretasi coefficient untuk prediktor kategorik:
2. Interpretasi coefficient untuk prediktor numerik:
3. Signifikansi prediktor:
4. Adjusted R Squared: