Pengaruh Lokasi Rumah terhadap Harga dan Luas Bangunan dengan Pendekatan MANOVA dan MANCOVA berdasarkan Variabel Kontrol Umur Rumah

Persiapan Packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(DataExplorer) 
library(ggplot2)     
library(corrplot)

## corrplot 0.95 loaded

library(ggplot2)
library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(biotools)

## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.3

library(candisc)

## Loading required package: heplots
## Loading required package: broom
## 
## Attaching package: 'heplots'
## 
## The following object is masked from 'package:biotools':
## 
##     boxM
## 
## 
## Attaching package: 'candisc'
## 
## The following object is masked from 'package:stats':
## 
##     cancor

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(emmeans)

## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'

library(dplyr)

Praproses Data

df <- read.csv("D:/Projek_anmul/housing_price_dataset.csv")
head(df)

##   SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt    Price
## 1       2126        4         1        Rural      1969 215355.3
## 2       2459        3         2        Rural      1980 195014.2
## 3       1860        2         1       Suburb      1970 306891.0
## 4       2294        2         1        Urban      1996 206786.8
## 5       2130        5         2       Suburb      2001 272436.2
## 6       2095        2         3       Suburb      2020 198208.8

Tranformasi variabel yearbuild menjadi umur

tahun_sekarang <- as.numeric(format(Sys.Date(), "%Y"))
df$umur <- tahun_sekarang - df$YearBuilt

Hapus data yang price yang negatif

df_negatif <- df[df$Price < 0, ]
df_negatif

##       SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt       Price umur
## 1267        1024        2         2        Urban      2006 -24715.2425   19
## 2311        1036        4         1       Suburb      1983  -7550.5046   42
## 3631        1235        3         2        Rural      2012 -19871.2511   13
## 4163        1352        5         2       Suburb      1977 -10608.3595   48
## 5119        1140        4         1        Urban      2020 -23911.0031    5
## 5952        1097        4         3        Rural      1981  -4537.4186   44
## 6356        1016        5         2        Rural      1997 -13803.6841   28
## 8721        1235        3         1        Urban      1952 -24183.0005   73
## 9612        1131        3         3        Urban      1959 -13692.0261   66
## 10598       1177        2         3        Urban      2010   -434.0971   15
## 11992       1213        4         1       Suburb      2020  -4910.4153    5
## 17443       1600        2         3        Rural      1989  -8238.8845   36
## 17707       1080        5         1        Rural      1955 -28774.9980   70
## 20212       1049        3         1        Rural      2005 -18159.6857   20
## 20760       1036        2         2        Urban      1957  -4810.7243   68
## 23651       1024        4         3       Suburb      1953  -4295.9322   72
## 25460       1106        2         2        Urban      1984  -7177.6285   41
## 29828       1173        5         2        Rural      1988   -847.8951   37
## 30172       1066        3         1        Rural      1964   -602.2091   61
## 33667       1013        5         2        Urban      1960 -36588.1654   65
## 35554       1374        4         3        Urban      1996  -4771.5702   29
## 36930       1078        5         1       Suburb      2015  -6159.0392   10

df <- df[df$Price >= 0, ]

Exploratory Data Analysis

statistika deskriptif

summary(df)

##    SquareFeet      Bedrooms       Bathrooms     Neighborhood      
##  Min.   :1000   Min.   :2.000   Min.   :1.000   Length:49978      
##  1st Qu.:1514   1st Qu.:3.000   1st Qu.:1.000   Class :character  
##  Median :2008   Median :3.000   Median :2.000   Mode  :character  
##  Mean   :2007   Mean   :3.499   Mean   :1.995                     
##  3rd Qu.:2506   3rd Qu.:4.000   3rd Qu.:3.000                     
##  Max.   :2999   Max.   :5.000   Max.   :3.000                     
##    YearBuilt        Price               umur     
##  Min.   :1950   Min.   :   154.8   Min.   : 4.0  
##  1st Qu.:1967   1st Qu.:170007.5   1st Qu.:22.0  
##  Median :1985   Median :225100.1   Median :40.0  
##  Mean   :1985   Mean   :224931.7   Mean   :39.6  
##  3rd Qu.:2003   3rd Qu.:279395.8   3rd Qu.:58.0  
##  Max.   :2021   Max.   :492195.3   Max.   :75.0

Cek missing value

colSums(is.na(df))

##   SquareFeet     Bedrooms    Bathrooms Neighborhood    YearBuilt        Price 
##            0            0            0            0            0            0 
##         umur 
##            0

plot_missing(df)

Distribusi harga rumah berdasarkan neighborhood

ggplot(df, aes(x = Neighborhood, y = Price)) +
  geom_boxplot(fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Distribusi harga rumah berdasarkan neighborhood")

Distribusi data berdasarkan neighborhood

ggplot(df, aes(x = Neighborhood)) +
  geom_bar(fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Distribusi data  berdasarkan neighborhood", y = "Jumlah")

Rata rata umur berdasarkan neighborhood

summary_data <- df %>%
  group_by(Neighborhood) %>%
  summarise(avg_umur = mean(umur, na.rm = TRUE))

ggplot(summary_data, aes(x = Neighborhood, y = avg_umur, fill = Neighborhood)) +
  geom_col() +
  labs(title = "Rata rata umur berdasarkan neighborhood",
       x = "Neighborhood",
       y = "Rata-rata Umur (Tahun)") +
  theme_minimal()

Korelasi antar variabel dependen

cor(df$SquareFeet, df$Price, use = "complete.obs", method = "pearson")

## [1] 0.7506557

ggplot(df, aes(x = SquareFeet, y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Scatterplot: Squarefeet vs Price")

## `geom_smooth()` using formula = 'y ~ x'

Uji Asumsi

Uji dependensi (Barlett Test)

data_dep <- df[, c("SquareFeet", "Price")]
cormat <- cor(data_dep)
cortest.bartlett(cormat, n = nrow(df))

## $chisq
## [1] 41426.21
## 
## $p.value
## [1] 0
## 
## $df
## [1] 1

Uji homogenitas (Box’s M)

data_dep <- df[, c("SquareFeet", "Price")]
df$Neighborhood <- as.factor(df$Neighborhood)
boxM(data_dep, group = df$Neighborhood)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_dep
## Chi-Sq (approx.) = 1.7736, df = 6, p-value = 0.9393

Uji normalitas

X <- df[, c("SquareFeet", "Price")]
mahal_dist2 <- mahalanobis(X, colMeans(X), cov(X))

n <- nrow(X)
p <- ncol(X)

chi_sq_quantile <- qchisq(ppoints(n), df = p)

r_q <- cor(sort(mahal_dist2), chi_sq_quantile)

cat("Koefisien Korelasi (r_q):", round(r_q, 4), "\n")

## Koefisien Korelasi (r_q): 0.9941

df_korelasi <- n - 2
t_kritis <- qt(0.975, df_korelasi)  
r_table <- t_kritis / sqrt(t_kritis^2 + df_korelasi)

cat("r tabel (alpha = 0.05, df =", df_korelasi, "):", round(r_table, 4), "\n")

## r tabel (alpha = 0.05, df = 49976 ): 0.0088

if (r_q < r_table) {
  cat("Kesimpulan: Tolak H0 → Data TIDAK normal multivariat\n")
} else {
  cat("Kesimpulan: Gagal tolak H0 → Data normal multivariat\n")
}

## Kesimpulan: Gagal tolak H0 → Data normal multivariat

qqplot(chi_sq_quantile, sort(mahal_dist2),
       main = "Q-Q Plot Mahalanobis² vs Chi-Square Quantiles",
       xlab = "Chi-square Quantiles",
       ylab = "Sorted Mahalanobis Distance²")
abline(0, 1, col = "red")

Manova

# MANOVA
manova_model <- manova(cbind(Price, SquareFeet) ~ Neighborhood, data = df)

summary(manova_model, test = "Roy")

##                 Df        Roy approx F num Df den Df    Pr(>F)    
## Neighborhood     2 0.00053128   13.275      2  49975 1.722e-06 ***
## Residuals    49975                                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

manova_model <- manova(cbind(Price, SquareFeet) ~ Neighborhood, data = df)
cda <- candisc(manova_model)
plot(cda, main = "Canonical Discriminant Plot")

## Vector scale factor set to 4.194

ggplot(df, aes(x = Price, y = SquareFeet, color = Neighborhood)) +
  geom_point(size = 3) +
  labs(title = "Visualisasi Scatter Plot",
       x = "Price", y = "SquareFeet") +
  theme_minimal()

Mancova

model_mancova <- manova(cbind(Price, SquareFeet) ~ Neighborhood*umur, data = df)
summary(model_mancova, test = "Roy")

##                      Df       Roy approx F num Df den Df    Pr(>F)    
## Neighborhood          2 5.313e-04  13.2751      2  49972 1.723e-06 ***
## umur                  1 1.580e-05   0.3947      2  49971    0.6739    
## Neighborhood:umur     2 5.744e-05   1.4351      2  49972    0.2381    
## Residuals         49972                                               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cda_result <- candisc(model_mancova)
plot(cda_result)

## Vector scale factor set to 4.196

Ancova

Variabel price

model_ancova_price <- lm(Price ~ Neighborhood * umur, data = df)
Anova(model_ancova_price, type = 2)

## Anova Table (Type II tests)
## 
## Response: Price
##                       Sum Sq    Df F value    Pr(>F)    
## Neighborhood      1.4587e+11     2 12.6343 3.269e-06 ***
## umur              1.4354e+09     1  0.2487     0.618    
## Neighborhood:umur 1.3825e+10     2  1.1975     0.302    
## Residuals         2.8847e+14 49972                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variabel squareFeet

model_ancova_SquareFeet <- lm(SquareFeet ~ Neighborhood * umur, data = df)
Anova(model_ancova_SquareFeet, type = 2)

## Anova Table (Type II tests)
## 
## Response: SquareFeet
##                       Sum Sq    Df F value  Pr(>F)   
## Neighborhood      3.1061e+06     2  4.6923 0.00917 **
## umur              4.1220e+03     1  0.0125 0.91114   
## Neighborhood:umur 8.7703e+05     2  1.3249 0.26584   
## Residuals         1.6540e+10 49972                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pos Hoc

Varibel price

model_ancova <- lm(Price ~ Neighborhood * umur, data = df)
emmeans_result <- emmeans(model_ancova, ~ Neighborhood)

## NOTE: Results may be misleading due to involvement in interactions

pairs(emmeans_result, adjust = "tukey")

##  contrast       estimate  SE    df t.ratio p.value
##  Rural - Suburb      908 832 49972   1.091  0.5194
##  Rural - Urban     -3089 833 49972  -3.707  0.0006
##  Suburb - Urban    -3997 833 49972  -4.800  <.0001
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

Variabel squareFeet

model_ancova <- lm(SquareFeet ~ Neighborhood * umur, data = df)
emmeans_result <- emmeans(model_ancova, ~ Neighborhood)

## NOTE: Results may be misleading due to involvement in interactions

pairs(emmeans_result, adjust = "tukey")

##  contrast       estimate   SE    df t.ratio p.value
##  Rural - Suburb     1.63 6.30 49972   0.260  0.9635
##  Rural - Urban    -15.88 6.31 49972  -2.517  0.0317
##  Suburb - Urban   -17.52 6.30 49972  -2.778  0.0151
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

Pengaruh Lokasi Rumah terhadap Harga dan Luas Bangunan dengan Pendekatan MANOVA dan MANCOVA berdasarkan Variabel Kontrol Umur Rumah

Gesang Nur Zamroji, Ivanagita Mayang Palupi, Muhammad Nabil Ibrahim

2025-06-04

Persiapan Packages

Praproses Data

Tranformasi variabel yearbuild menjadi umur

Hapus data yang price yang negatif

Exploratory Data Analysis

statistika deskriptif

Cek missing value

Distribusi harga rumah berdasarkan neighborhood

Distribusi data berdasarkan neighborhood

Rata rata umur berdasarkan neighborhood

Korelasi antar variabel dependen

Uji Asumsi

Uji dependensi (Barlett Test)

Uji homogenitas (Box’s M)

Uji normalitas

Manova

Mancova

Ancova

Variabel price

Variabel squareFeet

Pos Hoc

Varibel price

Variabel squareFeet