Persiapan Packages

library(tidyverse)   
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DataExplorer) 
library(ggplot2)     
library(corrplot)    
## corrplot 0.95 loaded
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(biotools)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.3
library(candisc)
## Loading required package: heplots
## Loading required package: broom
## 
## Attaching package: 'heplots'
## 
## The following object is masked from 'package:biotools':
## 
##     boxM
## 
## 
## Attaching package: 'candisc'
## 
## The following object is masked from 'package:stats':
## 
##     cancor
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(dplyr)

Praproses Data

df <- read.csv("D:/Projek_anmul/housing_price_dataset.csv")
head(df)
##   SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt    Price
## 1       2126        4         1        Rural      1969 215355.3
## 2       2459        3         2        Rural      1980 195014.2
## 3       1860        2         1       Suburb      1970 306891.0
## 4       2294        2         1        Urban      1996 206786.8
## 5       2130        5         2       Suburb      2001 272436.2
## 6       2095        2         3       Suburb      2020 198208.8

Tranformasi variabel yearbuild menjadi umur

tahun_sekarang <- as.numeric(format(Sys.Date(), "%Y"))
df$umur <- tahun_sekarang - df$YearBuilt

Hapus data yang price yang negatif

df_negatif <- df[df$Price < 0, ]
df_negatif
##       SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt       Price umur
## 1267        1024        2         2        Urban      2006 -24715.2425   19
## 2311        1036        4         1       Suburb      1983  -7550.5046   42
## 3631        1235        3         2        Rural      2012 -19871.2511   13
## 4163        1352        5         2       Suburb      1977 -10608.3595   48
## 5119        1140        4         1        Urban      2020 -23911.0031    5
## 5952        1097        4         3        Rural      1981  -4537.4186   44
## 6356        1016        5         2        Rural      1997 -13803.6841   28
## 8721        1235        3         1        Urban      1952 -24183.0005   73
## 9612        1131        3         3        Urban      1959 -13692.0261   66
## 10598       1177        2         3        Urban      2010   -434.0971   15
## 11992       1213        4         1       Suburb      2020  -4910.4153    5
## 17443       1600        2         3        Rural      1989  -8238.8845   36
## 17707       1080        5         1        Rural      1955 -28774.9980   70
## 20212       1049        3         1        Rural      2005 -18159.6857   20
## 20760       1036        2         2        Urban      1957  -4810.7243   68
## 23651       1024        4         3       Suburb      1953  -4295.9322   72
## 25460       1106        2         2        Urban      1984  -7177.6285   41
## 29828       1173        5         2        Rural      1988   -847.8951   37
## 30172       1066        3         1        Rural      1964   -602.2091   61
## 33667       1013        5         2        Urban      1960 -36588.1654   65
## 35554       1374        4         3        Urban      1996  -4771.5702   29
## 36930       1078        5         1       Suburb      2015  -6159.0392   10
df <- df[df$Price >= 0, ]

Exploratory Data Analysis

statistika deskriptif

summary(df)
##    SquareFeet      Bedrooms       Bathrooms     Neighborhood      
##  Min.   :1000   Min.   :2.000   Min.   :1.000   Length:49978      
##  1st Qu.:1514   1st Qu.:3.000   1st Qu.:1.000   Class :character  
##  Median :2008   Median :3.000   Median :2.000   Mode  :character  
##  Mean   :2007   Mean   :3.499   Mean   :1.995                     
##  3rd Qu.:2506   3rd Qu.:4.000   3rd Qu.:3.000                     
##  Max.   :2999   Max.   :5.000   Max.   :3.000                     
##    YearBuilt        Price               umur     
##  Min.   :1950   Min.   :   154.8   Min.   : 4.0  
##  1st Qu.:1967   1st Qu.:170007.5   1st Qu.:22.0  
##  Median :1985   Median :225100.1   Median :40.0  
##  Mean   :1985   Mean   :224931.7   Mean   :39.6  
##  3rd Qu.:2003   3rd Qu.:279395.8   3rd Qu.:58.0  
##  Max.   :2021   Max.   :492195.3   Max.   :75.0

Cek missing value

colSums(is.na(df))  
##   SquareFeet     Bedrooms    Bathrooms Neighborhood    YearBuilt        Price 
##            0            0            0            0            0            0 
##         umur 
##            0
plot_missing(df)    

Distribusi harga rumah berdasarkan neighborhood

ggplot(df, aes(x = Neighborhood, y = Price)) +
  geom_boxplot(fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Distribusi harga rumah berdasarkan neighborhood")

Distribusi data berdasarkan neighborhood

ggplot(df, aes(x = Neighborhood)) +
  geom_bar(fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Distribusi data  berdasarkan neighborhood", y = "Jumlah")

Rata rata umur berdasarkan neighborhood

summary_data <- df %>%
  group_by(Neighborhood) %>%
  summarise(avg_umur = mean(umur, na.rm = TRUE))

ggplot(summary_data, aes(x = Neighborhood, y = avg_umur, fill = Neighborhood)) +
  geom_col() +
  labs(title = "Rata rata umur berdasarkan neighborhood",
       x = "Neighborhood",
       y = "Rata-rata Umur (Tahun)") +
  theme_minimal()

Korelasi antar variabel dependen

cor(df$SquareFeet, df$Price, use = "complete.obs", method = "pearson")
## [1] 0.7506557
ggplot(df, aes(x = SquareFeet, y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Scatterplot: Squarefeet vs Price")
## `geom_smooth()` using formula = 'y ~ x'

Uji Asumsi

Uji dependensi (Barlett Test)

data_dep <- df[, c("SquareFeet", "Price")]
cormat <- cor(data_dep)
cortest.bartlett(cormat, n = nrow(df))  
## $chisq
## [1] 41426.21
## 
## $p.value
## [1] 0
## 
## $df
## [1] 1

Uji homogenitas (Box’s M)

data_dep <- df[, c("SquareFeet", "Price")]
df$Neighborhood <- as.factor(df$Neighborhood)
boxM(data_dep, group = df$Neighborhood)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_dep
## Chi-Sq (approx.) = 1.7736, df = 6, p-value = 0.9393

Uji normalitas

X <- df[, c("SquareFeet", "Price")]
mahal_dist2 <- mahalanobis(X, colMeans(X), cov(X))

n <- nrow(X)
p <- ncol(X)

chi_sq_quantile <- qchisq(ppoints(n), df = p)

r_q <- cor(sort(mahal_dist2), chi_sq_quantile)

cat("Koefisien Korelasi (r_q):", round(r_q, 4), "\n")
## Koefisien Korelasi (r_q): 0.9941
df_korelasi <- n - 2
t_kritis <- qt(0.975, df_korelasi)  
r_table <- t_kritis / sqrt(t_kritis^2 + df_korelasi)

cat("r tabel (alpha = 0.05, df =", df_korelasi, "):", round(r_table, 4), "\n")
## r tabel (alpha = 0.05, df = 49976 ): 0.0088
if (r_q < r_table) {
  cat("Kesimpulan: Tolak H0 → Data TIDAK normal multivariat\n")
} else {
  cat("Kesimpulan: Gagal tolak H0 → Data normal multivariat\n")
}
## Kesimpulan: Gagal tolak H0 → Data normal multivariat
qqplot(chi_sq_quantile, sort(mahal_dist2),
       main = "Q-Q Plot Mahalanobis² vs Chi-Square Quantiles",
       xlab = "Chi-square Quantiles",
       ylab = "Sorted Mahalanobis Distance²")
abline(0, 1, col = "red")

Manova

# MANOVA
manova_model <- manova(cbind(Price, SquareFeet) ~ Neighborhood, data = df)

summary(manova_model, test = "Roy")
##                 Df        Roy approx F num Df den Df    Pr(>F)    
## Neighborhood     2 0.00053128   13.275      2  49975 1.722e-06 ***
## Residuals    49975                                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
manova_model <- manova(cbind(Price, SquareFeet) ~ Neighborhood, data = df)
cda <- candisc(manova_model)
plot(cda, main = "Canonical Discriminant Plot")
## Vector scale factor set to 4.194

ggplot(df, aes(x = Price, y = SquareFeet, color = Neighborhood)) +
  geom_point(size = 3) +
  labs(title = "Visualisasi Scatter Plot",
       x = "Price", y = "SquareFeet") +
  theme_minimal()

Mancova

model_mancova <- manova(cbind(Price, SquareFeet) ~ Neighborhood*umur, data = df)
summary(model_mancova, test = "Roy")
##                      Df       Roy approx F num Df den Df    Pr(>F)    
## Neighborhood          2 5.313e-04  13.2751      2  49972 1.723e-06 ***
## umur                  1 1.580e-05   0.3947      2  49971    0.6739    
## Neighborhood:umur     2 5.744e-05   1.4351      2  49972    0.2381    
## Residuals         49972                                               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cda_result <- candisc(model_mancova)
plot(cda_result)
## Vector scale factor set to 4.196

Ancova

Variabel price

model_ancova_price <- lm(Price ~ Neighborhood * umur, data = df)
Anova(model_ancova_price, type = 2)
## Anova Table (Type II tests)
## 
## Response: Price
##                       Sum Sq    Df F value    Pr(>F)    
## Neighborhood      1.4587e+11     2 12.6343 3.269e-06 ***
## umur              1.4354e+09     1  0.2487     0.618    
## Neighborhood:umur 1.3825e+10     2  1.1975     0.302    
## Residuals         2.8847e+14 49972                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variabel squareFeet

model_ancova_SquareFeet <- lm(SquareFeet ~ Neighborhood * umur, data = df)
Anova(model_ancova_SquareFeet, type = 2)
## Anova Table (Type II tests)
## 
## Response: SquareFeet
##                       Sum Sq    Df F value  Pr(>F)   
## Neighborhood      3.1061e+06     2  4.6923 0.00917 **
## umur              4.1220e+03     1  0.0125 0.91114   
## Neighborhood:umur 8.7703e+05     2  1.3249 0.26584   
## Residuals         1.6540e+10 49972                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pos Hoc

Varibel price

model_ancova <- lm(Price ~ Neighborhood * umur, data = df)
emmeans_result <- emmeans(model_ancova, ~ Neighborhood)
## NOTE: Results may be misleading due to involvement in interactions
pairs(emmeans_result, adjust = "tukey")
##  contrast       estimate  SE    df t.ratio p.value
##  Rural - Suburb      908 832 49972   1.091  0.5194
##  Rural - Urban     -3089 833 49972  -3.707  0.0006
##  Suburb - Urban    -3997 833 49972  -4.800  <.0001
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

Variabel squareFeet

model_ancova <- lm(SquareFeet ~ Neighborhood * umur, data = df)
emmeans_result <- emmeans(model_ancova, ~ Neighborhood)
## NOTE: Results may be misleading due to involvement in interactions
pairs(emmeans_result, adjust = "tukey")
##  contrast       estimate   SE    df t.ratio p.value
##  Rural - Suburb     1.63 6.30 49972   0.260  0.9635
##  Rural - Urban    -15.88 6.31 49972  -2.517  0.0317
##  Suburb - Urban   -17.52 6.30 49972  -2.778  0.0151
## 
## P value adjustment: tukey method for comparing a family of 3 estimates