Persiapan Packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DataExplorer)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ---
## biotools version 4.3
library(candisc)
## Loading required package: heplots
## Loading required package: broom
##
## Attaching package: 'heplots'
##
## The following object is masked from 'package:biotools':
##
## boxM
##
##
## Attaching package: 'candisc'
##
## The following object is masked from 'package:stats':
##
## cancor
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(dplyr)
Praproses Data
df <- read.csv("D:/Projek_anmul/housing_price_dataset.csv")
head(df)
## SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt Price
## 1 2126 4 1 Rural 1969 215355.3
## 2 2459 3 2 Rural 1980 195014.2
## 3 1860 2 1 Suburb 1970 306891.0
## 4 2294 2 1 Urban 1996 206786.8
## 5 2130 5 2 Suburb 2001 272436.2
## 6 2095 2 3 Suburb 2020 198208.8
Hapus data yang price yang negatif
df_negatif <- df[df$Price < 0, ]
df_negatif
## SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt Price umur
## 1267 1024 2 2 Urban 2006 -24715.2425 19
## 2311 1036 4 1 Suburb 1983 -7550.5046 42
## 3631 1235 3 2 Rural 2012 -19871.2511 13
## 4163 1352 5 2 Suburb 1977 -10608.3595 48
## 5119 1140 4 1 Urban 2020 -23911.0031 5
## 5952 1097 4 3 Rural 1981 -4537.4186 44
## 6356 1016 5 2 Rural 1997 -13803.6841 28
## 8721 1235 3 1 Urban 1952 -24183.0005 73
## 9612 1131 3 3 Urban 1959 -13692.0261 66
## 10598 1177 2 3 Urban 2010 -434.0971 15
## 11992 1213 4 1 Suburb 2020 -4910.4153 5
## 17443 1600 2 3 Rural 1989 -8238.8845 36
## 17707 1080 5 1 Rural 1955 -28774.9980 70
## 20212 1049 3 1 Rural 2005 -18159.6857 20
## 20760 1036 2 2 Urban 1957 -4810.7243 68
## 23651 1024 4 3 Suburb 1953 -4295.9322 72
## 25460 1106 2 2 Urban 1984 -7177.6285 41
## 29828 1173 5 2 Rural 1988 -847.8951 37
## 30172 1066 3 1 Rural 1964 -602.2091 61
## 33667 1013 5 2 Urban 1960 -36588.1654 65
## 35554 1374 4 3 Urban 1996 -4771.5702 29
## 36930 1078 5 1 Suburb 2015 -6159.0392 10
df <- df[df$Price >= 0, ]
Exploratory Data Analysis
statistika deskriptif
summary(df)
## SquareFeet Bedrooms Bathrooms Neighborhood
## Min. :1000 Min. :2.000 Min. :1.000 Length:49978
## 1st Qu.:1514 1st Qu.:3.000 1st Qu.:1.000 Class :character
## Median :2008 Median :3.000 Median :2.000 Mode :character
## Mean :2007 Mean :3.499 Mean :1.995
## 3rd Qu.:2506 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :2999 Max. :5.000 Max. :3.000
## YearBuilt Price umur
## Min. :1950 Min. : 154.8 Min. : 4.0
## 1st Qu.:1967 1st Qu.:170007.5 1st Qu.:22.0
## Median :1985 Median :225100.1 Median :40.0
## Mean :1985 Mean :224931.7 Mean :39.6
## 3rd Qu.:2003 3rd Qu.:279395.8 3rd Qu.:58.0
## Max. :2021 Max. :492195.3 Max. :75.0
Cek missing value
colSums(is.na(df))
## SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt Price
## 0 0 0 0 0 0
## umur
## 0
plot_missing(df)

Distribusi harga rumah berdasarkan neighborhood
ggplot(df, aes(x = Neighborhood, y = Price)) +
geom_boxplot(fill = "skyblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Distribusi harga rumah berdasarkan neighborhood")

Distribusi data berdasarkan neighborhood
ggplot(df, aes(x = Neighborhood)) +
geom_bar(fill = "skyblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Distribusi data berdasarkan neighborhood", y = "Jumlah")

Rata rata umur berdasarkan neighborhood
summary_data <- df %>%
group_by(Neighborhood) %>%
summarise(avg_umur = mean(umur, na.rm = TRUE))
ggplot(summary_data, aes(x = Neighborhood, y = avg_umur, fill = Neighborhood)) +
geom_col() +
labs(title = "Rata rata umur berdasarkan neighborhood",
x = "Neighborhood",
y = "Rata-rata Umur (Tahun)") +
theme_minimal()

Korelasi antar variabel dependen
cor(df$SquareFeet, df$Price, use = "complete.obs", method = "pearson")
## [1] 0.7506557
ggplot(df, aes(x = SquareFeet, y = Price)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Scatterplot: Squarefeet vs Price")
## `geom_smooth()` using formula = 'y ~ x'

Uji Asumsi
Uji dependensi (Barlett Test)
data_dep <- df[, c("SquareFeet", "Price")]
cormat <- cor(data_dep)
cortest.bartlett(cormat, n = nrow(df))
## $chisq
## [1] 41426.21
##
## $p.value
## [1] 0
##
## $df
## [1] 1
Uji homogenitas (Box’s M)
data_dep <- df[, c("SquareFeet", "Price")]
df$Neighborhood <- as.factor(df$Neighborhood)
boxM(data_dep, group = df$Neighborhood)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data_dep
## Chi-Sq (approx.) = 1.7736, df = 6, p-value = 0.9393
Uji normalitas
X <- df[, c("SquareFeet", "Price")]
mahal_dist2 <- mahalanobis(X, colMeans(X), cov(X))
n <- nrow(X)
p <- ncol(X)
chi_sq_quantile <- qchisq(ppoints(n), df = p)
r_q <- cor(sort(mahal_dist2), chi_sq_quantile)
cat("Koefisien Korelasi (r_q):", round(r_q, 4), "\n")
## Koefisien Korelasi (r_q): 0.9941
df_korelasi <- n - 2
t_kritis <- qt(0.975, df_korelasi)
r_table <- t_kritis / sqrt(t_kritis^2 + df_korelasi)
cat("r tabel (alpha = 0.05, df =", df_korelasi, "):", round(r_table, 4), "\n")
## r tabel (alpha = 0.05, df = 49976 ): 0.0088
if (r_q < r_table) {
cat("Kesimpulan: Tolak H0 → Data TIDAK normal multivariat\n")
} else {
cat("Kesimpulan: Gagal tolak H0 → Data normal multivariat\n")
}
## Kesimpulan: Gagal tolak H0 → Data normal multivariat
qqplot(chi_sq_quantile, sort(mahal_dist2),
main = "Q-Q Plot Mahalanobis² vs Chi-Square Quantiles",
xlab = "Chi-square Quantiles",
ylab = "Sorted Mahalanobis Distance²")
abline(0, 1, col = "red")

Manova
# MANOVA
manova_model <- manova(cbind(Price, SquareFeet) ~ Neighborhood, data = df)
summary(manova_model, test = "Roy")
## Df Roy approx F num Df den Df Pr(>F)
## Neighborhood 2 0.00053128 13.275 2 49975 1.722e-06 ***
## Residuals 49975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
manova_model <- manova(cbind(Price, SquareFeet) ~ Neighborhood, data = df)
cda <- candisc(manova_model)
plot(cda, main = "Canonical Discriminant Plot")
## Vector scale factor set to 4.194

ggplot(df, aes(x = Price, y = SquareFeet, color = Neighborhood)) +
geom_point(size = 3) +
labs(title = "Visualisasi Scatter Plot",
x = "Price", y = "SquareFeet") +
theme_minimal()

Mancova
model_mancova <- manova(cbind(Price, SquareFeet) ~ Neighborhood*umur, data = df)
summary(model_mancova, test = "Roy")
## Df Roy approx F num Df den Df Pr(>F)
## Neighborhood 2 5.313e-04 13.2751 2 49972 1.723e-06 ***
## umur 1 1.580e-05 0.3947 2 49971 0.6739
## Neighborhood:umur 2 5.744e-05 1.4351 2 49972 0.2381
## Residuals 49972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cda_result <- candisc(model_mancova)
plot(cda_result)
## Vector scale factor set to 4.196

Ancova
Variabel price
model_ancova_price <- lm(Price ~ Neighborhood * umur, data = df)
Anova(model_ancova_price, type = 2)
## Anova Table (Type II tests)
##
## Response: Price
## Sum Sq Df F value Pr(>F)
## Neighborhood 1.4587e+11 2 12.6343 3.269e-06 ***
## umur 1.4354e+09 1 0.2487 0.618
## Neighborhood:umur 1.3825e+10 2 1.1975 0.302
## Residuals 2.8847e+14 49972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Variabel squareFeet
model_ancova_SquareFeet <- lm(SquareFeet ~ Neighborhood * umur, data = df)
Anova(model_ancova_SquareFeet, type = 2)
## Anova Table (Type II tests)
##
## Response: SquareFeet
## Sum Sq Df F value Pr(>F)
## Neighborhood 3.1061e+06 2 4.6923 0.00917 **
## umur 4.1220e+03 1 0.0125 0.91114
## Neighborhood:umur 8.7703e+05 2 1.3249 0.26584
## Residuals 1.6540e+10 49972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Pos Hoc
Varibel price
model_ancova <- lm(Price ~ Neighborhood * umur, data = df)
emmeans_result <- emmeans(model_ancova, ~ Neighborhood)
## NOTE: Results may be misleading due to involvement in interactions
pairs(emmeans_result, adjust = "tukey")
## contrast estimate SE df t.ratio p.value
## Rural - Suburb 908 832 49972 1.091 0.5194
## Rural - Urban -3089 833 49972 -3.707 0.0006
## Suburb - Urban -3997 833 49972 -4.800 <.0001
##
## P value adjustment: tukey method for comparing a family of 3 estimates
Variabel squareFeet
model_ancova <- lm(SquareFeet ~ Neighborhood * umur, data = df)
emmeans_result <- emmeans(model_ancova, ~ Neighborhood)
## NOTE: Results may be misleading due to involvement in interactions
pairs(emmeans_result, adjust = "tukey")
## contrast estimate SE df t.ratio p.value
## Rural - Suburb 1.63 6.30 49972 0.260 0.9635
## Rural - Urban -15.88 6.31 49972 -2.517 0.0317
## Suburb - Urban -17.52 6.30 49972 -2.778 0.0151
##
## P value adjustment: tukey method for comparing a family of 3 estimates