Khansa Mutiara Kheeda (162112133110)
Naila Shakira (162112133052)
Andi Syaiful Rahmat (502210010029)
melb <- read.csv("E:/UNHAS/SEMESTER 5 (PMM UNAIR)/5 (KAMIS & JUMAT) EKSPLORASI DAN VISUALISASI DATA/PEKAN 11/melb_data.csv")
Dataset yang digunakan dalam tugas ini berisi informasi tentang perumahan di Melbourne. Data ini dapat di download pada link berikut https://www.kaggle.com/datasets/dansbecker/melbourne-housing-snapshot. Data ini berisikan 13580 baris informasi sehingga dalam pengolahan data yang dilakukan hanya diambil 1000 baris data terakhir. Pengambilan data 1000 baris terakhir didasarkan pada letak missing value pada dataset yang dominan berada pada akhir data
library(tidyverse)
library(gridExtra)
library(heatmaply)
library(visdat)
library(reshape2)
library(tidyr)
library(ggplot2)
library(psych)
library(DataExplorer)
melb = melb%>% slice(12581:13580)
price = melb$Price
rooms = melb$Rooms
distance = melb$Distance
bathroom = melb$Bathroom
car = melb$Car
landsize = melb$Landsize
buildingarea = melb$BuildingArea
yearbuilt = melb$YearBuilt
lattitude = melb$Lattitude
longtitude = melb$Longtitude
melb = data.frame(price, rooms, distance, bathroom, car, landsize, buildingarea, yearbuilt, lattitude, longtitude)
sapply(melb, function(x)sum(is.na(x)))
## price rooms distance bathroom car landsize
## 0 0 0 0 46 0
## buildingarea yearbuilt lattitude longtitude
## 483 425 0 0
vis_miss(melb, cluster = TRUE)
Dari data di atas diketahui terdapat 3 variabel yang mempunyai data hilang (NA). Ketiga variabel tersebut ialah data pada kolom Car, Building Area, dan Year Built, yang masing-masing memiliki data hilang sebanyak 46, 483, dan 425 baris. Secara keselurahan, penyebaran data hilang (missing value) pada data tersebut ditunjukkan oleh grafik di atas dengan persentase data yang hilang sebesar 9,5% dan sebesar 90,5% data yang terisi.
Untuk mengatasi hal ini, kita gantikan semua missing value pada ketiga atribut tersebut dengan nilai rata-rata dari masing-masing atribut dengan outlier.
rata.car = mean(car, na.rm = T)
car[which(is.na(car))] = rata.car
rata.buildingarea = mean(buildingarea, na.rm = T)
buildingarea[which(is.na(buildingarea))] = rata.buildingarea
rata.yearbuilt = mean(yearbuilt, na.rm = T)
yearbuilt[which(is.na(yearbuilt))] = rata.yearbuilt
Berikut adalah grafik scatterplot dari ketiga atribut yang masih memiliki outlier.
outs_plot = ggplot(data = melb, aes(x = car, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Plot (1)")
outs_plot1 = ggplot(data = melb, aes(x = buildingarea, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Plot (2)")
outs_plot2 = ggplot(data = melb, aes(x = yearbuilt, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Plot (3)")
outs_plot3 = ggplot(data = melb, aes(x = yearbuilt+buildingarea+car, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Plot (1, 2, & 3)")
gridExtra::grid.arrange(outs_plot, outs_plot1, outs_plot2, ncol=3)
gridExtra::grid.arrange(outs_plot3, ncol=1)
model <- lm(price ~ car, data = melb)
summary(model)
##
## Call:
## lm(formula = price ~ car, data = melb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -886355 -431176 -156220 250052 3883824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1026086 43063 23.828 < 2e-16 ***
## car 77545 19740 3.928 9.17e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 611700 on 952 degrees of freedom
## (46 observations deleted due to missingness)
## Multiple R-squared: 0.01595, Adjusted R-squared: 0.01492
## F-statistic: 15.43 on 1 and 952 DF, p-value: 9.173e-05
model1 <- lm(price ~ buildingarea, data = melb)
summary(model1)
##
## Call:
## lm(formula = price ~ buildingarea, data = melb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -904892 -453673 -159942 261009 2870631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.188e+06 2.811e+04 42.251 <2e-16 ***
## buildingarea 8.586e+00 1.430e+01 0.601 0.548
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 634100 on 515 degrees of freedom
## (483 observations deleted due to missingness)
## Multiple R-squared: 0.0006997, Adjusted R-squared: -0.001241
## F-statistic: 0.3606 on 1 and 515 DF, p-value: 0.5484
model2 <- lm(price ~ yearbuilt, data = melb)
summary(model2)
##
## Call:
## lm(formula = price ~ yearbuilt, data = melb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -966994 -401271 -176273 228791 3533288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14531484.4 1395120.0 10.416 <2e-16 ***
## yearbuilt -6781.3 709.7 -9.555 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 601800 on 573 degrees of freedom
## (425 observations deleted due to missingness)
## Multiple R-squared: 0.1374, Adjusted R-squared: 0.1359
## F-statistic: 91.3 on 1 and 573 DF, p-value: < 2.2e-16
model3 <- lm(price ~ yearbuilt+buildingarea+car, data = melb)
summary(model3)
##
## Call:
## lm(formula = price ~ yearbuilt + buildingarea + car, data = melb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1699737 -322776 -107803 205303 3122997
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16963710.9 1395107.9 12.159 <2e-16 ***
## yearbuilt -8347.2 713.4 -11.700 <2e-16 ***
## buildingarea 3349.4 306.5 10.926 <2e-16 ***
## car 53899.2 23384.5 2.305 0.0216 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 512300 on 476 degrees of freedom
## (520 observations deleted due to missingness)
## Multiple R-squared: 0.3414, Adjusted R-squared: 0.3373
## F-statistic: 82.26 on 3 and 476 DF, p-value: < 2.2e-16
cooksD <- cooks.distance(model)
cooksD1 <- cooks.distance(model1)
cooksD2 <- cooks.distance(model2)
cooksD3 <- cooks.distance(model3)
n <- nrow(melb)
plot(cooksD, main = "Cooks Distance for Influential Obs (1)")
abline(h = 4/n, lty = 5, col = "red") #add cutoff line
plot(cooksD1, main = "Cooks Distance for Influential Obs (2)")
abline(h = 4/n, lty = 5, col = "red") #add cutoff line
plot(cooksD2, main = "Cooks Distance for Influential Obs (3)")
abline(h = 4/n, lty = 5, col = "red") #add cutoff line
plot(cooksD3, main = "Cooks Distance for Influential Obs (1, 2, & 3)")
abline(h = 4/n, lty = 5, col = "red") #add cutoff line
influential_obs <- as.numeric(names(cooksD)[(cooksD > (4/n))])
influential_obs
## [1] 21 37 67 83 87 112 124 125 130 157 174 183 192 193 208 237 303 335 339
## [20] 349 384 421 434 517 525 540 558 609 618 620 643 698 717 729 757 762 765 767
## [39] 787 834 848 887 889 891 895 900 904 907 911 939 948 976 999
influential_obs1 <- as.numeric(names(cooksD1)[(cooksD1 > (4/n))])
influential_obs1
## [1] 21 37 64 67 96 192 193 208 309 396 505 517 522 559 618 620 645 666 717
## [20] 762 765 769 788 855 887 889 895 904 939 999
influential_obs2 <- as.numeric(names(cooksD2)[(cooksD2 > (4/n))])
influential_obs2
## [1] 3 4 21 29 37 64 67 74 77 84 86 89 96 97 98 164 169 182 183
## [20] 192 208 214 264 277 307 309 314 321 323 352 359 384 396 505 508 517 522 559
## [39] 617 618 620 623 636 645 646 709 710 717 718 728 757 762 765 769 787 788 789
## [58] 798 808 834 855 887 889 895 904 929 939 942 974 978 999
influential_obs3 <- as.numeric(names(cooksD3)[(cooksD3 > (4/n))])
influential_obs3
## [1] 4 21 37 64 67 82 86 96 128 165 189 192 208 296 314 321 323 336 348
## [20] 352 361 384 386 461 465 505 517 522 559 601 617 618 620 636 698 709 717 728
## [39] 731 757 762 765 769 786 788 791 855 887 889 895 900 904 911 920 939 942 948
## [58] 999
define new data frame with influential points removed
outliers_removed <- melb[-influential_obs,]
outliers_removed1 <- melb[-influential_obs1,]
outliers_removed2 <- melb[-influential_obs2,]
outliers_removed3 <- melb[-influential_obs3,]
outliers_present <- ggplot(data = melb, aes(x = car , y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Present (1)")
outliers_present1 <- ggplot(data = melb, aes(x = buildingarea, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Present (2)")
outliers_present2 <- ggplot(data = melb, aes(x = yearbuilt, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Present (3)")
outliers_present3 <- ggplot(data = melb, aes(x = yearbuilt+buildingarea+car, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Present (1, 2, & 3)")
outliers_removed <- ggplot(data = outliers_removed, aes(x = car, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Removed (1)")
outliers_removed1 <- ggplot(data = outliers_removed1, aes(x = buildingarea, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Removed (2)")
outliers_removed2 <- ggplot(data = outliers_removed2, aes(x = yearbuilt, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Removed (3)")
outliers_removed3 <- ggplot(data = outliers_removed3, aes(x = yearbuilt+buildingarea+car, y = price)) +
geom_point() +
geom_smooth(method = lm) +
ggtitle("Outliers Removed (1, 2, & 3)")
plot both scatterplots side by side
gridExtra::grid.arrange(outliers_present, outliers_removed, ncol = 2)
gridExtra::grid.arrange(outliers_present1, outliers_removed1, ncol = 2)
gridExtra::grid.arrange(outliers_present2, outliers_removed2, ncol = 2)
gridExtra::grid.arrange(outliers_present3, outliers_removed3, ncol = 2)
dffit <- as.data.frame(dffits(model))
head(dffit)
dffit1 <- as.data.frame(dffits(model1))
head(dffit1)
dffit2 <- as.data.frame(dffits(model2))
head(dffit2)
dffit3 <- as.data.frame(dffits(model3))
head(dffit3)
#find number of predictors in model
p <- length(model$coefficients)-1
p1 <- length(model1$coefficients)-1
p2 <- length(model2$coefficients)-1
p3 <- length(model3$coefficients)-1
#find number of observations
n <- nrow(melb)
#calculate DFFITS threshold value
threshdf <- 2*sqrt(p/n)
threshdf1 <- 2*sqrt(p1/n)
threshdf2 <- 2*sqrt(p2/n)
threshdf3 <- 2*sqrt(p3/n)
threshdf
## [1] 0.06324555
threshdf1
## [1] 0.06324555
threshdf2
## [1] 0.06324555
threshdf3
## [1] 0.1095445
## sort observations by DFFITS, descending
sort_dffit <- dffit[order(-dffit),]
sort_dffit1 <- dffit1[order(-dffit1),]
sort_dffit2 <- dffit2[order(-dffit2),]
sort_dffit3 <- dffit2[order(-dffit3),]
head(sort_dffit)
## [1] 0.4068252 0.3431635 0.2995003 0.2171134 0.2154575 0.2140119
head(sort_dffit1)
## [1] 0.2034640 0.1836657 0.1716274 0.1699512 0.1603054 0.1553892
head(sort_dffit2)
## [1] 0.4304203 0.3264060 0.2708635 0.2678575 0.2274852 0.2170418
head(sort_dffit3)
## [1] 0.326405982 -0.017145393 0.000232594 0.047296398 -0.040001980
## [6] 0.013443488
# plot DFFITS values for each observation
plot(dffits(model), type = 'h')
#add horizontal lines at absolute values for threshold
abline(h = threshdf, lty = 2)
abline(h = -threshdf, lty = 2)
plot(dffits(model1), type = 'h')
#add horizontal lines at absolute values for threshold
abline(h = threshdf1, lty = 2)
abline(h = -threshdf1, lty = 2)
plot(dffits(model2), type = 'h')
#add horizontal lines at absolute values for threshold
abline(h = threshdf2, lty = 2)
abline(h = -threshdf2, lty = 2)
plot(dffits(model3), type = 'h')
#add horizontal lines at absolute values for threshold
abline(h = threshdf3, lty = 2)
abline(h = -threshdf3, lty = 2)
#calculate DFBETAS for each observation in the model
dfbeta <- as.data.frame(dfbetas(model))
dfbeta1 <- as.data.frame(dfbetas(model1))
dfbeta2 <- as.data.frame(dfbetas(model2))
dfbeta3 <- as.data.frame(dfbetas(model3))
#display DFBETAS for each observation
head(dfbeta)
head(dfbeta1)
head(dfbeta2)
head(dfbeta3)
#calculate DFBETAS threshold value
threshdb <- 2/sqrt(n)
threshdb
## [1] 0.06324555
#specify 2 rows and 1 column in plotting region
par(mfrow=c(2,1))
#plot DFBETAS for disp with threshold lines
plotdb <- {plot(dfbeta, type='h')
abline(h = threshdb, lty = 2)
abline(h = -threshdb, lty = 2)}
plotdb1 <- {plot(dfbeta1, type='h')
abline(h = threshdb, lty = 2)
abline(h = -threshdb, lty = 2)}
plotdb2 <- {plot(dfbeta2, type='h')
abline(h = threshdb, lty = 2)
abline(h = -threshdb, lty = 2)}
plotdb3_1 <- {plot(dfbeta3$car, type='h')
abline(h = threshdb, lty = 2)
abline(h = -threshdb, lty = 2)}
plotdb3_2 <- {plot(dfbeta3$buildingarea, type='h')
abline(h = threshdb, lty = 2)
abline(h = -threshdb, lty = 2)}
plotdb3_3 <- {plot(dfbeta3$yearbuilt, type='h')
abline(h = threshdb, lty = 2)
abline(h = -threshdb, lty = 2)}
summary(influence.measures(model))
## Potentially influential observations of
## lm(formula = price ~ car, data = melb) :
##
## dfb.1_ dfb.car dffit cov.r cook.d hat
## 4 0.00 0.00 0.00 1.01_* 0.00 0.00
## 5 -0.01 0.02 0.02 1.01_* 0.00 0.01
## 21 0.11 -0.08 0.12 0.99_* 0.01 0.00
## 26 0.04 -0.03 0.04 1.01_* 0.00 0.00
## 37 0.05 0.01 0.13 0.97_* 0.01 0.00
## 40 0.03 -0.04 -0.05 1.01_* 0.00 0.01
## 52 0.01 -0.01 0.01 1.01_* 0.00 0.00
## 60 -0.02 0.02 0.03 1.01_* 0.00 0.01
## 64 0.03 0.01 0.08 0.99_* 0.00 0.00
## 67 -0.18 0.27 0.30_* 0.97_* 0.04 0.01
## 96 0.04 0.01 0.09 0.99_* 0.00 0.00
## 97 0.03 0.01 0.08 0.99_* 0.00 0.00
## 112 0.04 0.01 0.09 0.99_* 0.00 0.00
## 124 0.09 -0.12 -0.13 1.01_* 0.01 0.01_*
## 125 0.15 -0.19 -0.19_* 1.02_* 0.02 0.02_*
## 130 0.11 -0.14 -0.15_* 1.01_* 0.01 0.01_*
## 141 0.00 0.00 0.00 1.01_* 0.00 0.01
## 157 0.15 -0.19 -0.20_* 1.02_* 0.02 0.02_*
## 162 0.03 0.00 0.07 0.99_* 0.00 0.00
## 164 0.03 0.00 0.07 0.99_* 0.00 0.00
## 183 0.08 0.01 0.21_* 0.92_* 0.02 0.00
## 188 0.01 -0.01 0.01 1.01_* 0.00 0.00
## 192 -0.04 0.08 0.12 0.99_* 0.01 0.00
## 193 0.05 0.01 0.12 0.98_* 0.01 0.00
## 213 0.00 -0.01 -0.01 1.01_* 0.00 0.01
## 223 0.02 -0.02 0.02 1.01_* 0.00 0.00
## 224 0.01 -0.02 -0.02 1.01_* 0.00 0.01
## 228 0.03 -0.04 -0.05 1.01_* 0.00 0.01
## 242 -0.02 0.01 -0.02 1.01_* 0.00 0.00
## 253 0.04 -0.03 0.04 1.01_* 0.00 0.00
## 264 -0.02 0.02 -0.02 1.01_* 0.00 0.00
## 288 0.01 -0.01 -0.01 1.02_* 0.00 0.02_*
## 299 0.01 -0.01 -0.02 1.01_* 0.00 0.01
## 318 0.03 0.00 0.07 0.99_* 0.00 0.00
## 324 0.03 0.00 0.08 0.99_* 0.00 0.00
## 335 -0.13 0.19 0.22_* 0.99_* 0.02 0.01
## 337 0.03 0.01 0.08 0.99_* 0.00 0.00
## 339 0.09 -0.13 -0.13 1.01_* 0.01 0.01_*
## 349 -0.09 0.13 0.15_* 1.00 0.01 0.01
## 361 -0.01 0.02 0.02 1.01_* 0.00 0.01
## 386 0.02 -0.02 -0.03 1.01_* 0.00 0.01
## 393 -0.03 0.03 -0.03 1.01_* 0.00 0.00
## 394 0.03 -0.05 -0.06 1.01_* 0.00 0.01
## 399 0.03 -0.03 0.03 1.01_* 0.00 0.00
## 415 0.03 0.00 0.07 0.99_* 0.00 0.00
## 421 0.10 -0.07 0.11 0.99_* 0.01 0.00
## 434 -0.07 0.16 0.21_* 0.96_* 0.02 0.00
## 435 -0.03 0.05 0.05 1.01_* 0.00 0.01
## 460 0.02 -0.02 0.02 1.01_* 0.00 0.00
## 465 0.03 -0.05 -0.06 1.01_* 0.00 0.01
## 481 0.03 0.00 0.07 0.99_* 0.00 0.00
## 505 0.03 0.00 0.07 0.99_* 0.00 0.00
## 516 0.05 -0.07 -0.08 1.01_* 0.00 0.01_*
## 517 0.04 0.01 0.10 0.98_* 0.00 0.00
## 522 0.03 0.00 0.07 0.99_* 0.00 0.00
## 525 0.07 -0.10 -0.11 1.01_* 0.01 0.01_*
## 529 -0.01 0.01 0.01 1.01_* 0.00 0.01
## 540 -0.04 0.08 0.12 0.99_* 0.01 0.00
## 558 -0.33 0.40 0.41_* 1.03_* 0.08 0.04_*
## 559 0.03 0.01 0.09 0.99_* 0.00 0.00
## 609 0.17 -0.15 0.17_* 1.00 0.01 0.00
## 618 -0.08 0.13 0.14_* 1.00 0.01 0.01
## 620 0.05 0.01 0.13 0.97_* 0.01 0.00
## 628 0.01 -0.01 -0.02 1.01_* 0.00 0.01
## 637 0.03 0.00 0.08 0.99_* 0.00 0.00
## 643 0.08 -0.11 -0.11 1.01_* 0.01 0.01_*
## 666 0.01 -0.01 -0.01 1.01_* 0.00 0.01_*
## 672 0.03 -0.03 0.03 1.01_* 0.00 0.00
## 683 -0.02 0.02 0.02 1.01_* 0.00 0.01_*
## 698 0.08 -0.11 -0.11 1.02_* 0.01 0.02_*
## 700 0.02 -0.02 -0.02 1.02_* 0.00 0.02_*
## 709 -0.04 0.04 -0.04 1.01_* 0.00 0.00
## 713 0.03 -0.05 -0.06 1.01_* 0.00 0.01
## 717 0.11 -0.08 0.11 0.99_* 0.01 0.00
## 729 0.15 -0.14 0.15_* 1.00 0.01 0.00
## 741 0.01 -0.02 -0.02 1.01_* 0.00 0.01
## 762 0.20 -0.15 0.22_* 0.96_* 0.02 0.00
## 767 0.04 0.01 0.10 0.98_* 0.00 0.00
## 769 0.03 0.01 0.09 0.99_* 0.00 0.00
## 787 0.04 0.01 0.11 0.98_* 0.01 0.00
## 788 0.04 0.01 0.09 0.99_* 0.00 0.00
## 792 -0.03 0.04 0.04 1.01_* 0.00 0.01_*
## 829 0.02 -0.03 -0.03 1.01_* 0.00 0.01
## 831 -0.01 0.01 -0.01 1.01_* 0.00 0.00
## 834 -0.14 0.18 0.19_* 1.02_* 0.02 0.02_*
## 838 0.04 -0.04 0.04 1.01_* 0.00 0.00
## 855 0.03 0.01 0.08 0.99_* 0.00 0.00
## 868 0.04 -0.03 0.04 1.01_* 0.00 0.00
## 877 0.02 -0.02 -0.03 1.01_* 0.00 0.01
## 882 0.03 -0.05 -0.05 1.01_* 0.00 0.01
## 884 0.01 -0.02 -0.02 1.01_* 0.00 0.01
## 887 0.05 0.01 0.11 0.98_* 0.01 0.00
## 889 0.05 0.01 0.12 0.97_* 0.01 0.00
## 895 -0.05 0.10 0.14_* 0.99_* 0.01 0.00
## 900 0.09 -0.11 -0.12 1.02_* 0.01 0.02_*
## 904 -0.24 0.33 0.34_* 0.99_* 0.06 0.01_*
## 933 -0.02 0.02 -0.02 1.01_* 0.00 0.00
## 934 0.00 0.00 0.00 1.01_* 0.00 0.00
## 940 0.01 -0.01 -0.01 1.01_* 0.00 0.01
## 944 0.03 0.00 0.08 0.99_* 0.00 0.00
## 948 0.26 -0.32 -0.32_* 1.07_* 0.05 0.07_*
## 952 -0.03 0.05 0.06 1.01_* 0.00 0.01
## 956 -0.01 0.01 0.01 1.01_* 0.00 0.01
## 974 0.03 0.01 0.08 0.99_* 0.00 0.00
## 976 -0.05 0.10 0.14_* 0.99_* 0.01 0.00
## 998 0.01 -0.02 -0.02 1.01_* 0.00 0.01
## 999 -0.13 0.18 0.19_* 1.01 0.02 0.01_*
summary(influence.measures(model1))
## Potentially influential observations of
## lm(formula = price ~ buildingarea, data = melb) :
##
## dfb.1_ dfb.bldn dffit cov.r cook.d hat
## 21 0.10 0.00 0.11 0.98_* 0.01 0.00
## 37 0.17 0.01 0.17 0.95_* 0.01 0.00
## 64 0.11 0.00 0.11 0.98_* 0.01 0.00
## 67 0.18 0.02 0.18 0.94_* 0.02 0.00
## 96 0.11 0.00 0.11 0.98_* 0.01 0.00
## 192 0.11 0.00 0.11 0.98_* 0.01 0.00
## 193 0.15 0.00 0.16 0.96_* 0.01 0.00
## 309 0.09 0.00 0.09 0.99_* 0.00 0.00
## 505 0.09 0.00 0.09 0.99_* 0.00 0.00
## 517 0.13 0.00 0.13 0.97_* 0.01 0.00
## 522 0.09 0.02 0.10 0.99_* 0.00 0.00
## 559 0.11 0.00 0.11 0.98_* 0.01 0.00
## 620 0.17 -0.01 0.17 0.95_* 0.01 0.00
## 666 16.70_* -204.21_* -204.41_* 433.13_* 18295.05_* 1.00_*
## 717 0.10 0.00 0.10 0.98_* 0.01 0.00
## 762 0.20 -0.01 0.20_* 0.93_* 0.02 0.00
## 765 0.09 0.00 0.09 0.99_* 0.00 0.00
## 769 0.11 -0.01 0.11 0.98_* 0.01 0.00
## 788 0.11 0.00 0.11 0.98_* 0.01 0.00
## 855 0.11 -0.01 0.11 0.98_* 0.01 0.00
## 887 0.15 0.00 0.15 0.96_* 0.01 0.00
## 889 0.16 -0.02 0.16 0.96_* 0.01 0.00
## 895 0.13 0.01 0.13 0.97_* 0.01 0.00
## 904 0.15 0.00 0.16 0.96_* 0.01 0.00
summary(influence.measures(model2))
## Potentially influential observations of
## lm(formula = price ~ yearbuilt, data = melb) :
##
## dfb.1_ dfb.yrbl dffit cov.r cook.d hat
## 13 -0.03 0.03 -0.03 1.01_* 0.00 0.01
## 31 -0.04 0.04 -0.04 1.01_* 0.00 0.01
## 37 0.23 -0.23 0.27_* 0.97_* 0.04 0.01
## 64 0.07 -0.07 0.12 0.99_* 0.01 0.00
## 67 -0.07 0.08 0.20_* 0.94_* 0.02 0.00
## 96 -0.01 0.01 0.12 0.98_* 0.01 0.00
## 97 -0.09 0.09 0.15 0.98_* 0.01 0.00
## 164 -0.08 0.08 0.14 0.98_* 0.01 0.00
## 170 -0.04 0.04 -0.04 1.01_* 0.00 0.01
## 183 0.35 -0.35 0.43_* 0.89_* 0.09 0.01
## 192 -0.18 0.19 0.23_* 0.97_* 0.03 0.01
## 214 -0.02 0.02 0.09 0.99_* 0.00 0.00
## 240 -0.02 0.02 -0.02 1.01_* 0.00 0.01
## 251 -0.08 0.08 -0.08 1.01_* 0.00 0.01
## 303 0.04 -0.04 0.05 1.01_* 0.00 0.01
## 350 -0.03 0.03 -0.03 1.01_* 0.00 0.01
## 352 -0.16 0.16 0.20_* 0.98_* 0.02 0.01
## 358 -0.06 0.06 -0.07 1.01_* 0.00 0.01
## 361 -0.04 0.04 -0.04 1.01_* 0.00 0.01
## 383 0.06 -0.06 0.06 1.01_* 0.00 0.01
## 396 0.12 -0.12 0.13 1.01 0.01 0.01_*
## 416 -0.02 0.02 -0.02 1.01_* 0.00 0.01
## 469 -0.06 0.06 -0.07 1.01_* 0.00 0.01_*
## 512 0.06 -0.06 0.06 1.01_* 0.00 0.01
## 517 0.15 -0.15 0.18_* 0.99_* 0.02 0.01
## 518 -0.02 0.02 -0.02 1.01_* 0.00 0.01
## 522 -0.14 0.14 0.19_* 0.98_* 0.02 0.00
## 546 -0.03 0.03 -0.03 1.02_* 0.00 0.01_*
## 559 -0.08 0.09 0.15 0.98_* 0.01 0.00
## 618 -0.04 0.04 0.11 0.99_* 0.01 0.00
## 620 0.16 -0.16 0.22_* 0.96_* 0.02 0.00
## 645 0.12 -0.12 0.13 1.01_* 0.01 0.01_*
## 673 -0.01 0.01 -0.01 1.01_* 0.00 0.01
## 686 0.00 0.00 0.00 1.01_* 0.00 0.01
## 757 0.10 -0.09 0.10 1.01_* 0.01 0.01_*
## 762 0.28 -0.28 0.33_* 0.95_* 0.05 0.01
## 769 0.02 -0.02 0.11 0.98_* 0.01 0.00
## 787 0.13 -0.12 0.17 0.98_* 0.01 0.00
## 788 0.10 -0.10 0.14 0.99_* 0.01 0.00
## 810 0.00 0.00 0.00 1.02_* 0.00 0.01_*
## 836 -0.03 0.03 -0.03 1.01_* 0.00 0.01_*
## 838 -0.05 0.05 -0.05 1.01_* 0.00 0.01
## 839 -0.05 0.05 -0.05 1.01_* 0.00 0.01
## 840 0.02 -0.02 0.03 1.01_* 0.00 0.01
## 887 0.16 -0.16 0.20_* 0.98_* 0.02 0.00
## 889 -0.20 0.20 0.27_* 0.95_* 0.04 0.00
## 895 0.12 -0.11 0.16 0.98_* 0.01 0.00
## 904 0.01 -0.01 0.15 0.96_* 0.01 0.00
## 939 -0.16 0.16 0.20_* 0.98_* 0.02 0.01
## 942 -0.13 0.14 0.17 0.98_* 0.02 0.00
## 945 -0.04 0.04 -0.04 1.01_* 0.00 0.01
summary(influence.measures(model3))
## Potentially influential observations of
## lm(formula = price ~ yearbuilt + buildingarea + car, data = melb) :
##
## dfb.1_ dfb.yrbl dfb.bldn dfb.car dffit cov.r cook.d hat
## 13 -0.01 0.01 0.00 0.01 -0.02 1.03_* 0.00 0.02
## 21 0.09 -0.09 0.09 -0.04 0.13 1.03_* 0.00 0.03_*
## 37 0.25 -0.26 0.30 -0.04 0.40_* 0.96_* 0.04 0.02
## 64 0.10 -0.10 0.06 0.00 0.15 0.97_* 0.01 0.00
## 67 0.02 -0.04 0.49 0.14 0.59_* 0.96_* 0.08 0.03_*
## 96 0.01 -0.01 0.18 -0.04 0.21 0.96_* 0.01 0.01
## 124 0.01 -0.01 0.05 -0.09 -0.10 1.03_* 0.00 0.03_*
## 128 0.00 0.01 -0.22 0.00 -0.24 1.02 0.01 0.03_*
## 165 -0.05 0.05 -0.15 0.03 -0.16 1.03_* 0.01 0.03_*
## 170 -0.02 0.02 0.00 0.01 -0.02 1.03_* 0.00 0.02
## 192 -0.17 0.16 0.14 0.07 0.30_* 0.94_* 0.02 0.01
## 214 0.00 0.00 0.06 -0.02 0.06 1.05_* 0.00 0.04_*
## 237 0.02 -0.02 0.01 -0.02 0.03 1.03_* 0.00 0.02
## 277 -0.02 0.02 0.07 -0.02 0.08 1.03_* 0.00 0.02
## 288 0.00 0.00 0.00 -0.04 -0.04 1.04_* 0.00 0.03_*
## 296 -0.30 0.32 -0.77 0.17 -0.82_* 0.96_* 0.16 0.05_*
## 361 -0.13 0.13 0.00 -0.11 -0.17 1.03_* 0.01 0.03_*
## 508 -0.02 0.02 0.04 0.02 0.06 1.03_* 0.00 0.02
## 517 0.20 -0.20 0.05 0.02 0.23 0.96_* 0.01 0.01
## 522 -0.03 0.02 0.18 -0.05 0.18 1.07_* 0.01 0.06_*
## 559 -0.07 0.07 0.15 -0.04 0.22 0.94_* 0.01 0.01
## 601 -0.07 0.10 -0.93 0.24 -0.94_* 1.02 0.22 0.09_*
## 618 -0.01 0.01 -0.01 0.20 0.24 0.97_* 0.01 0.01
## 620 0.23 -0.23 0.01 0.04 0.30_* 0.88_* 0.02 0.01
## 628 0.01 -0.01 -0.09 0.09 0.12 1.03_* 0.00 0.03_*
## 643 0.00 0.00 0.04 -0.09 -0.10 1.03_* 0.00 0.02
## 698 0.04 -0.03 -0.10 -0.23 -0.30_* 1.03_* 0.02 0.04_*
## 700 -0.04 0.04 0.04 -0.10 -0.11 1.06_* 0.00 0.05_*
## 762 0.38 -0.37 0.18 -0.18 0.49_* 0.85_* 0.06 0.01
## 769 0.02 0.00 -0.41 0.13 0.45_* 0.88_* 0.05 0.01
## 788 0.13 -0.13 0.07 0.01 0.18 0.97_* 0.01 0.01
## 792 -0.02 0.02 -0.01 -0.06 -0.07 1.03_* 0.00 0.02
## 834 0.06 -0.06 0.00 0.11 0.12 1.05_* 0.00 0.04_*
## 838 -0.01 0.01 0.00 0.01 -0.02 1.03_* 0.00 0.02
## 855 0.23 -0.22 -0.28 0.12 0.40_* 0.95_* 0.04 0.02
## 887 0.19 -0.20 0.20 -0.02 0.29_* 0.97_* 0.02 0.01
## 889 -0.39 0.41 -0.67 0.15 0.80_* 0.74_* 0.15 0.02
## 900 0.03 -0.03 0.00 -0.15 -0.17 1.04_* 0.01 0.03_*
## 904 0.08 -0.09 -0.09 0.53 0.57_* 0.91_* 0.08 0.02
## 939 -0.06 0.06 0.20 -0.01 0.24 1.03 0.01 0.03_*
## 942 -0.29 0.30 -0.46 0.10 0.56_* 0.88_* 0.07 0.02
## 948 -0.01 0.01 0.09 -0.32 -0.32_* 1.16_* 0.03 0.14_*
## 999 0.12 -0.13 -0.05 0.23 0.26 1.02 0.02 0.03_*
covra = as.data.frame(covratio(model))
head(covra)
covra1 = as.data.frame(covratio(model1))
head(covra1)
covra2 = as.data.frame(covratio(model2))
head(covra2)
covra3 = as.data.frame(covratio(model3))
head(covra3)
#find number of predictors in model
p <- length(model$coefficients)-1
p1 <- length(model1$coefficients)-1
p2 <- length(model2$coefficients)-1
p3 <- length(model3$coefficients)-1
#calculate COVRATIO threshold value
threshcv <- (3*p)/n
threshcv1 <- (3*p1)/n
threshcv2 <- (3*p2)/n
threshcv3 <- (3*p3)/n
threshcv
## [1] 0.003
threshcv1
## [1] 0.003
threshcv2
## [1] 0.003
threshcv3
## [1] 0.009
sort_covra <- covra[order(-covra),]
sort_covra1 <- covra1[order(-covra1),]
sort_covra2 <- covra2[order(-covra2),]
sort_covra3 <- covra3[order(-covra3),]
head(sort_covra)
## [1] 1.073004 1.034317 1.020706 1.020672 1.019312 1.019105
head(sort_covra1)
## [1] 433.125061 1.005869 1.005853 1.005852 1.005850 1.005848
head(sort_covra2)
## [1] 1.015580 1.015355 1.014225 1.013914 1.013289 1.013211
head(sort_covra3)
## [1] 1.159737 1.071505 1.055904 1.053008 1.051011 1.042829
plot(covratio(model), type = 'l')
abline(l = threshcv, lty = 2)
abline(l = -threshcv, lty = 2)
plot(covratio(model1), type = 'l')
abline(l = threshcv1, lty = 2)
abline(l = -threshcv1, lty = 2)
plot(covratio(model2), type = 'l')
abline(l = threshcv2, lty = 2)
abline(l = -threshcv2, lty = 2)
plot(covratio(model3), type = 'l')
abline(l = threshcv3, lty = 2)
abline(l = -threshcv3, lty = 2)