library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the rent data set from its CSV file and preview the first rows.
# NOTE(review): the path is absolute and machine-specific, so this script
# only runs where that exact file location exists.
rent_csv_path <- "C:/Users/Rishabh/Desktop/STAT 371/Assignments/Assignment 4/munichrent_student.csv"
data <- read.csv(file = rent_csv_path)
head(x = data)
## age bestneighborhood_ centralheating_ extrabath_ goodneighborhood_ index
## 1 92 no yes no yes 26
## 2 44 no yes no no 697
## 3 43 no yes no no 1360
## 4 44 no yes no no 1932
## 5 53 no yes no no 1580
## 6 44 no yes no no 203
## numrooms premiumkitchen_ rentpersqm size tiledbath_ warmwater_
## 1 2 no 6.72 65 yes yes
## 2 3 no 7.71 71 yes yes
## 3 4 no 6.26 74 yes yes
## 4 4 no 7.82 98 yes yes
## 5 3 no 10.23 85 yes yes
## 6 5 no 7.96 140 yes yes
# Fit a linear regression of `rentpersqm` on the apartment attributes
# (every column shown by head(data) except `index`), then print the
# coefficient table and overall fit statistics.
model <- lm(
  formula = rentpersqm ~
    age +
    bestneighborhood_ +
    centralheating_ +
    extrabath_ +
    goodneighborhood_ +
    numrooms +
    premiumkitchen_ +
    size +
    tiledbath_ +
    warmwater_,
  data = data
)
summary(object = model)
##
## Call:
## lm(formula = rentpersqm ~ age + bestneighborhood_ + centralheating_ +
## extrabath_ + goodneighborhood_ + numrooms + premiumkitchen_ +
## size + tiledbath_ + warmwater_, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3221 -1.3789 -0.0517 1.2773 6.7479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.487076 0.929826 6.977 4.89e-11 ***
## age -0.007619 0.006406 -1.189 0.23577
## bestneighborhood_yes 1.555038 0.869309 1.789 0.07524 .
## centralheating_yes 1.291070 0.615381 2.098 0.03723 *
## extrabath_yes 0.597160 0.559220 1.068 0.28695
## goodneighborhood_yes 0.744803 0.307822 2.420 0.01648 *
## numrooms -0.466262 0.277599 -1.680 0.09467 .
## premiumkitchen_yes 1.403683 0.641975 2.187 0.03000 *
## size -0.018404 0.011876 -1.550 0.12288
## tiledbath_yes 0.768893 0.356488 2.157 0.03227 *
## warmwater_yes 2.646480 0.765207 3.459 0.00067 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.994 on 190 degrees of freedom
## Multiple R-squared: 0.3879, Adjusted R-squared: 0.3557
## F-statistic: 12.04 on 10 and 190 DF, p-value: 5.111e-16
# Normal Q-Q plot of the model's raw residuals, with a reference line.
# NOTE: despite its name, `qqplot` holds the residual vector (and reuses the
# name of stats::qqplot()). The name is kept because the residuals-vs-fitted
# plot further down reads this variable.
qqplot <- residuals(model)
qqnorm(y = qqplot)
qqline(y = qqplot, col = 2)
(ii) Studentized residuals vs Index
# Index plot of the studentized residuals (rstudent()) of `model`.
# `s` is reused by the later residual look-ups, so its name is unchanged.
s <- rstudent(model)
plot(
  s,
  xlab = "index",
  ylab = "Studentized residuals",
  main = "Studentized residuals vs Index"
)
# Dashed reference lines at -3 and +3 make unusually large residuals stand out.
abline(h = c(-3, 3), col = "blue", lty = 2)
# Scatter plot of the raw residuals (stored in `qqplot`) against the
# model's fitted values.
yhat <- fitted.values(model)
plot(
  x = yhat,
  y = qqplot,
  xlab = "Fitted values",
  ylab = "Residuals",
  main = "Residual vs. Fitted values"
)
# Leverage of every observation (hatvalues() of the fitted model), drawn as
# an index plot.
leverage <- hatvalues(model)
plot(
  x = leverage,
  xlab = "index",
  ylab = "leverage",
  main = "Leverage vs. Index"
)

# Position of the observation whose leverage is highest.
max_leverage_index <- which.max(x = leverage)
print(x = max_leverage_index)
## 201
## 201
# Studentized residual of the observation with the largest leverage.
# The index comes from `max_leverage_index` rather than a hard-coded 201, so
# the look-up stays correct if the data (and hence the max-leverage row)
# changes.
# NOTE: the "78" in the variable name is stale; the value belongs to the
# max-leverage observation. The name is kept so existing references work.
index_78_studentized_res <- s[max_leverage_index]
print(index_78_studentized_res)
## 201
## -0.5514178
# Position of the largest (most positive) studentized residual.
# NOTE: despite its name, this stores the observation's index, not the
# residual value itself.
max_studentized_res <- which.max(x = s)
print(x = max_studentized_res)
## 55
## 55
The index of the observation with the largest leverage is 201.
# Rank all observations from the largest to the smallest studentized
# residual, keep the positions of the first two, and show them.
# NOTE(review): the ranking uses signed values, so large negative residuals
# are not considered. Confirm that is the intended reading of "largest".
residual_ranking <- order(s, decreasing = TRUE)
top_s_indices <- residual_ranking[seq_len(2)]
print(x = top_s_indices)
## [1] 55 96