library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Import the Munich rent CSV into a data frame named `data`.
# NOTE(review): this is an absolute path under one user's Desktop, so the
# script only runs unchanged on a machine with this exact directory layout.
data <- read.csv(
  file = "C:/Users/Rishabh/Desktop/STAT 371/Assignments/Assignment 4/munichrent_student.csv"
)

# Show the first six rows as a quick check that the import worked.
head(data, n = 6L)

##   age bestneighborhood_ centralheating_ extrabath_ goodneighborhood_ index
## 1  92                no             yes         no               yes    26
## 2  44                no             yes         no                no   697
## 3  43                no             yes         no                no  1360
## 4  44                no             yes         no                no  1932
## 5  53                no             yes         no                no  1580
## 6  44                no             yes         no                no   203
##   numrooms premiumkitchen_ rentpersqm size tiledbath_ warmwater_
## 1        2              no       6.72   65        yes        yes
## 2        3              no       7.71   71        yes        yes
## 3        4              no       6.26   74        yes        yes
## 4        4              no       7.82   98        yes        yes
## 5        3              no      10.23   85        yes        yes
## 6        5              no       7.96  140        yes        yes

# Ordinary least-squares fit of `rentpersqm` on every other column of `data`
# except `index`. The formula is written inline (one term per line) so the
# `Call:` echoed by summary() still shows the full formula.
model <- lm(
  formula = rentpersqm ~
    age +
    bestneighborhood_ +
    centralheating_ +
    extrabath_ +
    goodneighborhood_ +
    numrooms +
    premiumkitchen_ +
    size +
    tiledbath_ +
    warmwater_,
  data = data
)

# Coefficient table, residual summary, R-squared and overall F-test.
summary(model)

## 
## Call:
## lm(formula = rentpersqm ~ age + bestneighborhood_ + centralheating_ + 
##     extrabath_ + goodneighborhood_ + numrooms + premiumkitchen_ + 
##     size + tiledbath_ + warmwater_, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3221 -1.3789 -0.0517  1.2773  6.7479 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.487076   0.929826   6.977 4.89e-11 ***
## age                  -0.007619   0.006406  -1.189  0.23577    
## bestneighborhood_yes  1.555038   0.869309   1.789  0.07524 .  
## centralheating_yes    1.291070   0.615381   2.098  0.03723 *  
## extrabath_yes         0.597160   0.559220   1.068  0.28695    
## goodneighborhood_yes  0.744803   0.307822   2.420  0.01648 *  
## numrooms             -0.466262   0.277599  -1.680  0.09467 .  
## premiumkitchen_yes    1.403683   0.641975   2.187  0.03000 *  
## size                 -0.018404   0.011876  -1.550  0.12288    
## tiledbath_yes         0.768893   0.356488   2.157  0.03227 *  
## warmwater_yes         2.646480   0.765207   3.459  0.00067 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.994 on 190 degrees of freedom
## Multiple R-squared:  0.3879, Adjusted R-squared:  0.3557 
## F-statistic: 12.04 on 10 and 190 DF,  p-value: 5.111e-16

Ques 1:

QQ Plot

# Raw residuals of the fitted model. The name `qqplot` is kept because the
# residuals-vs-fitted plot further down reads this variable.
# NOTE(review): this name masks stats::qqplot() for the rest of the session.
qqplot <- residuals(model)

# Normal Q-Q plot of the residuals ...
qqnorm(y = qqplot)

# ... with the reference line through the quartiles drawn in palette colour 2.
qqline(y = qqplot, col = 2)

(ii) Studentized residuals vs. Index

# Studentized residuals of the fitted model; `s` is reused by the later
# chunks that look up the largest values.
s <- rstudent(model)

# Plot the studentized residuals against observation position.
# The title previously read "Studendized"; the spelling is corrected here.
plot(
  s,
  xlab = "index",
  ylab = "Studentized residuals",
  main = "Studentized residuals vs Index"
)

# Dashed blue guide lines at -3 and +3 to make extreme residuals stand out.
abline(h = c(-3, 3), col = "blue", lty = 2)

Residual vs. Fitted values

# Fitted values of the model, used as the horizontal axis below.
yhat <- fitted(model)

# Scatter plot of the raw residuals (stored earlier in `qqplot`) against the
# fitted values, using explicit x/y arguments instead of a formula.
plot(
  x = yhat,
  y = qqplot,
  xlab = "Fitted values",
  ylab = "Residuals",
  main = "Residual vs. Fitted values"
)

Leverage vs. Index

# Hat values (diagonal of the hat matrix) for each observation in the fit.
leverage <- hatvalues(model)

# Plot leverage against observation position.
plot(
  x = leverage,
  xlab = "index",
  ylab = "leverage",
  main = "Leverage vs. Index"
)

Conclusions for each of the plots above:

# Position of the observation whose hat value is the largest.
# which.max() keeps the element name, so the printed result is a named integer.
max_leverage_index <- which.max(x = leverage)
print(x = max_leverage_index)

## 201 
## 201

# Studentized residual of the observation with the largest leverage.
# Uses the computed `max_leverage_index` instead of a hard-coded 201, so the
# lookup stays correct if the data or the model changes.
max_leverage_studentized_res <- s[max_leverage_index]

# Legacy name (it refers to a stale index, 78) retained as an alias so any
# later code that still uses it keeps working.
index_78_studentized_res <- max_leverage_studentized_res

print(max_leverage_studentized_res)

##        201 
## -0.5514178

# Position of the largest (most positive) studentized residual.
# Despite its name, this variable stores the position, not the residual value.
max_studentized_res <- which.max(x = s)
print(x = max_studentized_res)

## 55 
## 55

The index of the observation with the largest leverage is 201.

# Positions of the two largest studentized residuals: rank all residuals from
# largest to smallest and keep the first two positions.
# NOTE(review): the ranking uses signed values, not absolute values — confirm
# that large negative residuals are meant to be excluded here.
residual_ranking <- order(s, decreasing = TRUE)
top_s_indices <- residual_ranking[seq_len(2)]

# Display the two positions.
print(x = top_s_indices)

## [1] 55 96

Laurier Project

Rishabh Ballkooram

2023-11-07

Ques 1:

Conclusions for each of the plots above: