# Load the car sales data from a comma-separated file with a header row.
# NOTE(review): dec = ";" looks unintended - the printed values use "." as the
# decimal mark (e.g. Engine.size 1.0). Confirm before changing it, because it
# may affect how decimal columns are parsed.
mydata_cars <- read.table(
  file = "./car_sales_data.csv",
  header = TRUE,
  sep = ",",
  dec = ";"
)
head(mydata_cars, n = 10) # Showing first 10 units
## Manufacturer Model Engine.size Fuel.type Year.of.manufacture Mileage
## 1 Ford Fiesta 1.0 Petrol 2002 127300
## 2 Porsche 718 Cayman 4.0 Petrol 2016 57850
## 3 Ford Mondeo 1.6 Diesel 2014 39190
## 4 Toyota RAV4 1.8 Hybrid 1988 210814
## 5 VW Polo 1.0 Petrol 2006 127869
## 6 Ford Focus 1.4 Petrol 2018 33603
## 7 Ford Mondeo 1.8 Diesel 2010 86686
## 8 Toyota Prius 1.4 Hybrid 2015 30663
## 9 VW Polo 1.2 Petrol 2012 73470
## 10 Ford Focus 2.0 Diesel 1992 262514
## Price
## 1 3074
## 2 49704
## 3 24072
## 4 1705
## 5 4101
## 6 29204
## 7 14350
## 8 30297
## 9 9977
## 10 1049
Description:
# Only petrol and diesel cars are of interest, so a new dataset is created
# that keeps just those two fuel types. Matching with %in% never returns NA,
# so rows with a missing or unexpected Fuel.type are dropped instead of
# slipping through and being coded as petrol (0) below.
mydata_cars2 <- mydata_cars[mydata_cars$Fuel.type %in% c("Petrol", "Diesel"), ]
# Fuel.type becomes a dummy variable: 1 = Diesel, 0 = Petrol
mydata_cars2$Fuel.type <- ifelse(mydata_cars2$Fuel.type == "Diesel", 1, 0)
head(mydata_cars2, 10) # Showing first 10 units
## Manufacturer Model Engine.size Fuel.type Year.of.manufacture Mileage
## 1 Ford Fiesta 1.0 0 2002 127300
## 2 Porsche 718 Cayman 4.0 0 2016 57850
## 3 Ford Mondeo 1.6 1 2014 39190
## 5 VW Polo 1.0 0 2006 127869
## 6 Ford Focus 1.4 0 2018 33603
## 7 Ford Mondeo 1.8 1 2010 86686
## 9 VW Polo 1.2 0 2012 73470
## 10 Ford Focus 2.0 1 1992 262514
## 11 VW Golf 2.0 1 2014 83047
## 12 BMW Z4 2.0 0 1990 293666
## Price
## 1 3074
## 2 49704
## 3 24072
## 5 4101
## 6 29204
## 7 14350
## 9 9977
## 10 1049
## 11 17173
## 12 719
#I would like to find the most popular/common manufacturer
# Frequency of each manufacturer; the name of the largest count is the most
# common brand.
tab <- table(mydata_cars2$Manufacturer)
most_common <- names(tab)[which.max(tab)]
most_common
## [1] "VW"
# I am only interested in the most common brand, which is VW; next I randomly select one VW model to analyse further
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only VW cars; Manufacturer is then constant, so the column is dropped.
mydata_cars_VW <- filter(mydata_cars2, Manufacturer == "VW")
mydata_cars_VW$Manufacturer <- NULL
head(mydata_cars_VW, n = 5) # Showing first 5 units
## Model Engine.size Fuel.type Year.of.manufacture Mileage Price
## 1 Polo 1.0 0 2006 127869 4101
## 2 Polo 1.2 0 2012 73470 9977
## 3 Golf 2.0 1 2014 83047 17173
## 4 Golf 1.2 1 2007 92697 7792
## 5 Golf 1.6 1 1989 222390 933
# Count Golf units in the filtered VW data, i.e. the rows that are actually
# analysed below. Counting in the unfiltered mydata_cars also included rows
# that the fuel-type filter had already removed.
sum(mydata_cars_VW$Model == "Golf")
## [1] 4537
Within the VW brand, the Golf model was randomly selected by computer for analysis. Focusing on a single model allows me to examine relationships between price, mileage, year of manufacture, engine size and fuel type without variation caused by differences between models, ensuring clearer and more interpretable results.
# Restrict the VW data to the Golf model only.
golf_rows <- mydata_cars_VW$Model == "Golf"
mydata_golf <- mydata_cars_VW[golf_rows, ]
head(mydata_golf, n = 5) # Showing first 5 units
## Model Engine.size Fuel.type Year.of.manufacture Mileage Price
## 3 Golf 2.0 1 2014 83047 17173
## 4 Golf 1.2 1 2007 92697 7792
## 5 Golf 1.6 1 1989 222390 933
## 7 Golf 2.0 0 2020 18985 36387
## 20 Golf 1.2 0 2002 139884 3713
# Treat engine size as a categorical variable and list its categories.
mydata_golf$Engine.size <- as.factor(mydata_golf$Engine.size)
levels(mydata_golf$Engine.size)
## [1] "1.2" "1.4" "1.6" "1.8" "2.0"
# Descriptive statistics for the numeric variables (every column except
# Model, Engine.size and Fuel.type), rounded to two decimals.
round(
  stat.desc(mydata_golf[, c("Year.of.manufacture", "Mileage", "Price")]),
  digits = 2
)
## Year.of.manufacture Mileage Price
## nbr.val 4537.00 4.537000e+03 4537.00
## nbr.null 0.00 0.000000e+00 0.00
## nbr.na 0.00 0.000000e+00 0.00
## min 1984.00 6.300000e+02 135.00
## max 2022.00 4.219790e+05 46830.00
## range 38.00 4.213490e+05 46695.00
## sum 9093222.00 5.092727e+08 45983700.00
## median 2004.00 1.021160e+05 6472.00
## mean 2004.24 1.122488e+05 10135.27
## SE.mean 0.14 1.054210e+03 142.41
## CI.mean.0.95 0.28 2.066770e+03 279.20
## var 92.47 5.042236e+09 92016939.14
## std.dev 9.62 7.100871e+04 9592.55
## coef.var 0.00 6.300000e-01 0.95
# Frequencies of the fuel dummy (0 = Petrol, 1 = Diesel).
table(mydata_golf[["Fuel.type"]])
##
## 0 1
## 3002 1535
# Frequencies of the engine-size categories.
table(mydata_golf[["Engine.size"]])
##
## 1.2 1.4 1.6 1.8 2.0
## 912 952 885 917 871
Description:
# Histogram of price, in bins of width 1000.
ggplot(data = mydata_golf, mapping = aes(x = Price)) +
  geom_histogram(binwidth = 1000, colour = "black", fill = "skyblue") +
  labs(
    title = "Distribution of used VW Golf Prices",
    x = "Price (€)",
    y = "Count"
  ) +
  theme_minimal()

# Histogram of mileage, in bins of width 10000.
ggplot(data = mydata_golf, mapping = aes(x = Mileage)) +
  geom_histogram(binwidth = 10000, colour = "black", fill = "lightgreen") +
  labs(
    title = "Distribution of used VW Golf Mileage",
    x = "Mileage (km)",
    y = "Count"
  ) +
  theme_minimal()

# Histogram of year of manufacture, one bin per year.
ggplot(data = mydata_golf, mapping = aes(x = Year.of.manufacture)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "lightcoral") +
  labs(
    title = "Distribution of Year of Manufacture",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()

# Bar chart of the fuel dummy; the axis labels follow the level order 0, 1.
ggplot(data = mydata_golf, mapping = aes(x = factor(Fuel.type))) +
  geom_bar(fill = "lightblue") +
  scale_x_discrete(labels = c("Petrol", "Diesel")) +
  labs(
    title = "Distribution of Fuel Type",
    x = "Fuel Type",
    y = "Count"
  ) +
  theme_minimal()

# Bar chart of the engine-size categories.
ggplot(data = mydata_golf, mapping = aes(x = Engine.size)) +
  geom_bar(fill = "pink") +
  labs(
    title = "Distribution of Engine Size",
    x = "Engine Size (L)",
    y = "Count"
  ) +
  theme_minimal()
Explanation:
As expected, both the price and the mileage distributions are skewed to the right. We can see long right tails because a few cars were sold with very high mileage and a few at very high prices for a used car. In contrast, year of manufacture is roughly normally distributed.
The last two graphs show the frequency distributions of engine size and fuel type. More petrol cars were sold than diesel cars, while engine size is quite evenly distributed.
# Price compared across engine-size categories.
ggplot(data = mydata_golf, mapping = aes(x = Engine.size, y = Price)) +
  geom_boxplot(fill = "pink") +
  labs(
    title = "Price by Engine Size",
    x = "Engine Size (L)",
    y = "Price (€)"
  ) +
  theme_minimal()

# Price compared across the fuel dummy (labels follow the level order 0, 1).
ggplot(data = mydata_golf, mapping = aes(x = factor(Fuel.type), y = Price)) +
  geom_boxplot(fill = "lightblue") +
  scale_x_discrete(labels = c("Petrol", "Diesel")) +
  labs(
    title = "Price by Fuel Type",
    x = "Fuel Type",
    y = "Price (€)"
  ) +
  theme_minimal()

library(car)
# Scatterplot matrix of the three numeric variables with red regression lines.
scatterplotMatrix(
  mydata_golf[, c("Year.of.manufacture", "Mileage", "Price")],
  smooth = FALSE,
  regLine = list(col = "red")
)
Explanation:
Scatterplot matrix (year, mileage and price):
Boxplot: Price by engine size:
Boxplot: Price by fuel type:
# Multiple regression of price on year, mileage, fuel dummy and engine size.
# The model is only printed here, not stored.
lm(
  formula = Price ~ Year.of.manufacture + Mileage + Fuel.type + Engine.size,
  data = mydata_golf
)
##
## Call:
## lm(formula = Price ~ Year.of.manufacture + Mileage + Fuel.type +
## Engine.size, data = mydata_golf)
##
## Coefficients:
## (Intercept) Year.of.manufacture Mileage
## -1.428e+06 7.186e+02 -2.918e-02
## Fuel.type Engine.size1.4 Engine.size1.6
## 3.010e+02 6.929e+02 1.432e+03
## Engine.size1.8 Engine.size2.0
## 1.982e+03 2.722e+03
Explanation of fit of multiple regression:
library(readxl)
# Read the business school data from the Excel workbook.
# NOTE(review): the absolute path only resolves on the author's machine.
mydata_MBA <- read_excel(
  path = "D:/IMB/R/R Take Home Exam 2025/Business School.xlsx"
)
head(mydata_MBA, n = 10) # Showing first 10 units
## # A tibble: 10 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade`
## <dbl> <chr> <dbl> <dbl>
## 1 1 Business 68.4 90.2
## 2 2 Computer Science 70.2 68.7
## 3 3 Finance 76.4 83.3
## 4 4 Business 82.6 88.7
## 5 5 Finance 76.9 75.4
## 6 6 Computer Science 83.3 82.1
## 7 7 Engineering 76 66.9
## 8 8 Engineering 82.8 76.8
## 9 9 Business 76 72.3
## 10 10 Finance 76.9 72.4
## # ℹ 5 more variables: `Work Experience` <chr>, `Employability (Before)` <dbl>,
## # `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>
# Bar chart with one bar per undergraduate degree.
ggplot(data = mydata_MBA, mapping = aes(x = `Undergrad Degree`)) +
  geom_bar(fill = "lightblue") +
  labs(
    title = "Distribution of undergraduate degrees",
    x = "Undergraduate degree",
    y = "Count"
  ) +
  theme_minimal()
Explanation:
From the graph we can see that Business is the most common undergraduate degree.
# Descriptive statistics of annual salary, rounded to whole numbers (this is
# why coef.var is shown as 0 in the output).
round(stat.desc(mydata_MBA[["Annual Salary"]]), digits = 0)
## nbr.val nbr.null nbr.na min max range
## 100 0 0 20000 340000 320000
## sum median mean SE.mean CI.mean.0.95 var
## 10905800 103500 109058 4150 8235 1722373475
## std.dev coef.var
## 41501 0
Explanation:
# Histogram of annual salary, in bins of width 5000.
ggplot(data = mydata_MBA, mapping = aes(x = `Annual Salary`)) +
  geom_histogram(binwidth = 5000, fill = "lightblue", colour = "black") +
  labs(
    title = "Annual Salary distribution",
    x = "Annual Salary",
    y = "Frequency"
  ) +
  theme_minimal()
# Five-number summary and mean, for easier description of the distribution.
summary(mydata_MBA[["Annual Salary"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
Description:
From the histogram above we can see that the distribution is skewed to the right. There are a few units with an annual salary above 200,000; they stretch the range to the right and are outliers. The mean is higher than the median, which is consistent with the right skew. With the help of the summary function we can also say that 75% of people have a salary of 124,000 or less, while the maximum annual salary is 340,000.
Two-sided t-test for the hypothesis:
$H_0: \mu_{\text{MBA Grade}} = 74$, $H_1: \mu_{\text{MBA Grade}} \neq 74$
# One-sample, two-sided t-test of the mean MBA grade against 74.
t.test(x = mydata_MBA$`MBA Grade`, alternative = "two.sided", mu = 74)
##
## One Sample t-test
##
## data: mydata_MBA$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
#install.packages("effectsize")
library(effectsize)
# Cohen's d for the deviation of the mean MBA grade from 74.
cohens_d(x = mydata_MBA$`MBA Grade`, mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
# Sample mean of the MBA grade, for comparison with the hypothesised 74.
mean(mydata_MBA[["MBA Grade"]])
## [1] 76.04055
We can reject H0 at α = 5% (p = 0.009): the mean of the variable MBA Grade is not 74. Even though we rejected H0, Cohen's d (0.27) shows that the effect size is small.
library(readxl)
# Read the apartments data from the Excel workbook.
# NOTE(review): the absolute path only resolves on the author's machine.
Apartments <- read_excel(
  path = "D:/IMB/R/R Take Home Exam 2025/Apartments.xlsx"
)
head(Apartments, n = 10) # Showing first 10 units
## # A tibble: 10 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
## 7 14 20 1850 0 1
## 8 18 6 1970 1 1
## 9 22 7 2270 1 0
## 10 25 2 2570 1 0
Description:
# Turn the 0/1 indicator columns into labelled factors.
Apartments$Parking <- factor(
  Apartments$Parking,
  levels = 0:1,
  labels = c("No parking", "Has parking")
)
Apartments$Balcony <- factor(
  Apartments$Balcony,
  levels = 0:1,
  labels = c("No balcony", "Has balcony")
)
head(Apartments, n = 10) # Showing first 10 units
## # A tibble: 10 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <fct> <fct>
## 1 7 28 1640 No parking Has balcony
## 2 18 1 2800 Has parking No balcony
## 3 7 28 1660 No parking No balcony
## 4 28 29 1850 No parking Has balcony
## 5 18 18 1640 Has parking Has balcony
## 6 28 12 1770 No parking Has balcony
## 7 14 20 1850 No parking Has balcony
## 8 18 6 1970 Has parking Has balcony
## 9 22 7 2270 Has parking No balcony
## 10 25 2 2570 Has parking No balcony
$H_0: \mu_{\text{Price}} = 1900$ EUR
# One-sample, two-sided t-test of the mean price against 1900.
t.test(x = Apartments$Price, alternative = "two.sided", mu = 1900)
##
## One Sample t-test
##
## data: Apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
We can reject H0 at α = 5% (p = 0.005): the mean price is not 1900 EUR. The 95% confidence interval for the mean apartment price runs from 1937.44 EUR to 2100.44 EUR, and the sample estimate of the mean price is 2018.94 EUR.
# Simple regression of price on age. The formula refers to the columns by
# name so that lm() looks them up in `data`; writing Apartments$... inside the
# formula bypasses `data`, breaks predict(fit1, newdata = ...) and produces
# awkward term names. The estimates are unchanged.
fit1 <- lm(Price ~ Age, data = Apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# Pearson correlation between price and age.
with(Apartments, cor(Price, Age))
## [1] -0.230255
# Scatterplot matrix of the three numeric variables with regression lines.
scatterplotMatrix(
  Apartments[, c("Age", "Distance", "Price")],
  smooth = FALSE,
  regLine = list(col = "cyan3")
)
From the scatter plot matrix we can see that there is no multicollinearity between age and distance variables. They are not strongly correlated, which means we can include both into a regression model without any concerns.
# Regression of price on age and distance.
fit2 <- lm(
  formula = Price ~ Age + Distance,
  data = Apartments
)
summary(object = fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
# Variance inflation factors for the explanatory variables of fit2.
vif(mod = fit2)
## Age Distance
## 1.001845 1.001845
VIF statistics confirm our assumption that there is no multicollinearity between age and distance. Because both values are almost 1, the two variables are almost completely uncorrelated with each other. Both can be included in our regression model.
# Diagnostics of fit2, stored next to the data and rounded to 3 decimals.
Apartments$StdResid <- round(rstandard(fit2), 3) # Standardized residuals
Apartments$CooksD <- round(cooks.distance(fit2), 3) # Cooks distances
hist(
  Apartments$StdResid,
  xlab = "Standardized residuals",
  ylab = "Frequency",
  main = "Histogram of standardized residuals"
)
hist(
  Apartments$CooksD,
  xlab = "Cooks distance",
  ylab = "Frequency",
  main = "Histogram of Cooks distances"
)
# Row identifier used to refer to individual units. seq_len() is safe for any
# row count, whereas 1:nrow() yields c(1, 0) for an empty table.
Apartments$ID <- seq_len(nrow(Apartments))
head(Apartments, 10)
## # A tibble: 10 × 8
## Age Distance Price Parking Balcony StdResid CooksD ID
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <int>
## 1 7 28 1640 No parking Has balcony -0.665 0.007 1
## 2 18 1 2800 Has parking No balcony 1.78 0.03 2
## 3 7 28 1660 No parking No balcony -0.594 0.006 3
## 4 28 29 1850 No parking Has balcony 0.754 0.008 4
## 5 18 18 1640 Has parking Has balcony -1.07 0.005 5
## 6 28 12 1770 No parking Has balcony -0.778 0.005 6
## 7 14 20 1850 No parking Has balcony -0.302 0.001 7
## 8 18 6 1970 Has parking Has balcony -0.787 0.004 8
## 9 22 7 2270 Has parking No balcony 0.455 0.001 9
## 10 25 2 2570 Has parking No balcony 1.24 0.017 10
# Five units with the lowest standardized residuals
Apartments[order(Apartments$StdResid), ] %>%
  head(n = 5)
## # A tibble: 5 × 8
## Age Distance Price Parking Balcony StdResid CooksD ID
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <int>
## 1 7 2 1760 No parking Has balcony -2.15 0.066 53
## 2 12 14 1650 No parking Has balcony -1.50 0.013 13
## 3 12 14 1650 No parking No balcony -1.50 0.013 72
## 4 13 8 1800 No parking No balcony -1.38 0.012 20
## 5 14 16 1660 No parking Has balcony -1.26 0.008 35
# Six units with the highest Cooks distance
Apartments[order(-Apartments$CooksD), ] %>%
  head(n = 6)
## # A tibble: 6 × 8
## Age Distance Price Parking Balcony StdResid CooksD ID
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <int>
## 1 5 45 2180 Has parking Has balcony 2.58 0.32 38
## 2 43 37 1740 No parking No balcony 1.44 0.104 55
## 3 2 11 2790 Has parking No balcony 2.05 0.069 33
## 4 7 2 1760 No parking Has balcony -2.15 0.066 53
## 5 37 3 2540 Has parking Has balcony 1.58 0.061 22
## 6 40 2 2400 No parking Has balcony 1.09 0.038 39
Looking at the standardized residuals, only units 53, 38 and 33 stand out as potential outliers, but their absolute standardized residuals are below 3. Looking at Cook's distances, the rule is that there should not be any gaps in the histogram; that is why we need to remove only unit 38.
library(dplyr)
# Remove the 38th unit and refit the model on the remaining rows. ID is an
# integer column, so it is compared with a number rather than the string "38".
Apartments <- Apartments %>%
  filter(ID != 38)
fit2 <- lm(Price ~ Age + Distance, data = Apartments)
# The stored diagnostics still belonged to the model fitted before the
# removal, so they are recomputed from the refitted model. Otherwise the plot
# below (and any later test on StdResid) would mix old residuals with new
# fitted values. Recorded output that was produced from the old values has to
# be regenerated.
Apartments$StdResid <- round(rstandard(fit2), 3)
Apartments$CooksD <- round(cooks.distance(fit2), 3)
# as.numeric() stores a plain vector instead of the one-column matrix that
# scale() returns.
Apartments$StdFitted <- as.numeric(scale(fitted(fit2)))
library(car)
scatterplot(
  y = Apartments$StdResid, x = Apartments$StdFitted,
  ylab = "Standardized residuals",
  xlab = "Standardized fitted values",
  boxplots = FALSE,
  regLine = FALSE,
  smooth = FALSE
)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# Breusch-Pagan test of constant error variance for fit2.
ols_test_breusch_pagan(model = fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 2.927455
## Prob > Chi2 = 0.08708469
The scatterplot of standardized residuals versus standardized fitted values does not show any clear funnel shape. The Breusch-Pagan test for heteroskedasticity gives p = 0.087 > α = 5%, therefore we cannot reject H0 of constant variance. Our findings support the homoskedasticity assumption.
H0: standardized residuals are distributed normally. H1: standardized residuals are not distributed normally.
# Histogram of the stored standardized residuals.
hist(
  x = Apartments$StdResid,
  main = "Histogram of standardized residuals",
  xlab = "Standardized residuals",
  ylab = "Frequency"
)
# Shapiro-Wilk test of normality on the same values.
shapiro.test(x = Apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: Apartments$StdResid
## W = 0.94879, p-value = 0.002187
We can see from the histogram that there is a small deviation from the normal distribution, which is also confirmed by the Shapiro-Wilk normality test (W = 0.949, p = 0.002). Because p < α, we reject H0 and conclude that the standardized residuals are not normally distributed.
# Coefficients and fit statistics of fit2 after the refit without unit 38.
summary(object = fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
# Extended model: adds the parking and balcony factors to fit2's predictors.
fit3 <- lm(
  formula = Price ~ Age + Distance + Parking + Balcony,
  data = Apartments
)
summary(object = fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingHas parking 167.531 62.864 2.665 0.00933 **
## BalconyHas balcony -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
# F-test comparing the nested models fit2 and fit3.
anova(object = fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the ANOVA result (p = 0.031 < α = 5%), model fit3 fits the data better than model fit2.
# Coefficients and fit statistics of fit3, shown again for interpretation.
summary(object = fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingHas parking 167.531 62.864 2.665 0.00933 **
## BalconyHas balcony -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
# Store the fitted prices and the residuals of fit3 next to the data.
# Residuals comes from residuals(); residual is computed by hand as
# observed price minus fitted price.
fit3_fitted <- fitted(fit3)
Apartments$fitted_price <- fit3_fitted
Apartments$Residuals <- residuals(fit3)
Apartments$residual <- Apartments$Price - Apartments$fitted_price
# Residual of the apartment with ID 2: observed price minus fitted price.
is_id2 <- Apartments$ID == 2
residual_ID2 <- Apartments$Price[is_id2] - fit3_fitted[is_id2]
residual_ID2
## 2
## 427.8029