Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Use the built-in mtcars dataset
data(mtcars)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
Explanation of the variables:
mpg (Miles per Gallon)
cyl (Number of Cylinders)
disp (Displacement)
hp (Horsepower)
drat (Rear Axle Ratio)
wt (Weight)
qsec (Quarter Mile Time)
vs (Engine Shape: 0 = V-shaped, 1 = straight)
am (Transmission: 0 = automatic, 1 = manual)
gear (Number of Gears)
carb (Number of Carburetors)
Data manipulation
Create a new variable: horsepower per weight
mtcars$hp_per_weight <- mtcars$hp / mtcars$wt
Remove rows with missing data (if any)
mtcars_clean <- na.omit(mtcars)
Rename the ‘hp’ column to ‘Horsepower’
mtcars_clean <- mtcars_clean %>%
rename(Horsepower = hp)
head(mtcars_clean)
## mpg cyl disp Horsepower drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## hp_per_weight
## Mazda RX4 41.98473
## Mazda RX4 Wag 38.26087
## Datsun 710 40.08621
## Hornet 4 Drive 34.21462
## Hornet Sportabout 50.87209
## Valiant 30.34682
Create a new dataframe for cars with Horsepower > 150
mtcars_powerful <- mtcars_clean %>%
filter(Horsepower > 150)
head(mtcars_powerful)
## mpg cyl disp Horsepower drat wt qsec vs am gear carb
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
## Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.25 17.98 0 0 3 4
## hp_per_weight
## Hornet Sportabout 50.87209
## Duster 360 68.62745
## Merc 450SE 44.22604
## Merc 450SL 48.25737
## Merc 450SLC 47.61905
## Cadillac Fleetwood 39.04762
Convert the transmission variable am into a labeled factor (A = automatic, M = manual)
mtcars$am <- factor(mtcars$am,
levels = c("0", "1"),
labels = c("A", "M"))
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 M 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 M 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 M 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 A 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 A 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 A 3 1
## hp_per_weight
## Mazda RX4 41.98473
## Mazda RX4 Wag 38.26087
## Datsun 710 40.08621
## Hornet 4 Drive 34.21462
## Hornet Sportabout 50.87209
## Valiant 30.34682
Descriptive statistics
mean_hp <- mean(mtcars_clean$Horsepower)
median_hp <- median(mtcars_clean$Horsepower)
sd_hp <- sd(mtcars_clean$Horsepower)
mean_hp_per_weight <- mean(mtcars_clean$hp_per_weight)
median_hp_per_weight <- median(mtcars_clean$hp_per_weight)
sd_hp_per_weight <- sd(mtcars_clean$hp_per_weight)
mean_hp
## [1] 146.6875
median_hp
## [1] 123
sd_hp
## [1] 68.56287
mean_hp_per_weight
## [1] 45.33466
median_hp_per_weight
## [1] 41.03547
sd_hp_per_weight
## [1] 16.28767
Summary statistics for all variables except vs and am
mtcars_subset <- mtcars[, !(names(mtcars) %in% c("vs", "am"))]
summary(mtcars_subset)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec gear
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :3.000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:3.000
## Median :3.695 Median :3.325 Median :17.71 Median :4.000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :3.688
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:4.000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :5.000
## carb hp_per_weight
## Min. :1.000 Min. :19.44
## 1st Qu.:2.000 1st Qu.:35.67
## Median :2.000 Median :41.04
## Mean :2.812 Mean :45.33
## 3rd Qu.:4.000 3rd Qu.:47.78
## Max. :8.000 Max. :93.84
mean_hp (146.6875): The average horsepower of the cars in the dataset is 146.69.
median_hp (123): The middle value of horsepower (when sorted) is 123, meaning half of the cars have less than 123 horsepower, and half have more.
sd_hp (68.56287): The standard deviation of horsepower is 68.56, which indicates how much the horsepower values vary from the mean.
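As a side note, the same three statistics could be collected in a single dplyr summarise() call (a sketch using the already-loaded dplyr):
mtcars_clean %>%
summarise(mean_hp = mean(Horsepower),
median_hp = median(Horsepower),
sd_hp = sd(Horsepower))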
Graphing the distribution of variables
ggplot(mtcars_clean, aes(x = Horsepower)) +
geom_histogram(binwidth = 10, fill = "blue", color = "black") +
ggtitle("Distribution of Horsepower")
The histogram peaks just above 100 hp, indicating a unimodal, right-skewed distribution.
ggplot(mtcars_clean, aes(x = "", y = Horsepower)) +
geom_boxplot(fill = "orange") +
ggtitle("Boxplot of Horsepower")
Explanation:
Box: The box contains the middle 50% of the data (from the first quartile to the third quartile). The median (middle line in the box) is just below 125, which shows that half of the data has horsepower values below this, and half is above.
Whiskers: The lines (whiskers) extending from the box show the range of the data, excluding outliers. The lower whisker shows the minimum value (just above 50). The upper whisker reaches close to 250, indicating the upper bound of non-outlier data.
Outliers: The black dot above the box is an outlier, which means there is a car with significantly higher horsepower (above 300) compared to the rest of the data.
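The quartiles behind the box can be checked numerically; a quick sketch:
# Quartiles and interquartile range underlying the boxplot
quantile(mtcars_clean$Horsepower)
IQR(mtcars_clean$Horsepower)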
ggplot(mtcars_clean, aes(x = wt, y = Horsepower)) +
geom_point(color = "red") +
ggtitle("Scatterplot of Weight vs Horsepower")
Explanation:
Positive Relationship: There is a general upward trend, indicating that as the weight of the car increases, the horsepower tends to increase as well. Heavier cars tend to have higher horsepower.
Clusters: There appear to be clusters of data points, especially around weights of 3 to 4 (in 1000 lbs), where horsepower varies between 100 and 200. A few points in the higher weight range stand out with much higher horsepower (around 300).
Outliers: There are a few cars that have very high horsepower (above 300) for their weight. These could be high-performance vehicles.
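The upward trend described above could be made explicit by overlaying a least-squares line (a sketch; geom_smooth with method = "lm" adds the fitted line):
ggplot(mtcars_clean, aes(x = wt, y = Horsepower)) +
geom_point(color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") + # add linear trend line
ggtitle("Scatterplot of Weight vs Horsepower with Fitted Line")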
Load necessary libraries
library(dplyr)
library(ggplot2)
library(readxl)
Load the dataset (assuming it’s in Excel format)
mba_data <- read_xlsx("./R Take Home Exam 2024/Task 2/Business School.xlsx")
mba_data <- as.data.frame(mba_data)
head(mba_data)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
Plot the distribution of undergraduate degrees
ggplot(mba_data, aes(x = `Undergrad Degree`)) +
geom_bar(fill = "skyblue", color = "black") +
ggtitle("Distribution of Undergraduate Degrees")
As shown above, the Business undergraduate degree is most common.
Descriptive statistics for Annual Salary
summary(mba_data$`Annual Salary`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
mean_salary <- mean(mba_data$`Annual Salary`, na.rm = TRUE)
median_salary <- median(mba_data$`Annual Salary`, na.rm = TRUE)
sd_salary <- sd(mba_data$`Annual Salary`, na.rm = TRUE)
Display statistics
mean_salary
## [1] 109058
median_salary
## [1] 103500
sd_salary
## [1] 41501.49
ggplot(mba_data, aes(x = `Annual Salary`)) +
geom_histogram(binwidth = 50000, fill = "lightgreen", color = "black") +
scale_x_continuous(breaks = seq(0, max(mba_data$`Annual Salary`), by = 50000)) +
ggtitle("Histogram of Annual Salary") +
xlab("Annual Salary") +
ylab("Count")
Explanation:
The distribution is unimodal and right-skewed. The mean (about 109,000 EUR) lies above the median (103,500 EUR), as expected for right-skewed data, and there are outliers above 300,000 EUR.
Hypothesis test: H0: mean MBA Grade = 74
t_test_result <- t.test(mba_data$`MBA Grade`, mu = 74)
Display the result of the t-test
t_test_result
##
## One Sample t-test
##
## data: mba_data$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
library(effectsize)
effectsize::cohens_d(mba_data$`MBA Grade`, mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
Interpretation:
The t-test results indicate that there is statistically significant evidence (p-value = 0.00915) to suggest that the true mean MBA grade for the current generation of students is different from 74. The confidence interval suggests the mean is likely between 74.52 and 77.56, with the sample mean being 76.04. Therefore, the average MBA grade for this year’s students is likely higher than 74.
Cohen’s d of 0.27 indicates a small effect size, meaning the difference between the mean MBA grade and 74 is minor. The 95% CI [0.07, 0.46] suggests the true effect size is small to moderate. While statistically significant, the practical difference from 74 is minimal.
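For a one-sample t-test, Cohen's d equals t divided by the square root of n; a quick consistency check against the output above (df = 99 implies n = 100):
# Cohen's d for a one-sample t-test: d = t / sqrt(n)
2.6587 / sqrt(100) # ~0.27, matching the effectsize output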
Import the dataset Apartments.xlsx
library(readxl)
Apartments <- read_excel("R Take Home Exam 2024/Task 3/Apartments.xlsx")
head(Apartments)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Change categorical variables into factors.
Apartments$Parking <- factor(Apartments$Parking, levels = c(0, 1), labels = c("No", "Yes"))
Apartments$Balcony <- factor(Apartments$Balcony, levels = c(0, 1), labels = c("No", "Yes"))
str(Apartments)
## tibble [85 × 5] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...
Test the hypothesis H0: Mu_Price = 1900 eur. What can you conclude?
t_test <- t.test(Apartments$Price, mu = 1900)
t_test
##
## One Sample t-test
##
## data: Apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Conclusion:
Since the p-value (0.004731) is below the usual 0.05 significance level, we reject the null hypothesis: there is strong evidence that the true mean price per square meter differs from 1900 EUR. Moreover, the 95% confidence interval (1937.44 to 2100.44) lies entirely above 1900, so the mean price is most likely higher.
Estimate the simple regression function: Price = f(Age). Save results in object fit1 and explain the estimate of regression coefficient, coefficient of correlation and coefficient of determination.
fit1 <- lm(Price ~ Age, data = Apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(Apartments$Price, Apartments$Age)
## [1] -0.230255
Estimate of regression coefficient:
Intercept (2185.455): This is the predicted price per square meter when the apartment’s age is 0. It represents the estimated average price for a new apartment. According to the model, a newly built apartment would have an average price of 2185.455 EUR per square meter.
Slope (Age = -8.975): This is the estimated regression coefficient for Age. It means that for every additional year in the age of the apartment, the price per square meter decreases by approximately 8.98 EUR. This negative value indicates an inverse relationship between the age of the apartment and its price: as the apartment gets older, its price tends to decrease.
Coefficient of Correlation: The coefficient of correlation is a measure of the strength and direction of the linear relationship between Price and Age. In this case, the value of the correlation coefficient is given as -0.230255.
Interpretation: This value indicates a weak negative correlation between apartment price and age: older apartments tend to have slightly lower prices, but the relationship is not very strong.
Coefficient of Determination (R-squared and Adjusted R-squared): R-squared (0.05302): This tells us how much of the variability in the dependent variable (Price) is explained by the independent variable (Age). In this case, the R-squared is 0.05302, or 5.3%. This means that only 5.3% of the variation in apartment prices can be explained by the age of the apartment.
Interpretation: The low R-squared value suggests that Age is not a strong predictor of Price, and most of the variability in apartment prices is due to other factors not captured by this model.
Adjusted R-squared (0.04161): This is a modified version of R-squared that adjusts for the number of predictors in the model. It is slightly lower than the regular R-squared because the model only has one predictor (Age). In this case, the adjusted R-squared is 4.2%, reinforcing that the model doesn’t explain much of the variance in the apartment prices.
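In simple linear regression, R-squared is the square of the correlation coefficient; a quick check:
# R-squared equals the squared Price-Age correlation
cor(Apartments$Price, Apartments$Age)^2 # ~0.053, matching the summary output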
Show the scatterplot matrix between Price, Age and Distance. Based on the matrix, determine whether there is a potential problem with multicollinearity.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
panel.scatter <- function(x, y) {
points(x, y, pch = 21, bg = "blue", col = "black") # Customize points
abline(lm(y ~ x), col = "blue") # Add linear regression line
}
panel.density <- function(x) {
usr <- par("usr")
on.exit(par(usr = usr)) # restore the coordinate system when the panel is done
par(usr = c(usr[1:2], 0, 1.5))
d <- density(x, na.rm = TRUE)
lines(d$x, d$y, col = "blue")
}
pairs(Apartments[, c("Price", "Age", "Distance")],
lower.panel = panel.scatter, # Scatterplots with regression lines
upper.panel = panel.scatter, # Scatterplots with regression lines
diag.panel = panel.density) # Density plots on the diagonal
Interpretation of the scatterplot matrix:
Price vs Age: There is no strong linear relationship, only a slight downward trend suggesting that price decreases somewhat as age increases.
Price vs Distance: A stronger negative relationship is visible; as distance from the city increases, prices tend to decrease.
Age vs Distance: There appears to be no clear trend between age and distance, so the two predictors are essentially uncorrelated and the matrix shows no sign of a multicollinearity problem.
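The visual impressions can be quantified with the correlation matrix of the same three variables; a quick check:
# Pairwise correlations behind the scatterplot matrix
cor(Apartments[, c("Price", "Age", "Distance")])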
Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.
fit2 <- lm(Price ~ Age + Distance, data = Apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
Check the multicollinearity with VIF statistics. Explain the findings.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(fit2)
## Age Distance
## 1.001845 1.001845
Interpretation of VIF:
If VIF > 10, multicollinearity is a concern.
Both VIF values for Age and Distance are approximately 1.001845. Since both VIF values are very close to 1, this indicates that there is no significant multicollinearity between the variables Age and Distance.
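With only two predictors, the VIF reduces to 1 / (1 - r^2), where r is the correlation between Age and Distance; a quick consistency check:
# For a two-predictor model, VIF = 1 / (1 - r^2)
r <- cor(Apartments$Age, Apartments$Distance)
1 / (1 - r^2) # ~1.0018, matching vif(fit2)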
Calculate standardized residuals and Cook's distances for model fit2. Remove any potentially problematic units (outliers or units with high influence).
fit2 <- lm(Price ~ Age + Distance, data = Apartments)
cooks_dist <- cooks.distance(fit2)
hist(cooks_dist, breaks = 5, main = "Histogram of Cook's Distances",
xlab = "Cook's Distance", ylab = "Frequency", col = "lightblue", border = "black")
abline(v = 13/nrow(Apartments), col = "red", lwd = 2, lty = 2)
The red dashed line represents the threshold (around 0.15), above which
observations are considered potential outliers or highly
influential.
A few observations exceed this threshold, suggesting they may have a significant influence on the regression model and might warrant further investigation.
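The task also calls for standardized residuals. A minimal sketch for flagging units, assuming the common |r| > 3 cutoff for standardized residuals and the 4/n rule of thumb for Cook's distance (both thresholds are conventions, not fixed rules):
# Flag units that are outliers or highly influential
std_resid_fit2 <- rstandard(fit2)
flagged <- which(abs(std_resid_fit2) > 3 | cooks_dist > 4 / nrow(Apartments))
flagged
# Apartments_clean <- Apartments[-flagged, ] # drop flagged rows, but only if flagged is non-empty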
Check for potential heteroskedasticity with a scatterplot between standardized residuals and standardized fitted values. Explain the findings.
fit2 <- lm(Price ~ Age + Distance, data = Apartments)
std_resid <- rstandard(fit2)
fitted_values <- fit2$fitted.values
plot(fitted_values, std_resid,
main = "Standardized Residuals vs Fitted Values (Task 2)",
xlab = "Fitted Values",
ylab = "Standardized Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lwd = 2)
No clear pattern: The residuals are spread around zero without a clear
pattern, which suggests that the model’s assumption of constant variance
(homoscedasticity) holds fairly well.
Mild heteroscedasticity: There is a slight increase in the spread of residuals as fitted values increase, indicating mild heteroscedasticity, which may need attention if it becomes more pronounced.
Random distribution: The residuals seem randomly distributed, suggesting no major issues with the model’s fit.
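The visual check can be complemented by a formal one; for instance, ncvTest() from the already-loaded car package performs a score test for non-constant error variance (a sketch; output not shown):
# Formal test of the constant-variance assumption for fit2
ncvTest(fit2)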
Are standardized residuals distributed normally? Show the graph and formally test it. Explain the findings.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = Apartments)
std_resid <- rstandard(fit3)
hist(std_resid, breaks = 20, main = "Histogram of Standardized Residuals",
xlab = "Standardized Residuals", col = "lightblue", border = "black", probability = TRUE)
curve(dnorm(x, mean = mean(std_resid), sd = sd(std_resid)),
col = "red", lwd = 2, add = TRUE)
shapiro.test(std_resid)
##
## Shapiro-Wilk normality test
##
## data: std_resid
## W = 0.95749, p-value = 0.006874
All values fall within the range of -3 to 3, a commonly used threshold for identifying potential outliers.
The Shapiro-Wilk test shows a W = 0.95749 and a p-value = 0.006874, indicating that the standardized residuals significantly deviate from normality (since the p-value is less than 0.05).
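A normal Q-Q plot is another standard way to inspect the same assumption; a minimal sketch reusing the std_resid computed above:
# Q-Q plot of standardized residuals against the normal distribution
qqnorm(std_resid, main = "Q-Q Plot of Standardized Residuals")
qqline(std_resid, col = "red", lwd = 2)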
Estimate fit2 again without the potentially excluded units and show the summary of the model. Explain all coefficients.
Apartments_clean <- Apartments # note: no units are actually removed here, so fit2_clean reproduces fit2
fit2_clean <- lm(Price ~ Age + Distance, data = Apartments_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
Intercept (2460.101): When both Age and Distance are zero, the expected price is 2460.10 EUR per square meter (a new apartment located at the city center).
Age (-7.934): For each additional year of the apartment's age, the price decreases by approximately 7.93 EUR per square meter, indicating a negative relationship between age and price.
Distance (-20.667): For every additional kilometer away from the city center, the price decreases by about 20.67 EUR per square meter, a stronger negative impact on price than that of age.
R-squared (0.4396): The model explains about 44% of the variance in apartment prices.
p-values: Both Age and Distance are statistically significant predictors (p-values < 0.05), with Distance having the much stronger effect. Since no units were actually removed, these estimates are identical to those of fit2 above.
Estimate the linear regression function Price = f(Age, Distance, Parking and Balcony). Be careful to correctly include categorical variables. Save the object named fit3.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = Apartments)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## ParkingYes 196.168 62.868 3.120 0.00251 **
## BalconyYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
With the anova() function, check whether model fit3 fits the data better than model fit2.
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the partial F-test yields p = 0.01007 < 0.05, adding Parking and Balcony significantly improves the fit: fit3 fits the data better than fit2.
Show the results of fit3 and explain the regression coefficients for both categorical variables. Can you write down the hypothesis that is being tested with the F-statistic shown at the bottom of the output?
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## ParkingYes 196.168 62.868 3.120 0.00251 **
## BalconyYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
ParkingYes (Coefficient: 196.168): If an apartment has parking (Parking = Yes), the price increases by 196.168 EUR per square meter, compared to an apartment without parking (Parking = No). This positive coefficient suggests that parking adds value to the apartment.
BalconyYes (Coefficient: 1.935): If an apartment has a balcony (Balcony = Yes), the price increases by 1.935 EUR per square meter compared to an apartment without a balcony (Balcony = No). The coefficient is very small and statistically insignificant (p = 0.974), meaning the balcony does not have a meaningful effect on price in this model.
Hypothesis Tested by the F-Statistic: The F-statistic at the bottom of the regression output tests the null hypothesis that all regression coefficients except the intercept are equal to zero, i.e. H0: beta_Age = beta_Distance = beta_ParkingYes = beta_BalconyYes = 0.
Null Hypothesis (H0): All predictors, including both continuous (Age, Distance) and categorical variables (Parking, Balcony), have no effect on the dependent variable (Price).
Alternative Hypothesis (H1): At least one of the predictors has a significant effect on the dependent variable (Price).
Since the p-value is 1.849e-11, we reject H0 in favour of H1: at least one of the variables (Age, Distance, Parking, or Balcony) has a significant effect on the apartment price.
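To illustrate the ParkingYes coefficient, one can compare predicted prices for two hypothetical, otherwise identical apartments (the values Age = 10 and Distance = 10 are made up for illustration):
# Hypothetical apartments differing only in Parking
newdata <- data.frame(Age = 10, Distance = 10,
Parking = factor(c("No", "Yes"), levels = c("No", "Yes")),
Balcony = factor("No", levels = c("No", "Yes")))
predict(fit3, newdata) # the two predictions differ by exactly the ParkingYes coefficient, 196.168 EUR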
Save the fitted values and calculate the residual for the apartment with ID 2.
fitted_values <- fitted(fit3)
residual_id2 <- Apartments$Price[2] - fitted_values[2]
residual_id2
## 2
## 442.5889
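Equivalently, the built-in extractor returns the same value; the positive residual means the actual price of apartment 2 (2800 EUR) lies about 443 EUR above the model's prediction:
# Same residual via the residuals() extractor
residuals(fit3)[2]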