library(carData) #Load the required library
Data import
data("Davis") #Load the Davis dataset from the carData package
Rename variables
colnames(Davis) <- c("Gender", "Weight", "Height", "ReportedWeight", "ReportedHeight")
Show the first few rows of the “Davis” Dataset
head(Davis) #Display the first few rows of the dataset
## Gender Weight Height ReportedWeight ReportedHeight
## 1 M 77 182 77 180
## 2 F 58 161 51 159
## 3 F 53 161 54 158
## 4 M 68 177 70 175
## 5 F 59 157 59 155
## 6 M 76 170 76 165
Structure of the dataset
str(Davis) #Display the structure of the dataset
## 'data.frame': 200 obs. of 5 variables:
## $ Gender : Factor w/ 2 levels "F","M": 2 1 1 2 1 2 2 2 2 2 ...
## $ Weight : int 77 58 53 68 59 76 76 69 71 65 ...
## $ Height : int 182 161 161 177 157 170 167 186 178 171 ...
## $ ReportedWeight: int 77 51 54 70 59 76 77 73 71 64 ...
## $ ReportedHeight: int 180 159 158 175 155 165 165 180 175 170 ...
Create Davis2 dataset by excluding rows with missing ReportedWeight and ReportedHeight values
Davis2 <- Davis[complete.cases(Davis$ReportedWeight, Davis$ReportedHeight),]
Insert a new variable called BMI (Body Mass Index) into the Davis2 dataset
Davis2$BMI <- Davis2$Weight / ((Davis2$Height / 100)^2) #BMI = weight (kg) / height (m)^2; Height is in cm, so divide by 100
Create a categorical BMI variable for the Davis2 dataset
Davis2$BMI_category <- cut(Davis2$BMI,
                           breaks = c(0, 18.5, 24.9, 29.9, Inf), #Define the BMI category boundaries
                           labels = c("Underweight", "Normal Weight", "Overweight", "Obese"))
Davis2$BMI <- round(Davis2$BMI, 1) #Round BMI to one decimal place
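As a quick sanity check of the formula, the first row printed below has Weight 77 kg and Height 182 cm, so its BMI should come out near 23.2:
77 / (1.82^2) #Weight in kg divided by height in metres squared; about 23.25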
View the first few rows of the updated Davis2 dataset
head(Davis2)
## Gender Weight Height ReportedWeight ReportedHeight BMI
## 1 M 77 182 77 180 23.2
## 2 F 58 161 51 159 22.4
## 3 F 53 161 54 158 20.4
## 4 M 68 177 70 175 21.7
## 5 F 59 157 59 155 23.9
## 6 M 76 170 76 165 26.3
## BMI_category
## 1 Underweight
## 2 Underweight
## 3 Underweight
## 4 Underweight
## 5 Underweight
## 6 Underweight
Sort Davis2 by BMI_category and Gender
Davis2 <- Davis2[order(Davis2$BMI_category, Davis2$Gender), ]
Display the sorted dataset
head(Davis2)
## Gender Weight Height ReportedWeight ReportedHeight BMI
## 2 F 58 161 51 159 22.4
## 3 F 53 161 54 158 20.4
## 5 F 59 157 59 155 23.9
## 12 F 166 57 56 163 510.9
## 13 F 51 161 52 158 19.7
## 14 F 64 168 64 165 22.7
## BMI_category
## 2 Underweight
## 3 Underweight
## 5 Underweight
## 12 Underweight
## 13 Underweight
## 14 Underweight
Identify the row(s) with erroneous data
erroneous_row <- Davis2$Height < 100 | Davis2$Weight > 150
Remove the row(s) with erroneous data
Davis2 <- Davis2[!erroneous_row, ]
Create a new data frame for individuals categorized as “Overweight”
Overweight_df <- subset(Davis2, BMI_category == "Overweight")
View the first few rows of the new data frame
head(Overweight_df)
## [1] Gender Weight Height ReportedWeight
## [5] ReportedHeight BMI BMI_category
## <0 rows> (or 0-length row.names)
Make another data frame that includes only males with a height between 170 and 195 cm and a weight above 80 kg
Males_filtered_df <- Davis2[Davis2$Gender == "M" & Davis2$Height >= 170 & Davis2$Height <= 195 & Davis2$Weight > 80, ]
head(Males_filtered_df) #View the first few rows of the filtered data frame
## Gender Weight Height ReportedWeight ReportedHeight BMI
## 17 M 92 187 101 185 26.3
## 21 M 119 180 124 178 36.7
## 30 M 101 183 100 180 30.2
## 39 M 88 178 86 175 27.8
## 45 M 85 179 82 175 26.5
## 51 M 82 182 85 183 24.8
## BMI_category
## 17 Underweight
## 21 Underweight
## 30 Underweight
## 39 Underweight
## 45 Underweight
## 51 Underweight
Calculate descriptive statistics for variables in the Davis2 dataset
summary(Davis2)
## Gender Weight Height ReportedWeight
## F:98 Min. : 39.00 Min. :148.0 Min. : 41.00
## M:82 1st Qu.: 55.75 1st Qu.:164.0 1st Qu.: 55.00
## Median : 63.00 Median :169.5 Median : 63.00
## Mean : 65.75 Mean :170.8 Mean : 65.73
## 3rd Qu.: 74.25 3rd Qu.:178.0 3rd Qu.: 74.25
## Max. :119.00 Max. :197.0 Max. :124.00
## ReportedHeight BMI BMI_category
## Min. :148.0 Min. :15.80 Underweight :180
## 1st Qu.:161.0 1st Qu.:20.20 Normal Weight: 0
## Median :168.0 Median :21.90 Overweight : 0
## Mean :168.7 Mean :22.36 Obese : 0
## 3rd Qu.:175.0 3rd Qu.:24.05
## Max. :200.0 Max. :36.70
Calculate descriptive statistics for all numeric variables in Davis2
library(pastecs)
statistics <- stat.desc(Davis2[, sapply(Davis2, is.numeric)])
print(statistics) #View the calculated statistics
## Weight Height ReportedWeight ReportedHeight
## nbr.val 1.800000e+02 1.800000e+02 180.000000 1.800000e+02
## nbr.null 0.000000e+00 0.000000e+00 0.000000 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000 0.000000e+00
## min 3.900000e+01 1.480000e+02 41.000000 1.480000e+02
## max 1.190000e+02 1.970000e+02 124.000000 2.000000e+02
## range 8.000000e+01 4.900000e+01 83.000000 5.200000e+01
## sum 1.183500e+04 3.074100e+04 11832.000000 3.036400e+04
## median 6.300000e+01 1.695000e+02 63.000000 1.680000e+02
## mean 6.575000e+01 1.707833e+02 65.733333 1.686889e+02
## SE.mean 1.002290e+00 6.687711e-01 1.032603 7.014788e-01
## CI.mean.0.95 1.977825e+00 1.319690e+00 2.037642 1.384232e+00
## var 1.808254e+02 8.050587e+01 191.928492 8.857306e+01
## std.dev 1.344713e+01 8.972506e+00 13.853826 9.411326e+00
## coef.var 2.045192e-01 5.253736e-02 0.210758 5.579103e-02
## BMI
## nbr.val 180.0000000
## nbr.null 0.0000000
## nbr.na 0.0000000
## min 15.8000000
## max 36.7000000
## range 20.9000000
## sum 4025.6000000
## median 21.9000000
## mean 22.3644444
## SE.mean 0.2263351
## CI.mean.0.95 0.4466283
## var 9.2209634
## std.dev 3.0366039
## coef.var 0.1357782
rounded_statistics <- round(statistics, digits = 2)
print(rounded_statistics)
## Weight Height ReportedWeight ReportedHeight BMI
## nbr.val 180.00 180.00 180.00 180.00 180.00
## nbr.null 0.00 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00
## min 39.00 148.00 41.00 148.00 15.80
## max 119.00 197.00 124.00 200.00 36.70
## range 80.00 49.00 83.00 52.00 20.90
## sum 11835.00 30741.00 11832.00 30364.00 4025.60
## median 63.00 169.50 63.00 168.00 21.90
## mean 65.75 170.78 65.73 168.69 22.36
## SE.mean 1.00 0.67 1.03 0.70 0.23
## CI.mean.0.95 1.98 1.32 2.04 1.38 0.45
## var 180.83 80.51 191.93 88.57 9.22
## std.dev 13.45 8.97 13.85 9.41 3.04
## coef.var 0.20 0.05 0.21 0.06 0.14
Compute descriptive statistics by gender
library(psych)
describeBy(Davis2, Davis2$Gender)
##
## Descriptive statistics by group
## group: F
## vars n mean sd median trimmed mad min max
## Gender* 1 98 1.00 0.00 1.00 1.00 0.00 1.0 1.0
## Weight 2 98 57.19 6.92 56.00 56.91 5.93 39.0 78.0
## Height 3 98 164.73 5.52 165.00 164.75 4.45 148.0 178.0
## ReportedWeight 4 98 56.67 6.79 56.00 56.45 5.93 41.0 77.0
## ReportedHeight 5 98 162.36 5.82 162.00 162.32 4.45 148.0 176.0
## BMI 6 98 21.06 2.21 20.85 20.99 1.85 15.8 28.6
## BMI_category* 7 98 1.00 0.00 1.00 1.00 0.00 1.0 1.0
## range skew kurtosis se
## Gender* 0.0 NaN NaN 0.00
## Weight 39.0 0.44 0.56 0.70
## Height 30.0 -0.15 0.40 0.56
## ReportedWeight 36.0 0.47 0.56 0.69
## ReportedHeight 28.0 0.08 -0.23 0.59
## BMI 12.8 0.52 0.84 0.22
## BMI_category* 0.0 NaN NaN 0.00
## ----------------------------------------------------
## group: M
## vars n mean sd median trimmed mad min max
## Gender* 1 82 2.00 0.00 2.00 2.00 0.00 2.0 2.0
## Weight 2 82 75.98 12.16 75.00 75.12 11.86 54.0 119.0
## Height 3 82 178.01 6.65 178.00 177.91 7.41 163.0 197.0
## ReportedWeight 4 82 76.56 12.29 75.00 75.44 10.38 56.0 124.0
## ReportedHeight 5 82 176.26 6.98 175.00 176.21 7.41 161.0 200.0
## BMI 6 82 23.92 3.16 23.55 23.77 3.11 17.8 36.7
## BMI_category* 7 82 1.00 0.00 1.00 1.00 0.00 1.0 1.0
## range skew kurtosis se
## Gender* 0.0 NaN NaN 0.00
## Weight 65.0 0.77 0.77 1.34
## Height 34.0 0.14 -0.25 0.73
## ReportedWeight 68.0 1.06 1.57 1.36
## ReportedHeight 39.0 0.30 0.35 0.77
## BMI 18.9 0.79 1.81 0.35
## BMI_category* 0.0 NaN NaN 0.00
Create histograms for Weight and Height using the hist() function
hist(Davis2$Weight,
main = "Distribution of Weight",
xlab = "Weight(kg)", #Name the x-axis
ylab = "Frequency", #Name the y-axis
col = "lightblue",
border = "black")
The histogram for “Weight” in the Davis2 dataset reveals the
distribution of individuals’ actual weights.
The data is approximately normally distributed, forming a bell-shaped curve.
The majority of individuals have weights that are concentrated around the central value, which is approximately in the range of 60 kg to 70 kg.
There are fewer individuals at the extremes, both on the lower end (below 60 kg) and the higher end (above 70 kg).
This distribution suggests that most individuals in the dataset have weights that fall within the average weight range, with a peak around 65 kg. It’s a typical distribution for weight data.
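One way to probe the bell-shape claim (a small sketch, not part of the original output) is to redraw the histogram on the density scale and overlay a normal curve using the sample mean and standard deviation:
hist(Davis2$Weight,
     freq = FALSE, #Plot densities instead of counts so the curve is comparable
     main = "Distribution of Weight with normal curve",
     xlab = "Weight(kg)",
     col = "lightblue",
     border = "black")
curve(dnorm(x, mean = mean(Davis2$Weight), sd = sd(Davis2$Weight)),
      add = TRUE, lwd = 2) #Overlay the fitted normal density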
hist(Davis2$Height,
main = "Distribution of Height",
xlab = "Height(cm)", #Name the x-axis
ylab = "Frequency", #Name the y-axis
col = "lightpink",
border = "grey")
The histogram for “Height” in the Davis2 dataset illustrates the
distribution of individuals’ actual heights.
It also resembles a normal distribution, with a peak in the range of approximately 165 cm to 170 cm.
The data shows that the majority of individuals have heights clustered around the central value.
There is a slight left skew, meaning the lower tail is somewhat longer: a few relatively short individuals pull the mean slightly below the median.
Similar to weight, there are fewer individuals at the extremes (very short or very tall).
This distribution suggests that most individuals have heights within a typical height range, with a peak around 168 cm.
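The skew claims can also be checked numerically; a minimal sketch using the skew() function from the psych package (loaded above), where a negative value would support a left skew:
library(psych)
skew(Davis2$Height) #Negative value indicates a left (negative) skew
skew(Davis2$Weight) #For comparison with the Weight distribution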
Create histograms for Reported Weight, Reported Height and BMI using the ggplot2 package
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(data = Davis2, aes(x = ReportedWeight)) +
  geom_histogram(binwidth = 5,
                 fill = "lightgreen",
                 color = "black") +
  labs(title = "Reported Weight Distribution",
       x = "Reported Weight (kg)",
       y = "Frequency")
The histogram for “Reported Weight” shows that the data is somewhat
normally distributed but has a slight right skew.
Most individuals reported weights that are clustered around the central value.
There are fewer individuals who reported very low or very high weights.
The peak of the distribution is around the 55-65 kg range.
This distribution indicates that many individuals reported weights close to their actual weights, but there are variations.
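The claim that reported weights are close to the measured weights can be examined directly; a minimal sketch (not part of the original analysis) summarising the reporting error and plotting the two variables against each other:
reporting_error <- Davis2$ReportedWeight - Davis2$Weight #Positive values mean over-reporting
summary(reporting_error) #A centre near zero supports the claim
plot(Davis2$Weight, Davis2$ReportedWeight,
     xlab = "Measured weight (kg)",
     ylab = "Reported weight (kg)")
abline(0, 1) #45-degree line: points on it report their weight exactly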
ggplot(data = Davis2, aes(x = ReportedHeight)) +
geom_histogram(binwidth = 5,
fill = "lightyellow",
color = "black") +
labs(title = "Reported Height Distribution",
x = "Reported Height (cm)",
y = "Frequency")
The histogram for “Reported Height” reveals a distribution with a peak around the 160-170 cm range.
The data are slightly left-skewed, with a somewhat longer tail towards lower reported heights.
Similar to reported weight, there are fewer individuals who reported heights significantly above or below the central value.
The distribution suggests that reported heights tend to be concentrated around a specific range.
ggplot(data = Davis2, aes(x = BMI)) +
geom_histogram(binwidth = 1,
fill = "purple",
color = "black") +
labs(title = "BMI Distribution",
x = "BMI",
y = "Frequency")
The histogram for “BMI” shows a distribution that is approximately
bell-shaped but slightly right-skewed.
Most individuals have BMIs clustered around the central value.
There is a tail on the right side of the distribution, indicating some individuals with higher BMIs.
The peak of the distribution is around the 20-25 BMI range, which is typically considered the normal weight range.
This distribution suggests that while many individuals have BMIs within the normal range, there are also individuals with higher BMIs.
Visually analyse the relationship between Height and Weight
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(Davis2$Height ~ Davis2$Weight,
smooth = FALSE,
boxplots = FALSE,
ylab = "Height in cm",
xlab = "Weight in kg")
There is a positive, approximately linear relationship between the two variables.
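To put a number on this, the Pearson correlation coefficient can be computed directly (a one-line sketch; given the scatterplot it should be positive and fairly strong):
cor(Davis2$Weight, Davis2$Height) #Pearson correlation between weight and height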
fit <- lm(Height ~ Weight,
data = Davis2)
summary(fit)
##
## Call:
## lm(formula = Height ~ Weight, data = Davis2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0736 -3.8115 -0.2366 3.3762 20.9636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.08691 2.14903 63.79 <2e-16 ***
## Weight 0.51249 0.03203 16.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.762 on 178 degrees of freedom
## Multiple R-squared: 0.5899, Adjusted R-squared: 0.5876
## F-statistic: 256.1 on 1 and 178 DF, p-value: < 2.2e-16
Height = 137.09 + 0.51 × Weight. If Weight increases by 1 kg, then Height increases by about 0.5 cm on average.
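As a worked illustration (70 kg is a hypothetical input, not a particular observation), the fitted model predicts a height of roughly 137.09 + 0.51 × 70 ≈ 173 cm:
predict(fit, newdata = data.frame(Weight = 70)) #Expected height for a 70 kg person, about 173 cm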
Load the dataset
setwd("/Users/markodjordjevic/Downloads")
data <- read.csv("Body mass.csv",
header = TRUE,
sep = ";",
dec = ",")
head(data)
## BodyMass.X
## 1 62.1
## 2 64.5
## 3 56.5
## 4 53.4
## 5 61.3
## 6 62.2
Description:
- BodyMass: the weight of 50 randomly selected ninth graders, measured in kg
- X: the grade attended by the student
Histogram for BodyMass
hist(data$BodyMass.X,
main = "Distribution of Body Mass",
xlab = "Body Mass",
ylab = "Frequency",
col = "blue",
border = "black")
mean(data$BodyMass)
## [1] 62.876
sd(data$BodyMass)
## [1] 6.011403
H0: Mu = 59.5, H1: Mu ≠ 59.5 (two-sided)
One-sided alternatives would be H1: Mu < 59.5 ("less") or H1: Mu > 59.5 ("greater").
t.test(data$BodyMass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: data$BodyMass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
Estimate of the arithmetic mean: Y_hat = 62.88
Comparing the p-value to alpha (0.05): the p-value of 0.000234 is even smaller than 0.001.
Since p < 0.001, we reject H0.
The average weight of the students in the school year 2021/22 is different from that in the school year 2018/19; the weight has increased.
95% confidence interval for the mean: 61.17 < Mu < 64.58
The 95% confidence interval (61.17, 64.58) means that we are 95% confident that the true population mean (Mu) falls within this interval. This interval does not include the null hypothesis value of 59.5.
Since the null hypothesis value (H0: Mu = 59.5) lies outside this confidence interval, the data do not support the null hypothesis. In other words, there is evidence that the true population mean is not equal to 59.5, so we can also reject H0 at the 5% level.
library(psych)
describe(data$BodyMass)
## vars n mean sd median trimmed mad min max range skew
## X1 1 50 62.88 6.01 62.8 62.56 3.34 49.7 83.2 33.5 0.85
## kurtosis se
## X1 2.11 0.85
library(pastecs)
round(stat.desc(data$BodyMass), 2)
## nbr.val nbr.null nbr.na min max
## 50.00 0.00 0.00 49.70 83.20
## range sum median mean SE.mean
## 33.50 3143.80 62.80 62.88 0.85
## CI.mean.0.95 var std.dev coef.var
## 1.71 36.14 6.01 0.10
Lower boundary: 62.88 - 1.71 = 61.17; upper boundary: 62.88 + 1.71 = 64.59
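The same interval can be reproduced by hand from the sample mean, the standard error and the t quantile with n - 1 = 49 degrees of freedom; a short sketch:
m <- mean(data$BodyMass) #62.876
se <- sd(data$BodyMass) / sqrt(50) #Standard error of the mean, about 0.85
m + c(-1, 1) * qt(0.975, df = 49) * se #Roughly (61.17, 64.58), matching the t-test above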
Calculating the effect size
#install.packages("effectsize")
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(data$BodyMass, mu = 59.5)
## Cohen's d | 95% CI
## ------------------------
## 0.56 | [0.26, 0.86]
##
## - Deviation from a difference of 59.5.
interpret_cohens_d(0.56, rules = "sawilowsky2009")
## [1] "medium"
## (Rules: sawilowsky2009)
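For a one-sample comparison, Cohen's d is simply the difference between the sample mean and the hypothesised value divided by the sample standard deviation, so it can be checked by hand:
(mean(data$BodyMass) - 59.5) / sd(data$BodyMass) #(62.876 - 59.5) / 6.01, about 0.56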
#install.packages("openxlsx")
library(openxlsx)
apartments_data <- read.xlsx("Apartments (1).xlsx")
apartments_data <- as.data.frame(apartments_data)
apartments_data$ParkingFactor <- factor(apartments_data$Parking,
levels = c(1, 0),
labels = c("Yes", "No"))
apartments_data$BalconyFactor <- factor(apartments_data$Balcony,
levels = c(1, 0),
labels = c("Yes", "No"))
head(apartments_data, 5)
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor
## 1 7 28 1640 0 1 No Yes
## 2 18 1 2800 1 0 Yes No
## 3 7 28 1660 0 0 No No
## 4 28 29 1850 0 1 No Yes
## 5 18 18 1640 1 1 Yes Yes
mean(apartments_data$Price)
## [1] 2018.941
sd(apartments_data$Price)
## [1] 377.8417
H0: Mu = 1900, H1: Mu ≠ 1900 (two-sided)
One-sided alternatives would be H1: Mu < 1900 ("less") or H1: Mu > 1900 ("greater").
t.test(apartments_data$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: apartments_data$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Estimate of the arithmetic mean: Y_hat = 2018.94
Comparing the p-value to alpha (0.05): the p-value of 0.004731 is less than 0.05.
Since the p-value is less than alpha, we have enough evidence to reject H0.
95% confidence interval for the mean: 1937.44 < Mu < 2100.44
The 95% confidence interval (1937, 2100) means that we are 95% confident that the true population mean (Mu) falls within this interval. This interval does not include the null hypothesis value of 1900.
Since the null hypothesis value (H0: Mu = 1900) lies outside this confidence interval, the data do not support the null hypothesis. In other words, there is evidence that the true population mean is not equal to 1900, so we can also reject H0 at the 5% level.
library(psych)
describe(apartments_data$Price)
## vars n mean sd median trimmed mad min max range skew
## X1 1 85 2018.94 377.84 1950 1990.29 429.95 1400 2820 1420 0.54
## kurtosis se
## X1 -0.69 40.98
library(pastecs)
round(stat.desc(apartments_data$Price), 2)
## median mean SE.mean CI.mean.0.95 var
## 1950 2019 41 81 142764
## std.dev coef.var
## 378 0
Lower boundary: 2019 - 81 = 1938; upper boundary: 2019 + 81 = 2100
Calculating the effect size
library(effectsize)
cohens_d(apartments_data$Price, mu = 1900)
## Cohen's d | 95% CI
## ------------------------
## 5.34 | [4.49, 6.16]
interpret_cohens_d(5.34, rules = "sawilowsky2009")
## [1] "huge"
## (Rules: sawilowsky2009)
fit1 <- lm(Price ~ Age, data = apartments_data)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
correlation <- cor(apartments_data$Price, apartments_data$Age)
Estimate of Regression Coefficient (Age):
The estimate of the regression coefficient for “Age” is approximately -8.975. This value represents the estimated change in the “Price” of apartments for each one-unit change in “Age.” In this case, it suggests that, on average, for every additional year of “Age,” the price of apartments is estimated to decrease by approximately 8.975 EUR.
Coefficient of Correlation (r):
The coefficient of correlation (r) is not printed in this output, but it was computed above and stored in the object correlation; its value (about -0.23, as also shown in the correlation matrix below) quantifies the strength and direction of the linear relationship between “Price” and “Age.” The negative sign indicates that as “Age” increases, “Price” tends to decrease; a value close to -1 would indicate a strong negative linear correlation, while a value close to 1 would indicate a strong positive one.
Coefficient of Determination (R-squared):
The coefficient of determination (R-squared or R²) is a measure of how well the independent variable “Age” explains the variability in the dependent variable “Price.” In this model, the R-squared value is approximately 0.05302. This means that about 5.30% of the variability in apartment prices (“Price”) can be explained by the linear relationship with “Age.” In other words, the model is able to account for a relatively small portion of the variation in prices, indicating that other factors may also influence apartment prices.
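In a simple regression with a single predictor, R-squared is just the squared Pearson correlation, so the correlation object computed above can be used to cross-check the output; a short sketch:
print(correlation) #Pearson correlation between Price and Age, about -0.23
correlation^2 #Squared correlation, about 0.053, matching Multiple R-squared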
pairs(apartments_data[, c("Price", "Age", "Distance")])
correlation_matrix <- cor(apartments_data[, c("Price", "Age", "Distance")])
print(correlation_matrix)
## Price Age Distance
## Price 1.0000000 -0.23025502 -0.63102794
## Age -0.2302550 1.00000000 0.04290813
## Distance -0.6310279 0.04290813 1.00000000
fit2 <- lm(Price ~ Age + Distance, data = apartments_data)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
vif_values <- vif(fit2)
vif_values
## Age Distance
## 1.001845 1.001845
Based on the VIF values, it appears that there is relatively low multicollinearity between “Age” and “Distance” in the multiple regression model. Both predictors have VIF values close to 1, suggesting that they are not highly correlated with each other.
Low VIF values are a positive sign in regression analysis, as they indicate that the predictors do not suffer from severe multicollinearity, and the coefficients can be interpreted more reliably.
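The VIF for a predictor equals 1 / (1 - R-squared), where the R-squared comes from regressing that predictor on the remaining predictors; a short sketch reproducing the value reported for Age:
r2_age <- summary(lm(Age ~ Distance, data = apartments_data))$r.squared #Age regressed on the other predictor
1 / (1 - r2_age) #Should reproduce the VIF reported for Age, about 1.0018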
# Calculate standardized residuals for fit2
standardized_residuals <- rstandard(fit2)
# Calculate Cook's distances for fit2
cooksd <- cooks.distance(fit2)
# Identify problematic units (outliers or units with high influence)
# Let's consider units with standardized residuals greater than 2 or Cook's distance greater than 4 times the mean as potentially problematic
problematic_units <- which(standardized_residuals > 2 | cooksd > 4 * mean(cooksd))
# Remove problematic units from the dataset
apartments_data_cleaned <- apartments_data[-problematic_units, ]
# Re-estimate fit2 with the cleaned dataset
fit2_cleaned <- lm(Price ~ Age + Distance, data = apartments_data_cleaned)
# Show the summary of the cleaned model fit2_cleaned
summary(fit2_cleaned)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
I suspect that we have heteroskedasticity in this regression model because, when examining the scatterplot of standardized residuals against the fitted values, we can observe a funnel-like pattern where the spread of residuals systematically increases or decreases as the predicted values change. This pattern suggests that the variance of the residuals is not constant across the range of fitted values, which is a characteristic indicator of heteroskedasticity in regression analysis.
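A minimal sketch to produce the residuals-versus-fitted plot described above, together with a formal score test for non-constant error variance from the car package (already loaded), could look like this:
plot(fitted(fit2_cleaned), rstandard(fit2_cleaned),
     xlab = "Fitted values",
     ylab = "Standardized residuals") #A funnel-like spread suggests heteroskedasticity
abline(h = 0, lty = 2)
ncvTest(fit2_cleaned) #Non-constant variance score test; a small p-value indicates heteroskedasticity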
Intercept (Intercept):
The intercept represents the estimated expected value of the dependent variable (Price) when both Age and Distance are equal to zero. In the context of this dataset it may not have a direct practical interpretation, as Age and Distance are unlikely to both be zero for any apartment.
Age:
The coefficient for Age is -8.674. This means that, holding Distance constant, for each one-year increase in Age, the Price is estimated to decrease by approximately 8.674 EUR. The t-value of -2.693 and the associated p-value of 0.00869 indicate that Age is statistically significant at the 0.01 level. In other words, Age has a significant effect on Price.
Distance:
The coefficient for Distance is -24.063. This means that, holding Age constant, for each one-unit increase in Distance, the Price is estimated to decrease by approximately 24.063 EUR. The t-value of -8.939 and the extremely low p-value (1.57e-13) indicate that Distance is highly statistically significant. Distance has a strong and significant effect on Price.
Residuals:
The residual standard error is 256.8. This represents the estimated standard deviation of the residuals, which measures the spread of the residuals around the fitted values. It provides a measure of the model’s goodness of fit.
Multiple R-squared and Adjusted R-squared:
Multiple R-squared (0.5361) represents the proportion of the variance in Price that is explained by Age and Distance together. In this case, approximately 53.61% of the variability in Price is explained by the model. Adjusted R-squared (0.524) adjusts the R-squared value for the number of predictors. It provides a more realistic estimate of how well the model fits the data while penalizing for the number of predictors.
F-statistic and p-value:
The F-statistic tests the overall significance of the model. In this case, the F-statistic is 44.49 with an extremely low p-value (1.437e-13), indicating that the model as a whole is highly significant.
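Both the adjusted R-squared and the F-statistic can be reproduced from R-squared, the sample size (n = 80 after removing the problematic units) and the number of predictors (p = 2); a quick hand check:
r2 <- 0.5361; n <- 80; p <- 2
1 - (1 - r2) * (n - 1) / (n - p - 1) #Adjusted R-squared, about 0.524
(r2 / p) / ((1 - r2) / (n - p - 1)) #F-statistic, about 44.5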
apartments_data_cleaned$Parking <- as.factor(apartments_data_cleaned$Parking)
apartments_data_cleaned$Balcony <- as.factor(apartments_data_cleaned$Balcony)
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments_data_cleaned)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## Parking1 128.700 60.801 2.117 0.0376 *
## Balcony1 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13