data("LifeCycleSavings") #since dataset do not include categorical variable we will make it
mydata <- (LifeCycleSavings) #to get mydata name instead of LifeCycleSaving, it will be easier later to write functions
head(mydata) #to show me first 6 units and variables of the dataset
## sr pop15 pop75 dpi ddpi
## Australia 11.43 29.35 2.87 2329.68 2.87
## Austria 12.07 23.32 4.41 1507.99 3.93
## Belgium 13.17 23.80 4.43 2108.47 3.82
## Bolivia 5.75 41.89 1.67 189.13 0.22
## Brazil 12.88 42.19 0.83 728.47 4.56
## Canada 8.79 31.72 2.85 2982.88 2.43
EXPLANATION OF VARIABLES: sr = aggregate personal savings rate, pop15 = % of population under 15, pop75 = % of population over 75, dpi = real per-capita disposable income, ddpi = % growth rate of dpi.
mean(mydata$dpi) #the mean disposable income across all countries; based on it we add a categorical variable: a country with dpi above 1106.758 is denoted as high income, otherwise as low income
## [1] 1106.758
Creating a new categorical variable:
mydata$IncomeGroup <- ifelse(
mydata$dpi > mean(mydata$dpi),
0, 1) #ifelse() evaluates a condition for each element of a vector and returns one value when it is TRUE and another when it is FALSE; here dpi > mean gives 0 (high income country) and otherwise 1 (low income country)
Mode and standard deviation of disposable income:
#install.packages("modeest") #if we want to calculate mode we need to install modeest package, after we did we need to insert # infront of istall so it does not keep installing it all the time
library(modeest)
mlv(mydata$dpi)
## Warning: argument 'method' is missing. Data are supposed to be continuous.
## Default method 'shorth' is used
## [1] 330.796
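The warning appears because we did not choose a method. Passing it explicitly silences the warning; a minimal sketch using the same default method the warning names:
mlv(mydata$dpi, method = "shorth") #same estimate as above, but without the warning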
sd(mydata$dpi)
## [1] 990.8689
SD for all variables separately:
sapply(mydata, FUN = sd) #apply sd() to every column at once
## sr pop15 pop75 dpi ddpi IncomeGroup
## 4.4804069 9.1517272 1.2907714 990.8688890 2.8698706 0.4948717
mydata$IncomeGroup <- factor(mydata$IncomeGroup,
levels = c(0,1),
labels = c("High Income Country", "Low Income Country")) #we could have written the labels directly in the ifelse() call above instead of 0 and 1; this line shows the factor() approach instead
If we want to modify values inside the data: say we want to change Austria's pop75 from 4.41% to 5%. In our case this is not useful, since the original number is not incorrect, but if we had such a problem we would do it like this:
mydata[2,3] <- 5 #row 2 (Austria), column 3 (pop75) now holds 5%
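Because the rows are named after countries and the columns after variables, the same cell can also be addressed by name, which is less error-prone (a sketch with the same effect as the line above):
mydata["Austria", "pop75"] <- 5 #identical result to mydata[2,3] <- 5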
rm(data_newvariable) #I made a mistake, so I wanted to remove that dataset from the environment; rm() is the function for this
## Warning in rm(data_newvariable): object 'data_newvariable' not found
If we want to rename the variable sr to SavingsRate, dpi to DisposableIncome and ddpi to GrowthDisposableIncom, and create a new data set:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data_newvariable <- mydata %>% rename(SavingsRate = sr, DisposableIncome = dpi, GrowthDisposableIncom = ddpi) #sr is now SavingsRate; assigning the result to data_newvariable adds a new dataset to the environment
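The same renaming can be done in base R without dplyr; a sketch for a single column, done on a copy (tmp is a hypothetical name) so that mydata itself keeps its original names:
tmp <- mydata
names(tmp)[names(tmp) == "sr"] <- "SavingsRate" #find the column called sr and overwrite its name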
To show descriptive statistics we can use several functions, such as summary() and str(), but the most comprehensive is stat.desc(), so we will use that one:
#install.packages("pastecs")
library (pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
round(stat.desc(data_newvariable[,1:5]),2) #stat.desc() on the first five columns, with the output rounded to two decimals
## SavingsRate pop15 pop75 DisposableIncome GrowthDisposableIncom
## nbr.val 50.00 50.00 50.00 50.00 50.00
## nbr.null 0.00 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00
## min 0.60 21.44 0.56 88.94 0.22
## max 21.10 47.64 5.00 4001.89 16.71
## range 20.50 26.20 4.44 3912.95 16.49
## sum 483.55 1754.48 115.24 55337.92 187.88
## median 10.51 32.58 2.17 695.66 3.00
## mean 9.67 35.09 2.30 1106.76 3.76
## SE.mean 0.63 1.29 0.19 140.13 0.41
## CI.mean.0.95 1.27 2.60 0.37 281.60 0.82
## var 20.07 83.75 1.72 981821.16 8.24
## std.dev 4.48 9.15 1.31 990.87 2.87
## coef.var 0.46 0.26 0.57 0.90 0.76
library(ggplot2)
ggplot(data_newvariable, aes(x=SavingsRate))+
geom_histogram(binwidth = 5, colour = "grey", fill= "pink")+
facet_wrap(~IncomeGroup, ncol=1)+
ylab("Frequency") #fill is a fixed colour here (not an aesthetic), so a fill legend label would be ignored
ggplot(data_newvariable, aes(x=DisposableIncome))+
geom_histogram(binwidth = 1000, colour = "grey", fill= "pink")+
facet_wrap(~IncomeGroup, ncol=1)+
ylab("Frequency")
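If we did want a legend for the income groups, fill would have to be mapped inside aes() rather than set to a fixed colour; a minimal sketch:
ggplot(data_newvariable, aes(x = DisposableIncome, fill = IncomeGroup)) +
geom_histogram(binwidth = 1000, colour = "grey") +
ylab("Frequency") +
labs(fill = "Income of Country") #the label now applies, because fill is an aesthetic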
library(readxl)
mydata2 <- read_xlsx("./Business School.xlsx")
mydata2 <- as.data.frame(mydata2)
head(mydata2)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
library(dplyr)
mydata2 <- mydata2 %>% rename(
UndergradDegree = `Undergrad Degree`,
UndergradGrade = `Undergrad Grade`,
MBAGrade = `MBA Grade`,
WorkExperience = `Work Experience`,
EmployBefore = `Employability (Before)`,
EmployAfter = `Employability (After)`,
AnnualSalary = `Annual Salary`) #I renamed the variables to names without spaces, since those are easier to use in the functions that follow
str(mydata2$UndergradDegree)
## chr [1:100] "Business" "Computer Science" "Finance" "Business" "Finance" ...
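str() shows that UndergradDegree is stored as character; geom_bar() handles that, but we can make the categorical nature explicit by converting it to a factor (a sketch):
mydata2$UndergradDegree <- factor(mydata2$UndergradDegree) #the levels are the distinct degree names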
ggplot(mydata2, aes(x= UndergradDegree))+
geom_bar(colour = "lightblue1", fill= "lightblue")+
ylab("Frequency") #UndergradDegree is categorical, so we use geom_bar(); binwidth applies only to histograms of continuous variables
library(pastecs)
options(scipen = 999) # turn off scientific notation
round(stat.desc(mydata2$AnnualSalary),2)
## nbr.val nbr.null nbr.na min max
## 100.00 0.00 0.00 20000.00 340000.00
## range sum median mean SE.mean
## 320000.00 10905800.00 103500.00 109058.00 4150.15
## CI.mean.0.95 var std.dev coef.var
## 8234.80 1722373474.75 41501.49 0.38
library(ggplot2)
ggplot(mydata2, aes(x = AnnualSalary)) +
geom_histogram(binwidth = 20000, # adjust bin width for clarity
fill = "skyblue",
color = "black") +
labs(title = "Histogram of Annual Salary",
x = "Annual Salary",
y = "Frequency") +
theme_minimal()
library(psych)
## Registered S3 method overwritten by 'psych':
## method from
## plot.residuals rmutil
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(mydata2$AnnualSalary)
## vars n mean sd median trimmed mad min max range skew
## X1 1 100 109058 41501.49 103500 104600.2 25945.5 20000 340000 320000 2.22
## kurtosis se
## X1 9.41 4150.15
describe(mydata2$MBAGrade)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 76.04 7.68 76.38 76.18 8.29 58.14 95 36.86 -0.15 -0.59 0.77
t.test(mydata2$MBAGrade,
mu=74,
alternative = "two.sided") #To test siginiciance of hypothesis we do t-test so we can compare p-values with alpha and make conclussion about H0 hypothesis.
##
## One Sample t-test
##
## data: mydata2$MBAGrade
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
To interpret the effect size we compute Cohen's d:
mean_MBAGrades <- mean(mydata2$MBAGrade, na.rm = TRUE)
sd_MBAGrades <- sd(mydata2$MBAGrade, na.rm = TRUE)
cohen_d <- (mean_MBAGrades - 74) / sd_MBAGrades #to calculate it in R we first stored the mean and the sd; na.rm = TRUE guards against missing values
print(cohen_d)
## [1] 0.2658658
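As a cross-check, a sketch using the lsr package (assuming it is installed); its cohensD() covers the one-sample case through the mu argument:
#install.packages("lsr")
library(lsr)
cohensD(mydata2$MBAGrade, mu = 74) #should match the manual calculation above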
mydata3 <- read_excel("./Apartments.xlsx")
mydata3 <- as.data.frame(mydata3)
Description:
mydata3$ParkingFactor <- factor(mydata3$Parking,
levels = c(0,1),
labels = c("No","Yes"))
mydata3$BalconyFactor <- factor(mydata3$Balcony,
levels = c(0,1),
labels = c("No","Yes"))
describe(mydata3$Price)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 85 2018.94 377.84 1950 1990.29 429.95 1400 2820 1420 0.54 -0.69
## se
## X1 40.98
t.test(mydata3$Price,
mu=1900,
alternative = "two.sided") #To test siginiciance of hypothesis we do t-test so we can compare p-values with alpha and make conclussion about H0 hypothesis.
##
## One Sample t-test
##
## data: mydata3$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1<- lm( Price ~ Age,
data = mydata3)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <0.0000000000000002 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(mydata3$Price,mydata3$Age)
## [1] -0.230255
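The correlation itself can be tested formally with base R's cor.test(), which adds a p-value and a confidence interval:
cor.test(mydata3$Price, mydata3$Age) #tests H0: the population correlation equals 0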
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplotMatrix(~ Price + Age + Distance, data = mydata3,
smooth = FALSE) #matrix of pairwise scatter plots of Price, Age and Distance
fit2 <- lm( Price ~ Age + Distance,
data = mydata3)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 0.0000000000000002 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 0.0000000000618 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 0.00000000004896
vif(fit2) #variance inflation factors; values close to 1 indicate no multicollinearity between Age and Distance
## Age Distance
## 1.001845 1.001845
# Standardized residuals and Cook's distance for every observation
mydata3$StdResid <- round(rstandard(fit2),3)
mydata3$CooksD <- round(cooks.distance(fit2),3)
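One common rule of thumb (one of several in the literature) flags cases whose Cook's distance exceeds 4/n; a sketch:
n <- nrow(mydata3)
which(mydata3$CooksD > 4/n) #with n = 85 the cutoff is about 0.047, which flags rows 38, 55, 33, 53 and 22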
hist(mydata3$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distance") #to check whether some influential cases need to be removed
hist(mydata3$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of Standardized residuals") #to check whether some cases need to be removed
head(mydata3[order(mydata3$StdResid),], 10) #the 10 smallest standardized residuals
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor StdResid
## 53 7 2 1760 0 1 No Yes -2.152
## 13 12 14 1650 0 1 No Yes -1.499
## 72 12 14 1650 0 0 No No -1.499
## 20 13 8 1800 0 0 No No -1.381
## 35 14 16 1660 0 1 No Yes -1.261
## 36 24 5 1830 1 0 Yes No -1.189
## 54 30 17 1560 0 0 No No -1.101
## 5 18 18 1640 1 1 Yes Yes -1.073
## 64 18 18 1640 1 1 Yes Yes -1.073
## 28 18 19 1620 1 0 Yes No -1.071
## CooksD
## 53 0.066
## 13 0.013
## 72 0.013
## 20 0.012
## 35 0.008
## 36 0.012
## 54 0.012
## 5 0.005
## 64 0.005
## 28 0.005
head(mydata3[order(-mydata3$StdResid),], 10) #the 10 largest standardized residuals
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor StdResid
## 38 5 45 2180 1 1 Yes Yes 2.577
## 33 2 11 2790 1 0 Yes No 2.051
## 2 18 1 2800 1 0 Yes No 1.783
## 61 18 1 2800 1 1 Yes Yes 1.783
## 58 8 2 2820 1 0 Yes No 1.655
## 57 10 1 2810 0 0 No No 1.601
## 22 37 3 2540 1 1 Yes Yes 1.576
## 25 8 26 2300 1 1 Yes Yes 1.571
## 11 12 5 2700 1 0 Yes No 1.551
## 70 12 5 2700 1 1 Yes Yes 1.551
## CooksD
## 38 0.320
## 33 0.069
## 2 0.030
## 61 0.030
## 58 0.037
## 57 0.032
## 22 0.061
## 25 0.034
## 11 0.020
## 70 0.020
head(mydata3[order(-mydata3$CooksD),], 20) #the 20 largest Cook's distances, to see which cases to consider removing
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor StdResid
## 38 5 45 2180 1 1 Yes Yes 2.577
## 55 43 37 1740 0 0 No No 1.445
## 33 2 11 2790 1 0 Yes No 2.051
## 53 7 2 1760 0 1 No Yes -2.152
## 22 37 3 2540 1 1 Yes Yes 1.576
## 39 40 2 2400 0 1 No Yes 1.091
## 58 8 2 2820 1 0 Yes No 1.655
## 25 8 26 2300 1 1 Yes Yes 1.571
## 57 10 1 2810 0 0 No No 1.601
## 2 18 1 2800 1 0 Yes No 1.783
## 31 45 21 1910 0 1 No Yes 0.889
## 61 18 1 2800 1 1 Yes Yes 1.783
## 27 16 1 2750 1 0 Yes No 1.550
## 48 38 13 1610 0 1 No Yes -1.009
## 80 38 13 1610 0 0 No No -1.009
## 11 12 5 2700 1 0 Yes No 1.551
## 70 12 5 2700 1 1 Yes Yes 1.551
## 46 14 25 2250 1 0 Yes No 1.478
## 78 14 25 2250 1 1 Yes Yes 1.478
## 10 25 2 2570 1 0 Yes No 1.241
## CooksD
## 38 0.320
## 55 0.104
## 33 0.069
## 53 0.066
## 22 0.061
## 39 0.038
## 58 0.037
## 25 0.034
## 57 0.032
## 2 0.030
## 31 0.030
## 61 0.030
## 27 0.023
## 48 0.022
## 80 0.022
## 11 0.020
## 70 0.020
## 46 0.019
## 78 0.019
## 10 0.017
We have checked the data set both by standardized residuals (from smallest to largest) and by Cook's D (from largest to smallest) in order to detect influential values. The standardized residuals lie roughly between -2.2 and +2.6, which on its own would not justify removing any case, but rows 38, 55, 33, 53 and 22 clearly stand out in the Cook's D histogram, so we will remove those.
To delete them, we first create an ID variable and then drop the rows with the filter() function:
mydata3$ID <- 1:nrow(mydata3) #a new ID variable that numbers the rows
library(dplyr)
mydata3_copy <- mydata3 #we work on a copy; if we deleted the rows from the original data set, we could not run the scatter plot in the next task
mydata3_copy <- mydata3_copy %>% filter(!ID %in% c(38,55,33,53,22)) #drop rows 38, 55, 33, 53 and 22, which Cook's distance identified as influential
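The same deletion can be written in base R without dplyr (a sketch, with a hypothetical name for the result):
mydata3_base <- mydata3[!(mydata3$ID %in% c(38, 55, 33, 53, 22)), ] #logical indexing keeps every row except the five listed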
mydata3$StdFitted <- scale(fit2$fitted.values) #standardized fitted values for the residual plot
library(car)
scatterplot(y = mydata3$StdResid, x = mydata3$StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE) #residuals vs. fitted values, a visual check of linearity and constant variance
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ---------------------------
## DF = 1
## Chi2 = 0.968106
## Prob > Chi2 = 0.325153
Since the Breusch-Pagan p-value (0.325) > alpha (5%), we cannot reject H0 of constant variance, so heteroskedasticity is not a problem here.
hist(mydata3$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of Standardized residuals") #to check the assumption of normally distributed errors: the histogram is the visual check, the Shapiro-Wilk test below the formal one
shapiro.test(mydata3$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata3$StdResid
## W = 0.95303, p-value = 0.003645
H0: errors are normally distributed
H1: errors are not normally distributed
Even though we did not remove any cases from mydata3 itself (see the arguments above, where the influential rows were identified), the p-value < alpha (5%), so we reject H0 at the 95% confidence level: the errors are not normally distributed and we cannot fully rely on the t-tests.
hist(mydata3_copy$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distance") #To check if we need to remove some influentical cases
fit2 <- lm( Price ~ Age + Distance,
data = mydata3_copy) #refit the model on the data without the five influential cases
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata3_copy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 0.0000000000000002 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 0.000000000000157 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 0.0000000000001437
sqrt(summary(fit2)$r.squared) # Coefficient of multiple correlation
## [1] 0.732187
Since the p-value is < alpha (1%) for both coefficients, the tests are statistically significant, meaning that age and distance affect the price of an apartment.
Estimate of the regression coefficient AGE: if the age of an apartment increases by 1 year, the price (p = 0.00869) will on average decrease by 8.674 EUR per m2, assuming distance remains unchanged.
Estimate of the regression coefficient DISTANCE: if the distance from the city centre increases by 1 km, the price (p < 0.001) will on average decrease by 24.063 EUR per m2, assuming age remains unchanged.
Coefficient of determination: 53.61% of the variability in apartment prices is explained by the linear effect of apartment age and distance.
Coefficient of multiple correlation: 0.73, so the linear relationship between price and the two predictors together is strong.
F-statistic: 44.49 with a p-value below 0.001, which denotes a highly significant model.
fit3 <- lm( Price ~ Age + Distance + ParkingFactor + BalconyFactor,
data = mydata3_copy)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor,
## data = mydata3_copy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 0.0000000000000002 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 0.0000000000339 ***
## ParkingFactorYes 128.700 60.801 2.117 0.0376 *
## BalconyFactorYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 0.0000000000007764
anova(fit2,fit3) #comparison of two nested regression models that differ in the number of explanatory variables
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingFactor + BalconyFactor
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
H0: change in R squared = 0 (the added variables do not improve the model)
H1: change in R squared ≠ 0
Since the p-value (0.1135) > alpha (5%), we cannot reject H0, meaning that model fit3 is not significantly better than model fit2.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor,
## data = mydata3_copy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 0.0000000000000002 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 0.0000000000339 ***
## ParkingFactorYes 128.700 60.801 2.117 0.0376 *
## BalconyFactorYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 0.0000000000007764
Since the p-value is < alpha (5%) for Age, Distance and ParkingFactor, those tests are statistically significant; BalconyFactor (p = 0.9165) is not significant.
Estimate of the regression coefficient AGE: if the age of an apartment increases by 1 year, the price (p = 0.0147) will on average decrease by 7.970 EUR per m2, assuming all other variables remain unchanged.
Estimate of the regression coefficient DISTANCE: if the distance from the city centre increases by 1 km, the price (p < 0.001) will on average decrease by 21.961 EUR per m2, assuming all other variables remain unchanged.
Estimate of the (dummy) regression coefficient PARKING: given age, distance and balcony, an apartment with parking is on average 128.7 EUR per m2 more expensive (p = 0.0376).
Coefficient of determination: 56.23% of the variability in apartment prices is explained by the linear effect of age, distance, parking and balcony.
F-statistic: 24.08 with a p-value below 0.001, which denotes a highly significant model.
The hypotheses of the overall F-test (ANOVA) are:
H0: R squared = 0 (the predictors do not explain any variance)
H1: R squared > 0 (the predictors explain some variance)
It tests whether the model as a whole is significant.
mydata3_copy$Fitted <- fitted.values(fit3) #predicted prices from the final model
mydata3_copy$Residuals <- residuals(fit3) #observed minus fitted prices
head(mydata3_copy[colnames(mydata3_copy) %in% c("ID", "Fitted", "Residuals")])
## ID Fitted Residuals
## 1 1 1728.641 -88.64095
## 2 2 2356.597 443.40256
## 3 3 1722.609 -62.60903
## 4 4 1539.312 310.68782
## 5 5 1989.286 -349.28625
## 6 6 1912.655 -142.65528
sum(mydata3_copy$Residuals) #OLS residuals sum to (numerically) zero
## [1] -0.0000000000004334311
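As a final sanity check (a sketch; unname() drops the row names that fitted.values() attaches), the fitted values and residuals should add back up exactly to the observed prices:
all.equal(unname(mydata3_copy$Fitted + mydata3_copy$Residuals), mydata3_copy$Price) #TRUE: fitted + residual reconstructs each price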