# Import the data; check.names = FALSE keeps the CSV headers (with spaces) as-is
mydata <- read.table(
  file = "./Sleep_health_and_lifestyle_dataset.csv",
  header = TRUE,
  sep = ",",
  check.names = FALSE
)
head(mydata)
## Person ID Gender Age Occupation Sleep Duration Quality of Sleep Physical Activity Level
## 1 1 Male 27 Software Engineer 6.1 6 42
## 2 2 Male 28 Doctor 6.2 6 60
## 3 3 Male 28 Doctor 6.2 6 60
## 4 4 Male 28 Sales Representative 5.9 4 30
## 5 5 Male 28 Sales Representative 5.9 4 30
## 6 6 Male 28 Software Engineer 5.9 4 30
## Stress Level BMI Category Blood Pressure Heart Rate Daily Steps Sleep Disorder
## 1 6 Overweight 126/83 77 4200 None
## 2 8 Normal 125/80 75 10000 None
## 3 8 Normal 125/80 75 10000 None
## 4 8 Obese 140/90 85 3000 Sleep Apnea
## 5 8 Obese 140/90 85 3000 Sleep Apnea
## 6 8 Obese 140/90 85 3000 Insomnia
A data frame with 374 observations on the following 13 variables:
# Create a new variable called sleep efficiency:
# quality of sleep divided by sleep duration
mydata[["Sleep_Efficiency"]] <-
  mydata[["Quality of Sleep"]] / mydata[["Sleep Duration"]]
head(mydata)
## Person ID Gender Age Occupation Sleep Duration Quality of Sleep Physical Activity Level
## 1 1 Male 27 Software Engineer 6.1 6 42
## 2 2 Male 28 Doctor 6.2 6 60
## 3 3 Male 28 Doctor 6.2 6 60
## 4 4 Male 28 Sales Representative 5.9 4 30
## 5 5 Male 28 Sales Representative 5.9 4 30
## 6 6 Male 28 Software Engineer 5.9 4 30
## Stress Level BMI Category Blood Pressure Heart Rate Daily Steps Sleep Disorder Sleep_Efficiency
## 1 6 Overweight 126/83 77 4200 None 0.9836066
## 2 8 Normal 125/80 75 10000 None 0.9677419
## 3 8 Normal 125/80 75 10000 None 0.9677419
## 4 8 Obese 140/90 85 3000 Sleep Apnea 0.6779661
## 5 8 Obese 140/90 85 3000 Sleep Apnea 0.6779661
## 6 8 Obese 140/90 85 3000 Insomnia 0.6779661
# Delete units (rows) with missing data
library(tidyr)
mydata <- drop_na(mydata)
# Rename the fourth column (Occupation) to "Job"
names(mydata)[4] <- "Job"
head(mydata)
## Person ID Gender Age Job Sleep Duration Quality of Sleep Physical Activity Level
## 1 1 Male 27 Software Engineer 6.1 6 42
## 2 2 Male 28 Doctor 6.2 6 60
## 3 3 Male 28 Doctor 6.2 6 60
## 4 4 Male 28 Sales Representative 5.9 4 30
## 5 5 Male 28 Sales Representative 5.9 4 30
## 6 6 Male 28 Software Engineer 5.9 4 30
## Stress Level BMI Category Blood Pressure Heart Rate Daily Steps Sleep Disorder Sleep_Efficiency
## 1 6 Overweight 126/83 77 4200 None 0.9836066
## 2 8 Normal 125/80 75 10000 None 0.9677419
## 3 8 Normal 125/80 75 10000 None 0.9677419
## 4 8 Obese 140/90 85 3000 Sleep Apnea 0.6779661
## 5 8 Obese 140/90 85 3000 Sleep Apnea 0.6779661
## 6 8 Obese 140/90 85 3000 Insomnia 0.6779661
# Build a small example data frame by hand: four people described by job,
# age, gender and sleep disorder.
# data.frame() makes column names syntactic by default (check.names = TRUE),
# so "Sleep Disorder" is stored as Sleep.Disorder.
mydata2 <- data.frame(
  "Job" = c("Doctor", "Software Engineer", "Sales Representative", "Nurse"),
  "Age" = c(30, 26, 40, 34),
  "Gender" = c("Male", "Male", "Male", "Female"),
  "Sleep Disorder" = c("None", "Sleep Apnea", "Insomnia", "None")
)
print(mydata2)
## Job Age Gender Sleep.Disorder
## 1 Doctor 30 Male None
## 2 Software Engineer 26 Male Sleep Apnea
## 3 Sales Representative 40 Male Insomnia
## 4 Nurse 34 Female None
# Average sleep duration across all respondents
mean(mydata[["Sleep Duration"]])
## [1] 7.132086
library(pastecs)
##
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
##
## extract
# Descriptive statistics, rounded to 2 decimals, after dropping Person ID (1),
# Gender (2), Job (4), BMI Category (9), Blood Pressure (10), Sleep Disorder (13)
round(stat.desc(mydata[, -c(1, 2, 4, 9, 10, 13)]), digits = 2)
## Age Sleep Duration Quality of Sleep Physical Activity Level Stress Level
## nbr.val 374.00 374.00 374.00 374.00 374.00
## nbr.null 0.00 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00
## min 27.00 5.80 4.00 30.00 3.00
## max 59.00 8.50 9.00 90.00 8.00
## range 32.00 2.70 5.00 60.00 5.00
## sum 15777.00 2667.40 2735.00 22130.00 2014.00
## median 43.00 7.20 7.00 60.00 5.00
## mean 42.18 7.13 7.31 59.17 5.39
## SE.mean 0.45 0.04 0.06 1.08 0.09
## CI.mean.0.95 0.88 0.08 0.12 2.12 0.18
## var 75.22 0.63 1.43 433.92 3.15
## std.dev 8.67 0.80 1.20 20.83 1.77
## coef.var 0.21 0.11 0.16 0.35 0.33
## Heart Rate Daily Steps Sleep_Efficiency
## nbr.val 374.00 374.00 374.00
## nbr.null 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00
## min 65.00 3000.00 0.68
## max 86.00 10000.00 1.13
## range 21.00 7000.00 0.45
## sum 26242.00 2549500.00 382.00
## median 70.00 7000.00 1.04
## mean 70.17 6816.84 1.02
## SE.mean 0.21 83.66 0.00
## CI.mean.0.95 0.42 164.51 0.01
## var 17.10 2617651.14 0.01
## std.dev 4.14 1617.92 0.09
## coef.var 0.06 0.24 0.09
Explanations:
The average physical activity level is about 59.17 minutes of exercise per day.
The range of sleep duration is 2.7 hours (from 5.8 to 8.5 hours).
The median age is 43 years, meaning that half of the participants are younger than 43, and half are older.
# Histogram of sleep duration with bins of width 0.5 between 5 and 9
hist(
  x = mydata[["Sleep Duration"]],
  breaks = seq(5, 9, by = 0.5),
  main = "Distribution of Sleep duration",
  xlab = "Sleep duration",
  ylab = "Number of people"
)
The histogram titled “Distribution of Sleep Duration” shows how many hours people in the dataset typically sleep, ranging from 5 to 9 hours.
library(car)
## Loading required package: carData
# Sleep duration (y) against sleep quality (x); smoother switched off
scatterplot(
  x = mydata[["Quality of Sleep"]],
  y = mydata[["Sleep Duration"]],
  xlab = "Quality of sleep (1-10)",
  ylab = "Sleep duration in hours",
  smooth = FALSE
)
The scatter plot shows that longer sleep durations are associated with better quality of sleep. In other words, people who sleep more tend to rate their sleep quality higher.
# Keep only the three variables shown in the scatterplot matrix. Selecting them
# by name (instead of excluding eleven column positions) stays correct if
# columns are later added, removed or reordered.
mydata3 <- mydata[, c("Sleep Duration",
                      "Quality of Sleep",
                      "Physical Activity Level")]
library(car)
scatterplotMatrix(mydata3,
                  smooth = FALSE)
The scatter plot matrix shows the relationships between Sleep Duration, Quality of Sleep, and Physical Activity Level.
From the matrix, we can see that all three variables are positively correlated. This means that as one increases, the others tend to increase as well. For example, people who sleep longer also report better sleep quality and tend to engage in more physical activity.
# Read the workbook and convert the result to a plain data frame
library(readxl)
mydataTask2 <- as.data.frame(read_xlsx("./Business School.xlsx"))
head(mydataTask2)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience Employability (Before)
## 1 1 Business 68.4 90.2 No 252
## 2 2 Computer Science 70.2 68.7 Yes 101
## 3 3 Finance 76.4 83.3 No 401
## 4 4 Business 82.6 88.7 No 287
## 5 5 Finance 76.9 75.4 No 275
## 6 6 Computer Science 83.3 82.1 No 254
## Employability (After) Status Annual Salary
## 1 276 Placed 111000
## 2 119 Placed 107000
## 3 462 Placed 109000
## 4 342 Placed 148000
## 5 347 Placed 255500
## 6 313 Placed 103500
library(pastecs)
# Descriptive statistics for the numeric measures only. Drops Student ID (1),
# Undergrad Degree (2), Work Experience (5) and Status (8). The data frame has
# nine columns, so no index above 9 belongs in this exclusion list.
round(stat.desc(mydataTask2[, -c(1, 2, 5, 8)]), 2)
## Undergrad Grade MBA Grade Employability (Before) Employability (After) Annual Salary
## nbr.val 100.00 100.00 100.00 100.00 1.000000e+02
## nbr.null 0.00 0.00 0.00 0.00 0.000000e+00
## nbr.na 0.00 0.00 0.00 0.00 0.000000e+00
## min 61.20 58.14 101.00 119.00 2.000000e+04
## max 100.00 95.00 421.00 631.03 3.400000e+05
## range 38.80 36.86 320.00 512.03 3.200000e+05
## sum 7689.90 7604.06 25793.08 42269.06 1.090580e+07
## median 76.65 76.38 256.83 435.64 1.035000e+05
## mean 76.90 76.04 257.93 422.69 1.090580e+05
## SE.mean 0.75 0.77 5.93 12.92 4.150150e+03
## CI.mean.0.95 1.48 1.52 11.78 25.64 8.234800e+03
## var 55.68 58.91 3522.10 16701.30 1.722373e+09
## std.dev 7.46 7.68 59.35 129.23 4.150149e+04
## coef.var 0.10 0.10 0.23 0.31 3.800000e-01
summary(mydataTask2)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## Min. : 1.00 Length:100 Min. : 61.20 Min. :58.14 Length:100
## 1st Qu.: 25.75 Class :character 1st Qu.: 71.47 1st Qu.:71.14 Class :character
## Median : 50.50 Mode :character Median : 76.65 Median :76.38 Mode :character
## Mean : 50.50 Mean : 76.90 Mean :76.04
## 3rd Qu.: 75.25 3rd Qu.: 81.70 3rd Qu.:82.15
## Max. :100.00 Max. :100.00 Max. :95.00
## Employability (Before) Employability (After) Status Annual Salary
## Min. :101.0 Min. :119.0 Length:100 Min. : 20000
## 1st Qu.:245.8 1st Qu.:312.0 Class :character 1st Qu.: 87125
## Median :256.8 Median :435.6 Mode :character Median :103500
## Mean :257.9 Mean :422.7 Mean :109058
## 3rd Qu.:261.0 3rd Qu.:529.0 3rd Qu.:124000
## Max. :421.0 Max. :631.0 Max. :340000
# Give columns 2-5 and 9 underscore-separated names without spaces
colnames(mydataTask2)[c(2, 3, 4, 5, 9)] <- c(
  "Undergrad_Degree",
  "Undergrad_Grade",
  "MBA_Grade",
  "Work_Exp",
  "Annual_Salary"
)
head(mydataTask2)
## Student ID Undergrad_Degree Undergrad_Grade MBA_Grade Work_Exp Employability (Before)
## 1 1 Business 68.4 90.2 No 252
## 2 2 Computer Science 70.2 68.7 Yes 101
## 3 3 Finance 76.4 83.3 No 401
## 4 4 Business 82.6 88.7 No 287
## 5 5 Finance 76.9 75.4 No 275
## 6 6 Computer Science 83.3 82.1 No 254
## Employability (After) Status Annual_Salary
## 1 276 Placed 111000
## 2 119 Placed 107000
## 3 462 Placed 109000
## 4 342 Placed 148000
## 5 347 Placed 255500
## 6 313 Placed 103500
library(ggplot2)
# Bar chart: number of students per undergraduate degree
ggplot(data = mydataTask2, mapping = aes(x = Undergrad_Degree)) +
  geom_bar() +
  labs(x = "Undergrad Degree", y = "Frequency")
# Strongly penalize scientific notation so numbers print in fixed notation
options(scipen = 1000)
# Histogram of annual salary in bins of width 10000
ggplot(data = mydataTask2, mapping = aes(x = Annual_Salary)) +
  geom_histogram(
    binwidth = 10000,
    fill = "lightblue",
    color = "black",
    position = "dodge"
  ) +
  labs(x = "Annual Salary", y = "Frequency") +
  theme_minimal()
The highest salary is 340,000, with an average of 109,058. Half of the MBA students make more than 103,500. The distribution looks right-skewed, and there are some outliers.
# Sample mean and standard deviation of the MBA grades
mean(mydataTask2$MBA_Grade)
## [1] 76.04055
sd(mydataTask2$MBA_Grade)
## [1] 7.675114
# Two-sided one-sample t-test of H0: the true mean MBA grade equals 74
t.test(mydataTask2$MBA_Grade,
mu = 74,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydataTask2$MBA_Grade
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
We can reject the null hypothesis (p = 0.009, i.e. p < 0.01) and conclude that the true mean is not equal to 74. The 95% confidence interval shows that the true mean lies between 74.52 and 77.56.
library(readxl)
# Load the apartments workbook (read_xlsx returns a tibble)
apartments <- read_xlsx(path = "./Apartments.xlsx")
head(apartments)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
# Recode the 0/1 indicators as factors labelled "No"/"Yes"
apartments[["ParkingF"]] <- factor(
  apartments[["Parking"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
apartments[["BalconyF"]] <- factor(
  apartments[["Balcony"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
head(apartments)
## # A tibble: 6 × 7
## Age Distance Price Parking Balcony ParkingF BalconyF
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 7 28 1640 0 1 No Yes
## 2 18 1 2800 1 0 Yes No
## 3 7 28 1660 0 0 No No
## 4 28 29 1850 0 1 No Yes
## 5 18 18 1640 1 1 Yes Yes
## 6 28 12 1770 0 1 No Yes
# Two-sided one-sample t-test of H0: the true mean price equals 1900
t.test(apartments$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Since the p-value (p = 0.004731) is less than 0.05, we can reject the null hypothesis and conclude that the true mean is not 1900. The 95% confidence interval suggests the true mean is between 1937.44 and 2100.44.
# Simple linear regression of price on age
fit1 <- lm(formula = Price ~ Age, data = apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <0.0000000000000002 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# Correlation coefficient between price and age (cor() defaults to Pearson)
cor(apartments$Price, apartments$Age)
## [1] -0.230255
library(car)
# Pairwise scatterplots of the three numeric variables, smoother switched off
scatterplotMatrix(
  apartments[, c("Age", "Distance", "Price")],
  smooth = FALSE
)
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Correlation matrix with p-values for Age, Distance and Price (columns 1-3)
rcorr(as.matrix(apartments[, 1:3]))
## Age Distance Price
## Age 1.00 0.04 -0.23
## Distance 0.04 1.00 -0.63
## Price -0.23 -0.63 1.00
##
## n= 85
##
##
## P
## Age Distance Price
## Age 0.6966 0.0340
## Distance 0.6966 0.0000
## Price 0.0340 0.0000
As we can see from the correlation matrix and the Pearson correlation coefficient, there is a weak correlation between Age and Distance and Age and Price, but there is a strong correlation between Price and Distance.
# Multiple regression of price on age and distance
fit2 <- lm(formula = Price ~ Age + Distance, data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 0.0000000000000002 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 0.0000000000618 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 0.00000000004896
# Variance inflation factor of each regressor in fit2
vif(fit2)
## Age Distance
## 1.001845 1.001845
# Average VIF across the regressors
mean(vif(fit2))
## [1] 1.001845
Multicollinearity isn’t an issue because the VIF for both Age and Distance is below 5.
# Store fit2's standardized residuals and Cook's distances (3 decimals),
# then list the six observations with the largest Cook's distance
apartments$StdResid <- round(rstandard(fit2), digits = 3)
apartments$CooksD <- round(cooks.distance(fit2), digits = 3)
head(apartments[order(-apartments$CooksD), ], n = 6)
## # A tibble: 6 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32
## 2 43 37 1740 0 0 No No 1.44 0.104
## 3 2 11 2790 1 0 Yes No 2.05 0.069
## 4 7 2 1760 0 1 No Yes -2.15 0.066
## 5 37 3 2540 1 1 Yes Yes 1.58 0.061
## 6 40 2 2400 0 1 No Yes 1.09 0.038
# Distribution of the standardized residuals
hist(
  x = apartments$StdResid,
  main = "Histogram of standardized residuals",
  xlab = "Standardized residuals",
  ylab = "Frequency"
)
We can see that all standardized residuals are between -3 and 3.
# Distribution of Cook's distances, followed by the six largest values
hist(
  x = apartments$CooksD,
  main = "Histogram of Cooks distances",
  xlab = "Cooks distance",
  ylab = "Frequency"
)
head(apartments[order(-apartments$CooksD), ], n = 6)
## # A tibble: 6 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32
## 2 43 37 1740 0 0 No No 1.44 0.104
## 3 2 11 2790 1 0 Yes No 2.05 0.069
## 4 7 2 1760 0 1 No Yes -2.15 0.066
## 5 37 3 2540 1 1 Yes Yes 1.58 0.061
## 6 40 2 2400 0 1 No Yes 1.09 0.038
We will remove the 2 apartments with the highest Cook's distances.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Remove the two apartments with the largest Cook's distance.
# They are located by rank rather than by testing the rounded distances for
# equality with hard-coded decimals: floating-point equality is fragile and
# would also drop any other row that happens to share the same rounded value.
# Negative row indexing keeps the remaining rows in their original order.
most_influential <- order(apartments$CooksD, decreasing = TRUE)[seq_len(2)]
apartments <- apartments[-most_influential, ]
# Re-estimate the price model on the reduced data set
fit3 <- lm(formula = Price ~ Age + Distance, data = apartments)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -627.27 -212.96 -46.23 205.05 578.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2490.112 76.189 32.684 < 0.0000000000000002 ***
## Age -7.850 3.244 -2.420 0.0178 *
## Distance -23.945 2.826 -8.473 0.000000000000953 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.5 on 80 degrees of freedom
## Multiple R-squared: 0.4968, Adjusted R-squared: 0.4842
## F-statistic: 39.49 on 2 and 80 DF, p-value: 0.000000000001173
# Recompute the diagnostics for fit3, then check the standardized residuals
# with a histogram and the Shapiro-Wilk normality test
apartments$StdResid <- round(rstandard(fit3), 3) #Standardized residuals
apartments$CooksD <- round(cooks.distance(fit3), 3) #Cooks distances
hist(apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: apartments$StdResid
## W = 0.95952, p-value = 0.01044
# Distribution of Cook's distances for fit3
hist(apartments$CooksD,
     xlab = "Cooks distance",
     ylab = "Frequency",
     main = "Histogram of Cooks distances")
# scale() returns a one-column matrix carrying centering/scaling attributes;
# as.numeric() stores the standardized fitted values as a plain numeric column.
apartments$StdFitted <- as.numeric(scale(fit3$fitted.values))
library(car)
# Standardized residuals against standardized fitted values
scatterplot(y = apartments$StdResid, x = apartments$StdFitted,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE)
We will run the Breusch-Pagan test to check for heteroskedasticity.
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# Breusch-Pagan test on fit3 (H0: the error variance is constant)
ols_test_breusch_pagan(fit3)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 3.775135
## Prob > Chi2 = 0.05201969
Since the p-value is greater than 0.05, we can’t reject the null hypothesis that the error variance is constant. Therefore, we assume homoskedasticity.
# Same diagnostics as computed earlier for fit3; fit3 has not changed since,
# so the stored values and the test result are identical
apartments$StdResid <- round(rstandard(fit3), 3) #Standardized residuals
apartments$CooksD <- round(cooks.distance(fit3), 3) #Cooks distances
hist(apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: apartments$StdResid
## W = 0.95952, p-value = 0.01044
H0: Standardized residuals are distributed normally
H1: Standardized residuals are not distributed normally
p = 0.010; we can reject H0 at the 5% significance level (p = 0.010) and accept H1. Standardized residuals are not normally distributed.
# Fit the price model on age and distance again and show its summary
fit3 <- lm(formula = Price ~ Age + Distance, data = apartments)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -627.27 -212.96 -46.23 205.05 578.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2490.112 76.189 32.684 < 0.0000000000000002 ***
## Age -7.850 3.244 -2.420 0.0178 *
## Distance -23.945 2.826 -8.473 0.000000000000953 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.5 on 80 degrees of freedom
## Multiple R-squared: 0.4968, Adjusted R-squared: 0.4842
## F-statistic: 39.49 on 2 and 80 DF, p-value: 0.000000000001173
If an apartment gets 1 year older, the price per m2 decreases by 7.85 EUR on average (p < 0.05), given the other explanatory variables unchanged.
If the distance of the apartment from the city center increases by 1 km, the price of the apartment decreases by 23.945 EUR per m2 on average (p < 0.001), given the other explanatory variables unchanged.
The average price per m2 of a new apartment (assuming age = 0) located directly in the center (assuming that distance from the city center = 0) is 2490.11 EUR (p < 0.001), given the other explanatory variables unchanged.
I save it as fit4, because I have already used fit3
# Extended model: adds the parking and balcony indicators to age and distance
fit4 <- lm(
  formula = Price ~ Age + Distance + Parking + Balcony,
  data = apartments
)
summary(fit4)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -499.06 -194.33 -32.04 219.03 544.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2358.900 93.664 25.185 < 0.0000000000000002 ***
## Age -7.197 3.148 -2.286 0.02499 *
## Distance -21.241 2.911 -7.296 0.000000000214 ***
## Parking 168.921 62.166 2.717 0.00811 **
## Balcony -6.985 58.745 -0.119 0.90566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264.5 on 78 degrees of freedom
## Multiple R-squared: 0.5408, Adjusted R-squared: 0.5173
## F-statistic: 22.97 on 4 and 78 DF, p-value: 0.000000000001449
# Compare the nested models fit3 and fit4 with an F-test
anova(fit3,fit4)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 80 5982100
## 2 78 5458696 2 523404 3.7395 0.02813 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: Model 1 is better
H1: Model 2 is better
Since the p-value is 0.028, we reject the null hypothesis. Model 2 is better, as it explains more of the variability in apartment prices.
# Coefficient table and overall F-statistic for fit4
summary(fit4)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -499.06 -194.33 -32.04 219.03 544.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2358.900 93.664 25.185 < 0.0000000000000002 ***
## Age -7.197 3.148 -2.286 0.02499 *
## Distance -21.241 2.911 -7.296 0.000000000214 ***
## Parking 168.921 62.166 2.717 0.00811 **
## Balcony -6.985 58.745 -0.119 0.90566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264.5 on 78 degrees of freedom
## Multiple R-squared: 0.5408, Adjusted R-squared: 0.5173
## F-statistic: 22.97 on 4 and 78 DF, p-value: 0.000000000001449
Given the values of the other explanatory variables unchanged, apartments with parking are on average 168.921 EUR per m2 more expensive than those without (p < 0.01).
We cannot interpret the other categorical variable (Balcony) because its coefficient is not statistically significant (p > 0.05).
F-statistic
H0: All explanatory coefficients = 0
H1: At least one explanatory coefficient ≠ 0
We reject H0 and accept H1 (p < 0.001): at least one explanatory variable has a significant effect on the dependent variable.
# Attach fit4's fitted values and residuals, then show them for row 2
apartments[["Fitted"]] <- fitted(fit4)
apartments[["Residuals"]] <- resid(fit4)
apartments[2, c("Fitted", "Residuals")]
## # A tibble: 1 × 2
## Fitted Residuals
## <dbl> <dbl>
## 1 2377. 423.
The residual for apartment ID2 is 422.9572