#Importing a data set from Kaggle.com, named "Health and Sleep Statistics".
mydata <- read.table("./- HealthSleepStatistics.csv", header = TRUE, sep = ",", dec = ".")
#Header = TRUE: means that the file contains the names of the variables in the first row.
#Sep = ",": means that values on each line are separated by a comma.
#Dec = ".": means that decimal points are indicated by a dot.
head(mydata, 10) #Function "head" automatically shows 6 first rows, however I set it to show 10 first rows.
## User.ID Age Gender Sleep.Quality Bedtime Wake.up.Time Daily.Steps
## 1 1 25 f 8 23:00 06:30 8000
## 2 2 34 m 7 00:30 07:00 NA
## 3 3 29 f 9 22:45 06:45 9000
## 4 4 41 m 5 01:00 06:30 4000
## 5 5 22 f 8 23:30 07:00 10000
## 6 6 37 m 6 00:15 07:15 6000
## 7 7 30 f 8 22:30 06:00 NA
## 8 8 45 m 4 01:30 07:00 3000
## 9 9 27 f 9 23:00 07:30 9500
## 10 10 32 m 7 00:45 07:15 6500
## Calories.Burned Dietary.Habits
## 1 2500 healthy
## 2 2200 unhealthy
## 3 2700 healthy
## 4 2100 unhealthy
## 5 2800 medium
## 6 2300 unhealthy
## 7 2600 healthy
## 8 2000 unhealthy
## 9 2750 healthy
## 10 2400 medium
NA - missing data for daily steps for some participants.
#Basic description of the data frame structure.
#Shows the number of units and variables, and types of variables (int = integer / full number, chr = character / string).
#We can see that there are 50 objects in the table, and 9 variables.
str(mydata)
## 'data.frame': 50 obs. of 9 variables:
## $ User.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 25 34 29 41 22 37 30 45 27 32 ...
## $ Gender : chr "f" "m" "f" "m" ...
## $ Sleep.Quality : int 8 7 9 5 8 6 8 4 9 7 ...
## $ Bedtime : chr "23:00" "00:30" "22:45" "01:00" ...
## $ Wake.up.Time : chr "06:30" "07:00" "06:45" "06:30" ...
## $ Daily.Steps : int 8000 NA 9000 4000 10000 6000 NA 3000 9500 6500 ...
## $ Calories.Burned: int 2500 2200 2700 2100 2800 2300 2600 2000 2750 2400 ...
## $ Dietary.Habits : chr "healthy" "unhealthy" "healthy" "unhealthy" ...
#Changing the depiction of Gender from abbreviations "m" and "f" to words "Male" and "Female":
mydata$Gender <- factor(mydata$Gender,
levels = c("m", "f"),
labels = c("Male", "Female"))
contrasts(mydata$Gender) #Showing what is behind the data.
## Female
## Male 0
## Female 1
#Renaming some columns titles in order to make the data frame neater:
colnames(mydata)[1] <- ("ID") #Its previous name was User.ID.
colnames(mydata)[4] <- ("Sleep") #Its previous name was Sleep.Quality.
colnames(mydata)[6] <- ("Wakeup") #Its previous name was Wake.up.Time.
colnames(mydata)[7] <- ("Steps") #Its previous name was Daily.Steps.
colnames(mydata)[8] <- ("Calories") #Its previous name was Calories.Burned.
colnames(mydata)[9] <- ("Diet") #Its previous name was Dietary.Habits.
head(mydata)
## ID Age Gender Sleep Bedtime Wakeup Steps Calories Diet
## 1 1 25 Female 8 23:00 06:30 8000 2500 healthy
## 2 2 34 Male 7 00:30 07:00 NA 2200 unhealthy
## 3 3 29 Female 9 22:45 06:45 9000 2700 healthy
## 4 4 41 Male 5 01:00 06:30 4000 2100 unhealthy
## 5 5 22 Female 8 23:30 07:00 10000 2800 medium
## 6 6 37 Male 6 00:15 07:15 6000 2300 unhealthy
#Some of the participants miss information about the number of Steps taken.
#I will remove missing numbers (NA) using drop_na function.
#install.packages("tidyr") #Installing the package.
library(tidyr) #Activating the package.
mydata <- drop_na(mydata)
#Creating a new variable "Weight", by assigning each individual a random number on a range from 50 to 120 kg.
mydata$Weight <- runif(44, min = 50, max = 120)
#Creating a new variable called "Activity", that will show the activity level of each individual from the information about the average number of steps per day. If a person walked less than 8000 steps, the activity is "low", and if more than or equal to 8000 steps, the activity is "high":
mydata$Activity <- ifelse(mydata$Steps < 8000, "low", "high")
head(mydata)
## ID Age Gender Sleep Bedtime Wakeup Steps Calories Diet Weight
## 1 1 25 Female 8 23:00 06:30 8000 2500 healthy 63.67558
## 2 3 29 Female 9 22:45 06:45 9000 2700 healthy 104.86989
## 3 4 41 Male 5 01:00 06:30 4000 2100 unhealthy 113.05510
## 4 5 22 Female 8 23:30 07:00 10000 2800 medium 74.84734
## 5 6 37 Male 6 00:15 07:15 6000 2300 unhealthy 50.88947
## 6 8 45 Male 4 01:30 07:00 3000 2000 unhealthy 59.62275
## Activity
## 1 high
## 2 high
## 3 low
## 4 high
## 5 low
## 6 low
#Creating a new data frame, based on some conditions.
#I get rid of the columns "Bedtime" and "Wake-up", and also get rid of the first 4 rows.
data <- mydata[ -c(1:4), -c(5,6)]
head(data)
## ID Age Gender Sleep Steps Calories Diet Weight Activity
## 5 6 37 Male 6 6000 2300 unhealthy 50.88947 low
## 6 8 45 Male 4 3000 2000 unhealthy 59.62275 low
## 7 9 27 Female 9 9500 2750 healthy 66.72622 high
## 8 10 32 Male 7 6500 2400 medium 93.88181 low
## 9 11 50 Female 5 3500 2100 unhealthy 87.28771 low
## 10 12 23 Male 9 11000 2900 healthy 72.92246 high
#Creating a dataset, made up only of the participants from age 30 till age 40.
#install.packages("dplyr")
library(dplyr)
data1 <- data %>% filter(data$Age >= 30 & data$Age <= 40)
print(data1)
## ID Age Gender Sleep Steps Calories Diet Weight Activity
## 1 6 37 Male 6 6000 2300 unhealthy 50.88947 low
## 2 10 32 Male 7 6500 2400 medium 93.88181 low
## 3 13 36 Female 8 7000 2400 medium 111.01247 low
## 4 16 31 Male 6 6000 2300 unhealthy 102.27121 low
## 5 18 39 Male 5 4000 2100 unhealthy 66.62345 low
## 6 19 33 Female 9 10000 2800 healthy 73.92935 high
## 7 22 35 Male 5 4000 2100 unhealthy 97.49566 low
## 8 23 40 Female 9 9500 2750 healthy 72.69745 high
## 9 25 32 Female 8 8500 2600 medium 109.84104 high
## 10 28 38 Male 7 6500 2400 medium 94.20922 low
## 11 29 31 Female 8 9000 2500 healthy 52.88532 high
## 12 33 37 Female 8 8500 2600 medium 64.42292 high
## 13 36 40 Male 7 6000 2300 unhealthy 103.73815 low
## 14 37 35 Female 8 9000 2500 medium 53.61492 high
## 15 41 30 Female 8 8500 2600 medium 102.00041 high
## 16 43 34 Female 9 10000 2750 healthy 103.00768 high
## 17 45 33 Female 8 9000 2500 medium 60.57023 high
## 18 49 32 Female 8 8500 2600 medium 92.80093 high
#Ordering a data frame descending by the number of Steps taken. And units with the same number of steps taken are sorted in descending order by the Calorie intake.
print(data[order(-data$Steps, -data$Calories), ])
## ID Age Gender Sleep Steps Calories Diet Weight Activity
## 10 12 23 Male 9 11000 2900 healthy 72.92246 high
## 28 31 24 Female 9 10500 2900 healthy 51.24709 high
## 34 39 26 Female 9 10500 2900 healthy 114.64544 high
## 42 47 28 Female 9 10500 2900 healthy 54.08065 high
## 17 19 33 Female 9 10000 2800 healthy 73.92935 high
## 24 27 27 Female 9 10000 2800 healthy 62.41206 high
## 31 35 28 Female 9 10000 2750 healthy 57.51187 high
## 38 43 34 Female 9 10000 2750 healthy 103.00768 high
## 7 9 27 Female 9 9500 2750 healthy 66.72622 high
## 21 23 40 Female 9 9500 2750 healthy 72.69745 high
## 13 15 28 Female 9 9500 2700 healthy 90.24973 high
## 19 21 29 Female 8 9000 2600 healthy 75.43202 high
## 26 29 31 Female 8 9000 2500 healthy 52.88532 high
## 33 37 35 Female 8 9000 2500 medium 53.61492 high
## 40 45 33 Female 8 9000 2500 medium 60.57023 high
## 22 25 32 Female 8 8500 2600 medium 109.84104 high
## 29 33 37 Female 8 8500 2600 medium 64.42292 high
## 36 41 30 Female 8 8500 2600 medium 102.00041 high
## 43 49 32 Female 8 8500 2600 medium 92.80093 high
## 15 17 26 Female 8 8500 2500 medium 113.15480 high
## 11 13 36 Female 8 7000 2400 medium 111.01247 low
## 8 10 32 Male 7 6500 2400 medium 93.88181 low
## 25 28 38 Male 7 6500 2400 medium 94.20922 low
## 5 6 37 Male 6 6000 2300 unhealthy 50.88947 low
## 14 16 31 Male 6 6000 2300 unhealthy 102.27121 low
## 32 36 40 Male 7 6000 2300 unhealthy 103.73815 low
## 39 44 48 Male 7 6000 2300 unhealthy 77.74197 low
## 18 20 42 Male 7 5500 2400 medium 112.12231 low
## 35 40 41 Male 6 5000 2200 unhealthy 98.67881 low
## 16 18 39 Male 5 4000 2100 unhealthy 66.62345 low
## 20 22 35 Male 5 4000 2100 unhealthy 97.49566 low
## 9 11 50 Female 5 3500 2100 unhealthy 87.28771 low
## 27 30 49 Male 5 3500 2100 unhealthy 81.00960 low
## 41 46 47 Male 5 3500 2100 unhealthy 102.09491 low
## 6 8 45 Male 4 3000 2000 unhealthy 59.62275 low
## 12 14 48 Male 4 3000 2000 unhealthy 54.09000 low
## 23 26 44 Male 4 3000 2000 unhealthy 57.57180 low
## 30 34 46 Male 4 3000 2000 unhealthy 58.40319 low
## 37 42 44 Male 4 3000 2000 unhealthy 63.54617 low
## 44 50 46 Male 4 3000 2000 unhealthy 57.83157 low
#For the further statistical analysis of data, I get rid of all non-numerical variables, and create a new data frame called "clear.data".
clear.data <- data[ , -c(3,7,9)] #Remove columns number 3, 7, 9.
head(clear.data)
## ID Age Sleep Steps Calories Weight
## 5 6 37 6 6000 2300 50.88947
## 6 8 45 4 3000 2000 59.62275
## 7 9 27 9 9500 2750 66.72622
## 8 10 32 7 6500 2400 93.88181
## 9 11 50 5 3500 2100 87.28771
## 10 12 23 9 11000 2900 72.92246
Some of the basic calculations of statistical parameters for the variable “Age”:
mean(clear.data$Age) #Mean = primarily measure of central location, the average result.
## [1] 36.025
–> The average age of all participants is 36 years old.
median(clear.data$Age) #Median = the middle number of the dataset.
## [1] 35
–> Half of participants is younger than 35 years old, and half is older than 35 years old.
var(clear.data$Age) #Variance = the average from the mean; how far the outcome varies from the mean.
## [1] 60.64038
sd(clear.data$Age) #Standard deviation = spread around the mean; how far the standard deviation is from the expected value.
## [1] 7.787194
–> On average, the ages of participants deviate from the mean age by about 7.78 years. So, since the mean is 36, most participants are likely to have ages in the range of 28 to 44 years old.
quantile(clear.data$Age, p=0.25) #First quartile or 25th percentile.
## 25%
## 29.75
quantile(clear.data$Age, p=0.5) #Second quartile or 50th percentile, or median.
## 50%
## 35
quantile(clear.data$Age, p=0.75) #Third quartile or 75th percentile.
## 75%
## 42.5
#Calculate mode (the most common value) for all variables in the dataset, using "mlv" function:
#install.packages("modeest")
library(modeest)
sapply(clear.data[ , -1], FUN = mlv) #Calculating mode for all variables except ID.
## $Age
## [1] 28 32
##
## $Sleep
## [1] 9
##
## $Steps
## [1] 3000
##
## $Calories
## [1] 2000
##
## $Weight
## [1] 60.57995
#"Summary" - most commonly used function for statistical description of a data frame:
summary(clear.data[ , -1]) #Not including ID in the analysis.
## Age Sleep Steps Calories
## Min. :23.00 Min. :4.000 Min. : 3000 Min. :2000
## 1st Qu.:29.75 1st Qu.:5.000 1st Qu.: 4000 1st Qu.:2100
## Median :35.00 Median :8.000 Median : 7750 Median :2450
## Mean :36.02 Mean :7.025 Mean : 7012 Mean :2435
## 3rd Qu.:42.50 3rd Qu.:9.000 3rd Qu.: 9500 3rd Qu.:2712
## Max. :50.00 Max. :9.000 Max. :11000 Max. :2900
## Weight
## Min. : 50.89
## 1st Qu.: 59.32
## Median : 74.68
## Mean : 79.36
## 3rd Qu.: 99.51
## Max. :114.65
#"Describe" - another statistical description of the data for numerical variables:
#install.packages("psych")
library(psych)
describe(clear.data[ , -1])
## vars n mean sd median trimmed mad min max
## Age 1 40 36.02 7.79 35.00 35.84 9.64 23.00 50.00
## Sleep 2 40 7.03 1.83 8.00 7.16 1.48 4.00 9.00
## Steps 3 40 7012.50 2756.08 7750.00 7062.50 3335.85 3000.00 11000.00
## Calories 4 40 2435.00 303.02 2450.00 2431.25 444.78 2000.00 2900.00
## Weight 5 40 79.36 21.28 74.68 78.58 27.67 50.89 114.65
## range skew kurtosis se
## Age 27.00 0.20 -1.23 1.23
## Sleep 5.00 -0.47 -1.30 0.29
## Steps 8000.00 -0.24 -1.52 435.77
## Calories 900.00 -0.03 -1.36 47.91
## Weight 63.76 0.21 -1.52 3.36
#"Stat.desc" - another statistical description of the data for numerical variables:
#install.packages("pastecs")
library(pastecs)
round(stat.desc(clear.data[ , -1]), 1) #Round data to one decimal point.
## Age Sleep Steps Calories Weight
## nbr.val 40.0 40.0 40.0 40.0 40.0
## nbr.null 0.0 0.0 0.0 0.0 0.0
## nbr.na 0.0 0.0 0.0 0.0 0.0
## min 23.0 4.0 3000.0 2000.0 50.9
## max 50.0 9.0 11000.0 2900.0 114.6
## range 27.0 5.0 8000.0 900.0 63.8
## sum 1441.0 281.0 280500.0 97400.0 3174.3
## median 35.0 8.0 7750.0 2450.0 74.7
## mean 36.0 7.0 7012.5 2435.0 79.4
## SE.mean 1.2 0.3 435.8 47.9 3.4
## CI.mean.0.95 2.5 0.6 881.4 96.9 6.8
## var 60.6 3.4 7595993.6 91820.5 452.6
## std.dev 7.8 1.8 2756.1 303.0 21.3
## coef.var 0.2 0.3 0.4 0.1 0.3
Explaining some of the sample statistics for the analysed dataset:
Minimum and Maximum of Age: the youngest person in the sample is 23 years old, and the oldest is 50 years old.
Mean of Steps: sample mean - the average number of steps taken by participants of this dataset is 7012 steps.
Comparing Mean and Median of Steps: mean = 7012, median = 7750. Since the numbers differ, and mean is less than (to the left of) the median, then the dataset for Steps should be skewed to the left. However, the difference is not very significant, so it is not likely that there are outliers.
Range of Weight: the maximum difference between the weights of two participants is 68.3 kg.
Third quantile of Calories: approximately 75 percent of participants burn less than 2712 kcal on average. And approximately 25 percent of participants burn more than 2712 kcal on average.
MAD of Weight: mean absolute deviation - on average, the weights are spread out or deviate from the mean weight by 19.97 kilograms. This MAD is considered moderate to high, meaning there is a great variation among all weight measurements for the participants.
Skew of Sleep: skewness of sleep is -0.47, meaning that there is a slight negative skewness of the data (just below zero) and fairly symmetrical distribution of data, considering that skewness can vary from -1 to +1.
Kurtosis for Steps: the kurtosis value is -1.52, hence it can be assumed that this is excess kurtosis. It indicates that the data is platykurtic - with ligher tails (evenly spread out) and flatter peak (fewer extreme values), compared to normal distribution. There should not be many or large outliers. So the data is fairly well-behaved and neat.
SE.mean of Calories: standard error of the mean 47.9 tells how much the sample mean could vary if the samples of 40 people are repeatedly taken. Since sample mean is high (2435), the SE.mean suggests the sample mean is quite a good estimate of the population mean.
#Histogram for the distribution of Calories burned.
hist(clear.data$Calories,
main = "Distribution of the Calories burned",
ylab = "Frequency", #Y-axis name
xlab = "Calories burned", #X-axis name.
breaks = seq(2000, 3000, 100), #Range from 2000 to 3000 kcal, with bin width of 100.
right = FALSE,
col="lightpink", border = "lightblue4") #Adding colors.
Explanation of the graph:
#Histogram for the distribution of Steps walked.
#install.packages("ggplot2")
library(ggplot2)
ggplot(clear.data, aes(x = Steps),) + #Determine x axis.
geom_histogram(binwidth = 1000, colour = "dodgerblue4", fill = "aquamarine3") + #Determine shape (histogram), width, and color of the graph.
theme_gray() + #Set the background theme for grey.
ggtitle("Distribution of steps walked") #Setting a title.
Explanation of the graph:
#Histogram for the Steps walked according to Gender.
library(ggplot2)
ggplot(data, aes(x = Steps, fill = Gender)) + #Setting division by gender.
geom_histogram(position = "dodge", binwidth = 1000, colour = "lightgray") + #Dodge to avoid overplotting.
ylab("Frequency") +
labs(fill = "Gender") +
scale_fill_brewer(palette="Set1") + #Setting the color palette.
ggtitle("Distribution of steps walked according to gender")
Explanation of the graph:
#Boxplot for the Steps walked according to Gender.
ggplot(data, aes(x=Steps, y=Gender, fill=Gender)) +
geom_boxplot() +
xlab("Steps") +
labs(fill="Gender") +
scale_fill_brewer(palette="Set1") +
ggtitle("Distribution of steps walked according to gender")
Explanation of the graph:
#Scatter plot that shows a possible correlation between Sleep quality and Steps taken.
#install.packages("car")
library(car)
scatterplot(y = clear.data$Sleep,
x = clear.data$Steps,
ylab = "Sleep quality (1-10)",
xlab = "Step count daily (number of steps)",
smooth = FALSE, #It is possible also to remove boxplots by writing 'boxplots=FALSE.
main = "Correlation between quality of sleep and steps taken",
col = "slateblue1")
#Another way to build scatterplot.
library(ggplot2)
ggplot(clear.data, aes(x=Sleep, y=Steps)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, colour = "hotpink1")
## `geom_smooth()` using formula = 'y ~ x'
Explanation of the scatterplot graphs:
#Matrix of scatter plots for the dataset.
library(car)
scatterplotMatrix(clear.data[,-c(1,6)], #Removed ID (doesn't make sense to analyze) and Weight (it was previously created with random numbers, so it will not show any meaningful correlations).
smooth = FALSE)
Explanation of the scatterplot matrix:
It shows how each variable relates to the other variables in the dataset by using scatter plots for each pair of variables, as well as the undividual distributions along the diagonal.
Age has negative correlations with Sleep quality, Steps, and Calories burned, meaning older individuals in the dataset tend to be less active and report lower sleep quality.
Steps and Calories are positively correlated with each other and with Sleep quality, indicating that more physical activity tends to be associated with better sleep.
You have a data set for 100 MBA students of the current generation. In the previous year, the average grade in this program was 74.
#Read Excel file with data:
#install.packages("readxl")
library(readxl)
business_school <- read_xlsx("~/Library/Mobile Documents/com~apple~CloudDocs/R-STUDIO/BOOTCAMP 2024/R DATA/R Take Home Exam 2024/Task 2/Business School.xlsx")
business_school <- as.data.frame(business_school)
head(business_school)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
TASK: Graph the distribution of undergrad degrees using the ggplot function. Which degree is the most common?
#Building box plot graph for Undergrad Degree distribution:
library(ggplot2)
ggplot(business_school, aes(x = `Undergrad Degree`),) +
geom_bar(colour = "red3", fill = "deeppink") +
xlab("Undergrad Degree") +
ylab("Student Count") +
ggtitle("Distribution of undergrad degrees") +
theme_light()
Explanation of the graph:
This is a bar chart graph, that shows the distribution of undergrad degrees, named: Art, Business, Computer science, Engineering, Finance, located on the x-axis. The y-axis shows the number of students enrolled into each degree.
The most common degree is “Business”.
TASK: Show the descriptive statistics of the Annual Salary and its distribution with the histogram (ggplot). Describe the distribution.
#Creating descriptive statistics of the Annual Salary, using "summary" function:
summary(business_school[ , 9])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
#Creating descriptive statistics of the Annual Salary, using "describe" function:
library(psych)
describe(business_school[ , 9])
## vars n mean sd median trimmed mad min max range skew
## X1 1 100 109058 41501.49 103500 104600.2 25945.5 20000 340000 320000 2.22
## kurtosis se
## X1 9.41 4150.15
#Creating descriptive statistics of the Annual Salary, using "stat.desc" function (rounding to 1 decimal point):
library(pastecs)
round(stat.desc(business_school[ , 9]), 1)
## nbr.val nbr.null nbr.na min max range
## 100.0 0.0 0.0 20000.0 340000.0 320000.0
## sum median mean SE.mean CI.mean.0.95 var
## 10905800.0 103500.0 109058.0 4150.1 8234.8 1722373474.7
## std.dev coef.var
## 41501.5 0.4
#Building histogram graph for the Annual Salary:
library(ggplot2)
ggplot(business_school, aes(x = `Annual Salary`),) +
geom_histogram(binwidth = 10000, colour = "darkslategrey", fill = "darkseagreen1") +
xlab("Annual salary (EUR)") +
ylab("Count (students)") +
ggtitle("Distribution of annual salary") +
theme_light()
Explanation of the graph:
On x-axis, labels “0e+00”, “1e+05”, etc., represent notation for salary values, where 1e+05 translates to 100,000 EUR, 2e+05 translates to 200,000 EUR, and so on.
Unimodal graph (single peak).
Sharp peak, which indicates positive kurtosis (lepkurtosis).
Positive skewness (skew to the right), where the right tail is longer (heavy tail), and most values are concentrated on the left side.
The highest concentration of students earns annual salaries around 100,000 EUR.
Data seems to have high variance, considering the long tails. There are a few high-earners, and also some students that have very low salaries.
There are some outliers.
TASK: Test the following hypothesis: 𝐻0: 𝜇MBA Grade = 74. Explain the result and interpret the effect size.
#One-sample t-test, designed to determine whether mean of MBA Grade is 74.
#H0: 𝜇MBA Grade = 74.
#H1: 𝜇MBA Grade ≠ 74.
t.test(business_school$`MBA Grade`,
mu = 74, #H0
alternative = "two.sided") #H1
##
## One Sample t-test
##
## data: business_school$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
Explanation of the result:
t = 2.6587, which shows how far the sample mean is from the hypothesized mean of 74 in terms of standard error.
df (degrees of freedom) = 99. This number is derived from the sample size of 100, since the formula for df is (n-1).
p-value = 0.00915, which is the measure of the probability that the observed data would occur if the H0 is true. Since p-value is lower than L of 0.05 (significance threshold), we reject the null hypothesis in favor of the alternative hypothesis: “true mean is not equal to 74”.
confidence interval = 95%, means that we are 95% confident that the true mean lies within the interval from 74.51764 to 77.66346. The suggested by H0 number 74 is not included in this range.
mean of x = 76.04055, meaning that the mean for MBA Grades is approximately 76.
#Calculating the effect size, using Cohen's D statistics, that determines the standardized difference between two means (here: actual and predicted by H0).
#install.packages("effectsize")
library(effectsize)
effectsize::cohens_d(business_school$`MBA Grade`,
mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
#Interpreting the effect size:
effectsize::interpret_cohens_d(0.27, rules = "sawilowsky2009") #Using Sawilowsky (2009) rules of thumb.
## [1] "small"
## (Rules: sawilowsky2009)
Explanation:
You analyze the price per m2 for a sample of apartments in the Ljubljana region (Apartments.xlsx in Task 3 folder). Follow the questions in R Markdown included in the folder.
#install.packages("readxl")
library(readxl)
apartments <- read_xlsx("~/Library/Mobile Documents/com~apple~CloudDocs/R-STUDIO/BOOTCAMP 2024/R DATA/R Take Home Exam 2024/Task 3/Apartments.xlsx")
apartments <- as.data.frame(apartments)
head(apartments, 5)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
Description:
apartments$Parking <- factor(apartments$Parking,
levels = c (0, 1),
labels = c ("no", "yes"))
apartments$Balcony <- factor(apartments$Balcony,
levels = c (0, 1),
labels = c ("no", "yes"))
head(apartments, 5)
## Age Distance Price Parking Balcony
## 1 7 28 1640 no yes
## 2 18 1 2800 yes no
## 3 7 28 1660 no no
## 4 28 29 1850 no yes
## 5 18 18 1640 yes yes
#One-sample t-test.
#H0: 𝜇Price = 1990 eur.
#H1: 𝜇Price ≠ 1990 eur.
t.test(apartments$Price,
mu = 1990,
alternative = "two.sided")
##
## One Sample t-test
##
## data: apartments$Price
## t = 0.70618, df = 84, p-value = 0.482
## alternative hypothesis: true mean is not equal to 1990
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Conclusion:
P-value of 0.482 is very high compared to L of 0.05, meaning that deviation from the H0 is not significant, and there is weak evidence against the H0. Moreover, confidence interval includes value of 1990. Hence we cannot reject the null hypothesis.
The result of the t-test indicates that there is no significant difference between the mean for apartment prices and suggested by H0 mean of 1990.
However, p-value could be high due to small sample size, low effect size, high data variability, or other violations that inflate p-value.
#install.packages("ggplot2")
library(ggplot2)
ggplot(apartments, aes(x=Price, y=Age)) +
geom_point() + #Building a scatterplot.
geom_smooth(method = "lm", se = FALSE, colour = "deepskyblue2") +
ylab("Price per m^2") +
xlab("Age in years") +
ggtitle("Simple regression function: Price = f(Age)")
#Insert data as linear model (lm), with Price being dependent variable, and Age being independent variable.
fit1 <- lm(Price ~ Age, data = apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(apartments$Price, apartments$Age) #Pearson correlation coefficient.
## [1] -0.230255
Explanation of the results:
Intercept: The predicted value of Price, when independent variable Age is zero, is 2185.455 (p<0.001).
Estimate of regression coefficient (Age): If age of a flat rises by 1 year, the price of a flat decreases by approximately 8.975 euros per m2 on average, while holding other variables constant. The p-value for Age is very small: p=0.034 (less than L=0.05), we can conclude that we found the effect of Age on Price.
Coefficient of correlation: Pearson correlation coefficient measures direction and strength of correlation, and can vary from -1 to +1, from perfect negative to perfect positive correlation, with 0 indicating the absence of linear relationship. In this case, coef. of correlation equals approximately -0.23, which indicates a weak negative correlation between Age of flats and their Prices.
Coefficient of determination: Coefficient of determination, or the Multiple R-Squared indicator, can range from 0 to 1, with 0 meaning that the chosen independent variable explains nothing, and 1 meaning that the chosen independent variable explains everything for the changes in dependent variable. In this case, coef. of determination is equal to 0.05302, which is close to 0, and only about 5.3% of the variation in apartment Prices is explained by the Age of the apartments. This suggests that Age alone is not a strong predictor of apartment Prices.
#install.packages("car")
library(car)
scatterplotMatrix(apartments[ , c(3,1,2)], smooth = FALSE)
Analysis of the matrix:
Price & Age: There is a weak negative correlation (which aligns with what was found with the previous analysis), with data widely spread.
Price & Distance: There is a strong negative correlation. It means that distance might be a better predictor of price.
Age & Distance: There is no clear correlationship or any relation between data whatsoever, since the line is almost flat, and the data is widely scattered. There is a very low potential chance of multicolinearity between these two variables.
fit2 <- lm(Price ~ Age + Distance, data = apartments)
#Vif = Variance inflation factor.
library(car)
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
Explanation of multicolinearity:
Theory: VIF statistic should be less than 5, while the average VIF statistic should be close to 1, in order to prove absence of high level of multicolinearity.
For Age and Distance, the calculated values are low. And mean or average VIF is very close to 1.
These explanetary variables are not linked to each other, and there is no multicolinearity for Age and Distance.
Examine the assumption of the normal distribution of errors.
apartments$StdResid <- round(rstandard(fit2), 3) #Standardized residuals, rounded to 3 dec.points.
apartments$CooksD <- round(cooks.distance(fit2), 3) #Cooks distances, rounded to 3 dec.points.
#Graph for standardized residuals:
hist(apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals",
col = "cadetblue3")
#Performing Shapiro-Wilk normality test to check normal distribution of errors.
shapiro.test(apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: apartments$StdResid
## W = 0.95303, p-value = 0.003645
Explanation of Standardized Residuals analysis:
Theory: Standardized residuals are a way to assess the quality of a regression model by measuring how far each observed value is from the predicted value, in terms of the standard deviation of the residuals.
The histogram should ideally follow a bell-shaped curve. However, on the graph, standardized residuals are asymmetrically skewed to the left, and there are two peaks. This implies that the distribution is not normal.
The histogram shows that values are in the required range from -3 to +3. Therefore, there is no need to remove any outliers.
The Shapiro-Wilk normality test shows that p=value is lower than L=0.05, hence we reject the null hypothesis that the residuals are normally distributed, which confirms that the residuals deviate from a normal distribution.
However, sample size is over 30, so this violation is not likely to significantly affect the results.
#Histogram for Cooks distances:
hist(apartments$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances",
col = "cadetblue3")
Explanation of Cooks distances analysis:
Theory: The Cook’s distances show if there are units with high impact on estimated regression function, which need to be removed. The higher the number, starting from 0, the higher is the effect.
We can see that at least 1 unit has a value that is to some extent greater than the values of other units. This can be seen by a gap between 0.10 and 0.30 in the histogram.
#Showing ten units with highest value of Cook's distance.
head(apartments[order(-apartments$CooksD), c("StdResid", "CooksD")], 10)
## StdResid CooksD
## 38 2.577 0.320
## 55 1.445 0.104
## 33 2.051 0.069
## 53 -2.152 0.066
## 22 1.576 0.061
## 39 1.091 0.038
## 58 1.655 0.037
## 25 1.571 0.034
## 57 1.601 0.032
## 2 1.783 0.030
By printing the first 10 units with the highest Cook’s distances, we can see that the units 38 and 55 have a significantly higher impact. Moreover, units 53 and 33 are also slightly higher, so we decide to remove them as well.
apartments <- apartments[-c(38, 55, 53, 33),]
Now we check the Standardized Residuals and Cook’s Distances once again, to see if the issues have been solved.
fit2 <- lm(Price ~ Age + Distance, data = apartments)
apartments$StdResid <- round(rstandard(fit2), 3)
apartments$CooksD <- round(cooks.distance(fit2), 3)
head(apartments[order(-apartments$CooksD), c("StdResid", "CooksD")], 10)
## StdResid CooksD
## 22 1.597 0.069
## 25 1.998 0.064
## 31 1.113 0.056
## 58 1.721 0.046
## 57 1.641 0.039
## 39 1.039 0.038
## 46 1.865 0.035
## 78 1.865 0.035
## 2 1.827 0.034
## 61 1.827 0.034
#Graph for standardized residuals:
hist(apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals",
col = "coral1")
#Histogram for Cooks distances:
hist(apartments$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances",
col = "coral1")
#Storing the standardized fitted values:
apartments$StdFitted <- scale(fit2$fitted.values) #Creating fitted values.
#Plotting standardized fitted values against the standardized residuals:
#install.packages("car")
library(car)
scatterplot(y = apartments$StdResid, x = apartments$StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
Explanation of the scatterplot:
Theory: The points must be randomly distributed in a horizontal band of constant variability, with no curvature (which would indicate the problem of non-linearity). If the variability changes, this is called heteroscedasticity, which affects the reliability of the estimated standard errors of the regression coefficients, and lead to a need to use robust standard errors.
In this case, some non-constant variance and variation in the spread of residuals can be seen, with smaller spread on the left side, and significantly increased spread on the right side.
Hence, we cannot assume homoscedasticity, based on this graph.
We use formal test for heteroscedasticity - Breuch-Pagan test:
#install.packages("olsrr")
library(olsrr)
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 1.64762
## Prob > Chi2 = 0.1992832
Explanation of the results:
fit2 <- lm(Price ~ Age + Distance, data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -408.72 -217.93 -38.86 220.60 506.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2492.715 75.544 32.997 < 2e-16 ***
## Age -7.496 3.169 -2.365 0.0205 *
## Distance -24.574 2.700 -9.100 6.9e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 259.4 on 78 degrees of freedom
## Multiple R-squared: 0.5324, Adjusted R-squared: 0.5204
## F-statistic: 44.4 on 2 and 78 DF, p-value: 1.335e-13
Explanation of results:
Intercept: The predicted value of Price, when all independent variables are zero, is 2492 (p<0.001).
Estimate of regression coefficient (Age): If age of a flat rises by 1 year, the price of a flat decreases by approximately 7.496 euros per m2 on average (p<0.05), assuming that the other explanatory variables remain unchanged.
Estimate of regression coefficient (Distance): If distance of a flat from the city center increases by 1 kilometer, the price of a flat decreases by approximately 24.574 euros per m2 on average (p<0.001), assuming that the other explanatory variables remain unchanged.
Coefficient of determination, or Multiple R-Squared: It is equal to 0.5324, which means that about 53% of the variation in apartment Prices is explained by the Age and Distance from the center. This suggests that Age and Distance are quite strong predictors of apartment Prices, but they do not explain everything.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments)
#Comparing fit2 and fit3 to test if the second model has a better fit than the first (simpler) model.
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 78 5248925
## 2 76 4915944 2 332981 2.5739 0.08287 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Explanation of results:
RSS: Residual sum of squares is larger for the fit3 model, indicating inclusion of more variables, and hence a better fit. Sum of squares shows the difference between two RSS.
Df: Degrees of freedom show that fit3 model has two more variables.
F: F-statistic tests whether adding Parking and Balcony improves the fit.
Pr(>F): P-value is associated with F-statistic and tells the probability of better fit occurring due to chance. P-value (p=0.08) is greater than 0.05, meaning that we fail to reject H0. Therefore, fit3 model does not significantly improve the prediction of the price, at 5% significance level.
However, if we would test at significance level of L=10%, then we would reject H0, and conclude that fit3 model explains Price more than fit2.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -403.16 -204.08 -41.24 239.52 528.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2372.693 93.344 25.419 < 2e-16 ***
## Age -6.906 3.118 -2.215 0.0298 *
## Distance -22.254 2.839 -7.837 2.25e-11 ***
## Parkingyes 137.479 60.854 2.259 0.0267 *
## Balconyyes 17.234 57.099 0.302 0.7636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 254.3 on 76 degrees of freedom
## Multiple R-squared: 0.5621, Adjusted R-squared: 0.539
## F-statistic: 24.38 on 4 and 76 DF, p-value: 5.29e-13
sqrt(summary(fit3)$r.squared) #Multiple correlation coefficient.
## [1] 0.7497015
Explanation of results:
Intercept: The predicted value of Price, when all independent variables are zero, is 2372 (p<0.001).
Estimate of regression coefficient (Parking-yes): If a flat has a parking, the price of the flat increases on average by 137.479 euros per m2, holding other variables constant (p<0.05).
Estimate of regression coefficient (Balcony-yes): This estimate does not have a statistically significant effect on price of apartments, because p-value is higher than 0.05.
Coefficient of determination, or Multiple R-squared: It is equal to 0.5621, which means that about 56% of the variation in apartment Prices is explained by the Age, Distance from the center, and Parking.
Hypothesis tested with F-statistic: H0: All regression coefficients are equal to zero, and the model has no predictive power. H1: At least one regression coefficient is statistically significant in explaining the price. Since p-value of F-statistic is 5.29*10^-13 (p<0.001), we reject H0. We conclude that at least one predictor is significant in explaining the price of apartments, and the model has predictive power.
Multiple correlation coefficient: It is obtained by calculating the square root of the multiple coefficient of determination, and it shows that the linear correlation between price, age, distance and parking is strong. It means that the model has explanatory power.
apartments$FittedValues <- fitted.values(fit3)
apartments$Residuals <- residuals(fit3)
#Retrieve Price, Fitted values, and Residuals in a matrix-like object.
head(apartments[ , colnames(apartments) %in% c("Price", "FittedValues", "Residuals")])
## Price FittedValues Residuals
## 1 1640 1718.475 -78.47475
## 2 2800 2363.611 436.38945
## 3 1660 1701.241 -41.24101
## 4 1850 1551.196 298.80422
## 5 1640 2002.527 -362.52739
## 6 1770 1929.513 -159.51268
Explanation of results:
Manually, fitted value for ID2 would be calculated this way: