data(package = .packages(all.available = TRUE)) #List the data sets available in all installed packages
library(reshape2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
mydata <- force(tips) #Load the tips data set from reshape2
head(mydata, 10)
## total_bill tip sex smoker day time size
## 1 16.99 1.01 Female No Sun Dinner 2
## 2 10.34 1.66 Male No Sun Dinner 3
## 3 21.01 3.50 Male No Sun Dinner 3
## 4 23.68 3.31 Male No Sun Dinner 2
## 5 24.59 3.61 Female No Sun Dinner 4
## 6 25.29 4.71 Male No Sun Dinner 4
## 7 8.77 2.00 Male No Sun Dinner 2
## 8 26.88 3.12 Male No Sun Dinner 4
## 9 15.04 1.96 Male No Sun Dinner 2
## 10 14.78 3.23 Male No Sun Dinner 2
This data set contains information about tips received by one waiter over a period of a few months while working in one restaurant. He collected 244 tips (observations) and recorded 7 variables.
mydata2 <- mydata[ , -5] #Delete fifth column
colnames(mydata2) <- c("TotalBill", "Tip", "Gender", "Smoker", "Meal", "People") #Change names of variables
mydata2$Name <- c(1) #Add a new variable
mydata2$NameF <- factor(mydata2$Name,
levels = c(1),
labels = c("John")) #Rename the new variable
mydata3 <- mydata2[ , -7] #Delete seventh column
library(tidyr)
mydata4 <- drop_na(mydata3) #Delete all data with NA, if any is present
library(psych)
describe(mydata4[ , c(-3, -4, -5, -7)]) #Use function to show descriptive statistics
## vars n mean sd median trimmed mad min max range
## TotalBill 1 244 19.79 8.90 17.8 18.73 7.46 3.07 50.81 47.74
## Tip 2 244 3.00 1.38 2.9 2.84 1.33 1.00 10.00 9.00
## People 3 244 2.57 0.95 2.0 2.42 0.00 1.00 6.00 5.00
## skew kurtosis se
## TotalBill 1.12 1.14 0.57
## Tip 1.45 3.50 0.09
## People 1.43 1.63 0.06
library(pastecs)
##
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
##
## extract
round(stat.desc(mydata4[ , c(-3, -4, -5, -7)]), 2) #Round to two decimal places
## TotalBill Tip People
## nbr.val 244.00 244.00 244.00
## nbr.null 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00
## min 3.07 1.00 1.00
## max 50.81 10.00 6.00
## range 47.74 9.00 5.00
## sum 4827.77 731.58 627.00
## median 17.80 2.90 2.00
## mean 19.79 3.00 2.57
## SE.mean 0.57 0.09 0.06
## CI.mean.0.95 1.12 0.17 0.12
## var 79.25 1.91 0.90
## std.dev 8.90 1.38 0.95
## coef.var 0.45 0.46 0.37
quantile(mydata4$TotalBill, p = 0.75) #Third quartile: 75% of total bills are at or below this value
## 75%
## 24.1275
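Several quantiles can also be requested in a single call; a small optional sketch (my addition, not part of the original analysis):

quantile(mydata4$TotalBill, probs = c(0.25, 0.50, 0.75, 0.90)) #Quartiles and the 90th percentile at once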
mydata5 <- mydata4[mydata4$Gender == "Male", ] #Create new data including only male units
str(mydata5) #Display structure of the data set (M)
## 'data.frame': 157 obs. of 7 variables:
## $ TotalBill: num 10.34 21.01 23.68 25.29 8.77 ...
## $ Tip : num 1.66 3.5 3.31 4.71 2 3.12 1.96 3.23 1.71 1.57 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Smoker : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Meal : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
## $ People : int 3 3 2 4 2 4 2 2 2 2 ...
## $ NameF : Factor w/ 1 level "John": 1 1 1 1 1 1 1 1 1 1 ...
hist(mydata5$Tip,
main = "Distribution of variable Tip",
ylab = "Frequency",
xlab = "Tip",
breaks = seq(1, 10, 0.5),
right = FALSE) #Graphical distribution of male units using a histogram
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(mydata5, aes(x = Tip)) +
geom_histogram(binwidth = 0.5, colour = "cornflowerblue", fill = "cornflowerblue") +
facet_wrap(~Meal, ncol = 1) +
ylab("Frequency") #Graphical distribution of male units by meal
ggplot(mydata5, aes(x = Meal, y = Tip)) +
geom_boxplot() +
xlab("Meal") #Graphical distribution of male units by meal in a boxplot
str(mydata4) #Display structure of the data set (M, F)
## 'data.frame': 244 obs. of 7 variables:
## $ TotalBill: num 17 10.3 21 23.7 24.6 ...
## $ Tip : num 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
## $ Smoker : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Meal : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
## $ People : int 2 3 3 2 4 4 2 4 2 2 ...
## $ NameF : Factor w/ 1 level "John": 1 1 1 1 1 1 1 1 1 1 ...
hist(mydata4$Tip,
main = "Distribution of variable Tip",
ylab = "Frequency",
xlab = "Tip",
breaks = seq(1, 10, 0.5),
right = FALSE) #Graphical distribution of male and female units using a histogram
library(ggplot2)
ggplot(mydata4, aes(x = Tip)) +
geom_histogram(binwidth = 0.5, colour = "darkolivegreen3", fill = "darkolivegreen3") +
facet_wrap(~Meal, ncol = 1) +
ylab("Frequency") #Graphical distribution of male and female units by meal
library(ggplot2)
ggplot(mydata4, aes(x = Tip, fill = Gender)) +
geom_histogram(position = position_dodge(width = 0.3), binwidth = 0.5, colour = "gray") +
facet_wrap(~Meal, ncol = 1) +
ylab("Frequency") +
labs(fill = "Gender") #Graphical distribution of male and female units by meal showing differences between gender
head(mydata4[order(-mydata4$Tip), ], 15) #Show the 15 units with the highest tip values to check for potential outliers
## TotalBill Tip Gender Smoker Meal People NameF
## 171 50.81 10.00 Male Yes Dinner 3 John
## 213 48.33 9.00 Male No Dinner 4 John
## 24 39.42 7.58 Male No Dinner 4 John
## 60 48.27 6.73 Male No Dinner 4 John
## 142 34.30 6.70 Male No Lunch 6 John
## 184 23.17 6.50 Male Yes Dinner 4 John
## 215 28.17 6.50 Female Yes Dinner 3 John
## 48 32.40 6.00 Male No Dinner 4 John
## 240 29.03 5.92 Male No Dinner 3 John
## 89 24.71 5.85 Male No Lunch 2 John
## 182 23.33 5.65 Male Yes Dinner 2 John
## 45 30.40 5.60 Male No Dinner 4 John
## 53 34.81 5.20 Female No Dinner 4 John
## 86 34.83 5.17 Female No Lunch 4 John
## 212 25.89 5.16 Male Yes Dinner 4 John
mydata6 <- mydata4[c(-171, -213, -24), ] #Remove the three highest tips (those above $7)
library(ggplot2)
ggplot(mydata6, aes(x = Tip, fill = Gender)) +
geom_histogram(position = position_dodge(width = 0.3), binwidth = 0.5, colour = "gray") +
facet_wrap(~Meal, ncol = 1) +
ylab("Frequency") +
labs(fill = "Gender") #Graphical distribution of male and female units by meal showing differences between gender with some outliers removed
mydata7 <- read.table("~/Desktop/bootcamp/Exam/Task 2/Body mass.csv",
header = TRUE,
sep = ";",
dec = ",") #Import dataset
head(mydata7, 10)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
## 7 7 62.7
## 8 8 64.5
## 9 9 59.5
## 10 10 68.9
library(pastecs) # Use function to show descriptive statistics and round to two decimal places
round(stat.desc(mydata7[-1]), 2)
## Mass
## nbr.val 50.00
## nbr.null 0.00
## nbr.na 0.00
## min 49.70
## max 83.20
## range 33.50
## sum 3143.80
## median 62.80
## mean 62.88
## SE.mean 0.85
## CI.mean.0.95 1.71
## var 36.14
## std.dev 6.01
## coef.var 0.10
library(ggplot2) #Make a histogram
ggplot(mydata7, aes(x = Mass)) +
theme_bw() +
geom_histogram(binwidth = 4, colour = "red", fill = "coral") +
xlab("Body mass (kg)") +
ylab("Frequency") +
labs(title = "Distribution of body mass of ninth graders of 2021/2022")
t.test(mydata7$Mass, #Perform t-test and explain the results
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata7$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
I tested the following hypotheses with this t-test:
H0: the average body mass of ninth graders in 2021/2022 equals the 2018/2019 average of 59.5 kg.
H1: the average body mass of ninth graders in 2021/2022 differs from 59.5 kg.
I used a two-tailed test because the average body mass in 2021/2022 could be either higher or lower than the 2018/2019 mean. Looking at the histogram first, I could already see that the 2021/2022 mean is most likely higher than the one recorded in 2018/2019.
I conducted the t-test at a 5% significance level (α = 0.05) and obtained a p-value of 0.000234, or about 0.02%. The p-value is smaller than α, so I reject H0 at p = 0.000234 and accept H1. I can conclude that the 2021/2022 mean is not equal to the 2018/2019 mean; in other words, the average body mass of ninth graders in 2021/2022 differs from that of 2018/2019 - in fact, it is higher. The confidence interval also tells me that with 95% confidence the average body mass of all ninth graders in 2021/2022 lies between 61.17 kg and 64.58 kg, which does not include the 2018/2019 average of 59.5 kg.
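As a cross-check, the 95% confidence interval reported by t.test() can be reproduced by hand as mean ± t(0.975, df) × SE; a small sketch (my addition):

m <- mean(mydata7$Mass) #Sample mean
se <- sd(mydata7$Mass) / sqrt(nrow(mydata7)) #Standard error of the mean
tc <- qt(0.975, df = nrow(mydata7) - 1) #Critical t value for a 95% CI
c(lower = m - tc * se, upper = m + tc * se) #Should reproduce 61.17 and 64.58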
#install.packages("effectsize")
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(mydata7$Mass, mu = 59.5)
## Cohen's d | 95% CI
## ------------------------
## 0.56 | [0.26, 0.86]
##
## - Deviation from a difference of 59.5.
Cohen’s d is a statistical measure used in hypothesis testing that quantifies the effect size of a difference between two groups. In my case the effect size, i.e. the difference in average body mass between ninth graders in 2018/2019 and 2021/2022, is medium/moderate. This means that while the difference is statistically significant, its practical significance is moderate. Cohen’s d = 0.56 is positive, which indicates that the average body mass of ninth graders in 2021/2022 is higher than the 2018/2019 reference value.
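For a one-sample comparison, Cohen’s d is simply the difference between the sample mean and the reference value divided by the sample standard deviation; a quick hand computation (my addition) to verify the effectsize result:

(mean(mydata7$Mass) - 59.5) / sd(mydata7$Mass) #Should be close to the reported d of 0.56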
#install.packages("readxl")
library(readxl)
mydata8 <- read_xlsx("~/Desktop/bootcamp/Exam/Task 3/Apartments.xlsx")
mydata8$ParkingF <- factor(mydata8$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
mydata8$BalconyF <- factor(mydata8$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
head(mydata8)
## # A tibble: 6 × 7
## Age Distance Price Parking Balcony ParkingF BalconyF
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 7 28 1640 0 1 No Yes
## 2 18 1 2800 1 0 Yes No
## 3 7 28 1660 0 0 No No
## 4 28 29 1850 0 1 No Yes
## 5 18 18 1640 1 1 Yes Yes
## 6 28 12 1770 0 1 No Yes
t.test(mydata8$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata8$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
From the output above I can conclude that at a 5% significance level (α = 0.05; p < α) the average apartment price is not equal to 1900 eur. The confidence interval also excludes a mean of 1900 eur. With that I can reject H0. I can say with 95% confidence that the true average price of the apartments lies between 1937.44 eur and 2100.44 eur.
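The t statistic itself can be reproduced by hand as (sample mean - hypothesized mean) / SE; a small sketch (my addition):

(mean(mydata8$Price) - 1900) / (sd(mydata8$Price) / sqrt(nrow(mydata8))) #Should match t = 2.9022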
fit1 <- lm(Price ~ Age,
data = mydata8)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
corr_coeff <- cor(mydata8$Age, mydata8$Price,
method = "pearson")
print(corr_coeff)
## [1] -0.230255
Regression function: E(Y|X) = β0 + β1 × X1 → Price = 2185.455 - 8.975 × Age
Estimate of regression coefficients: the intercept tells us that if Age (X1) equals 0 and everything else stays constant, the average Price is 2185.455 eur. The regression coefficient for Age tells us that if Age increases by 1 year and everything else stays constant, Price decreases on average by 8.975 eur.
On top of that, at a 5% significance level both coefficients differ from 0, so I reject H0 (p < 0.05 for both).
Coefficient of correlation: it measures the strength and direction of the linear relationship between two variables. In my case Age and Price are weakly negatively correlated, as the correlation coefficient is -0.23.
Coefficient of determination: the multiple R-squared statistic tells us that 5.3% of the variability in Price can be explained by the Age variable (X1).
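For a simple linear regression, R-squared is just the squared Pearson correlation between the two variables; a quick check (my addition):

cor(mydata8$Age, mydata8$Price)^2 #Should equal the Multiple R-squared of fit1, about 0.053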
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplotMatrix(mydata8[ , c(1, 2, 3)],
smooth = FALSE)
Multicollinearity concerns the relationships between the explanatory variables, in this case Age and Distance, and occurs when two or more explanatory variables in the model are highly correlated with each other. When it is present, it is hard to determine the individual contribution of each explanatory variable to the dependent variable, and it inflates the variance of the coefficient estimates. Because the slope of the regression line in the scatterplot matrix of the explanatory variables (Age and Distance) is close to 0, I can conclude that there is no significant correlation between them. There is no multicollinearity.
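The visual impression can be backed up by the correlation between the two explanatory variables; a quick numeric check (my addition; the value itself is not shown in the original output):

cor(mydata8$Age, mydata8$Distance) #Should be close to 0, consistent with the flat regression line in the scatterplot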
fit2 <- lm(Price ~ Age + Distance,
data = mydata8)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
The Variance Inflation Factor (VIF) is a statistical tool used to assess and quantify multicollinearity between explanatory variables. If VIF < 5 for every explanatory variable, we can include them all in the model; if VIF ≥ 5, there is too much overlap and the offending variable should be removed or examined, as it causes multicollinearity. The mean of all VIF values should be close or equal to 1. In my case I can conclude that there is no multicollinearity between Age and Distance, as both VIF values are well below 5 and their mean is close to 1.
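A VIF can also be computed by hand as 1 / (1 - R²), where R² comes from regressing one explanatory variable on the others; a sketch for Age (my addition):

r2_age <- summary(lm(Age ~ Distance, data = mydata8))$r.squared #R-squared of Age regressed on Distance
1 / (1 - r2_age) #Should match vif(fit2) for Age, about 1.0018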
mydata8$StdRes <- round(rstandard(fit2), 3)
hist(mydata8$StdRes,
main = "Histogram of standardized residuals",
col = "deepskyblue",
xlab = "Standardized residuals",
ylab = "Frequency",
xlim = c(-3, 3))
mydata8$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata8$CooksD,
main = "Histogram of Cook's distances",
col = "deeppink2",
xlab = "Cook's distance",
ylab = "Frequency")
head(mydata8[order(-mydata8$CooksD), ])
## # A tibble: 6 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdRes CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32
## 2 43 37 1740 0 0 No No 1.44 0.104
## 3 2 11 2790 1 0 Yes No 2.05 0.069
## 4 7 2 1760 0 1 No Yes -2.15 0.066
## 5 37 3 2540 1 1 Yes Yes 1.58 0.061
## 6 40 2 2400 0 1 No Yes 1.09 0.038
Just by looking at the standardized-residuals histogram I can see that there are no outliers that need to be removed, as no value is smaller than -3 or larger than 3. However, the Cook's distance histogram shows that one observation has a much higher Cook's distance than the others, giving it a disproportionate influence on the regression model. The standardized residual of this observation is 2.577, which is close to 3. I therefore decide to remove this observation from the data set; this will be done after finishing all further exercises involving fit2.
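For completeness, a common rule of thumb (my addition, not the criterion used above) flags observations whose Cook's distance exceeds 4/n:

cutoff <- 4 / nrow(mydata8) #About 0.047 for 85 observations
which(mydata8$CooksD > cutoff) #Rows of potentially influential observations by this stricter rule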
mydata8$StdFitted <- scale(fit2$fitted.values)
library(car)
scatterplot(x = mydata8$StdFitted, y = mydata8$StdRes,
xlab = "Standardized fitted values",
ylab = "Standardized residuals",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
Heteroscedasticity means that the variance of the residuals is not constant but changes across the values of the explanatory variables. On the scatterplot I can see that the variance of the standardized residuals is roughly constant, which means heteroscedasticity is not present. This is the case of homoscedasticity.
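A formal test can complement the visual check; for example, the test for non-constant error variance from car, which is already loaded (my addition):

ncvTest(fit2) #H0: constant error variance, so a large p-value supports homoscedasticity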
hist(mydata8$StdRes,
main = "Histogram of standardized residuals",
xlab = "Standardized residuals",
ylab = "Density",
col = "darksalmon",
prob = TRUE,
xlim = c(-3, 3))
curve(dnorm(x, mean = mean(mydata8$StdRes), sd = sd(mydata8$StdRes)),
col = "red",
lwd = 2,
add = TRUE)
shapiro.test(mydata8$StdRes)
##
## Shapiro-Wilk normality test
##
## data: mydata8$StdRes
## W = 0.95303, p-value = 0.003645
From the histogram I can already see that the standardized residuals are not normally distributed. For clarity I have added the normal density curve, which shows what the distribution of the residuals should look like if it were normal. I then tested this assumption with the Shapiro-Wilk normality test.
At a 5% significance level I reject H0 (p = 0.0036): the residuals are not normally distributed.
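Another common visual check is a normal Q-Q plot of the standardized residuals (my addition):

qqnorm(mydata8$StdRes, main = "Normal Q-Q plot of standardized residuals")
qqline(mydata8$StdRes, col = "red") #Points bending away from the line indicate non-normality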
mydata8 <- mydata8[!(mydata8$CooksD == 0.320), ] #Remove the influential observation (Cook's distance = 0.320)
fit2 <- lm(Price ~ Age + Distance,
data = mydata8)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
sqrt(summary(fit2)$r.squared)
## [1] 0.6955609
First, I removed the observation with a Cook's distance of 0.320. Then I re-estimated fit2 to obtain the new regression estimates without that observation.
New regression function: Price = 2456.076 - 6.464 × Age - 22.955 × Distance
Estimate of regression coefficients: the intercept tells us that if Age (X1) and Distance (X2) equal 0 and everything else stays constant, the average Price is 2456.076 eur. The regression coefficient for Age tells us that if Age increases by 1 year and everything else stays constant, Price decreases on average by 6.464 eur. The regression coefficient for Distance tells us that if Distance increases by 1 unit and everything else stays constant, Price decreases on average by 22.955 eur.
At a 5% significance level all three coefficients differ from 0, so I reject H0 (p < 0.05 for each).
Coefficient of correlation: taking the square root of R-squared gives the multiple correlation coefficient. In my case Price is fairly strongly associated with the combination of Age and Distance, as the coefficient is 0.696 (the multiple correlation is reported without a sign).
Coefficient of determination: 48.38% of the variability in Price can be explained by the Age and Distance variables (X1 and X2).
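Confidence intervals for the coefficients make the same point as the p-values; a small sketch (my addition):

confint(fit2) #95% confidence intervals; none of them should contain 0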
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = mydata8)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Analysis of variance is generally used to determine whether there are significant differences between the means of three or more groups; here it serves as a partial F-test comparing two nested regression models, where H0 states that the additional variables add no explanatory power.
We reject H0 at a 5% significance level. Because p < 0.05, the ANOVA indicates that model 2 (fit3), the more complex model with the additional variables, is better.
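An alternative way to compare the two models is an information criterion such as AIC, where lower is better (my addition):

AIC(fit2, fit3) #fit3 should show the lower AIC if the extra variables are worth their complexity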
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
The regression coefficient for Parking tells us that if an apartment has parking on its premises and everything else stays unchanged, its Price is on average 167.53 eur higher. The coefficient for Balcony tells us that if an apartment has a balcony and everything else stays unchanged, its Price is on average 15.21 eur lower, although this effect is not statistically significant (p = 0.798).
F-statistic hypotheses: H0 states that all regression coefficients are equal to 0, H1 that at least one of them differs from 0. With p = 3.018e-12 < 0.05, I reject H0, so the model as a whole is statistically significant.
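To see the estimated model in action, one can predict the average price of a hypothetical apartment; the input values below are illustrative choices of mine, not from the data:

newflat <- data.frame(Age = 10, Distance = 5,
ParkingF = factor("Yes", levels = c("No", "Yes")),
BalconyF = factor("No", levels = c("No", "Yes"))) #Hypothetical apartment
predict(fit3, newdata = newflat) #Estimated average price in eur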
mydata8$Fitted <- fitted.values(fit3)
mydata8$Residuals <- residuals(fit3)
round(mydata8[2, 12], 3) #Residual for the apartment in row 2 (column 12 = Residuals)
## # A tibble: 1 × 1
## Residuals
## <dbl>
## 1 428.
The residual for apartment ID2 tells me that the actual price of this apartment is 427.8 eur higher than the value estimated by the regression.
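The same value can be reproduced directly from the definition of a residual, actual price minus fitted price (my addition):

mydata8$Price[2] - mydata8$Fitted[2] #Should equal mydata8$Residuals[2], about 427.8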