#data(package = .packages(all.available = TRUE))
#install.packages("carData")
library(carData)
#data("Wong")
mydata <- Wong  # copy the Wong data frame into mydata
head(mydata,3)
## id days duration sex age piq viq
## 1 3358 30 4 Male 20.67077 87 89
## 2 3535 16 17 Male 55.28816 95 77
## 3 3547 40 1 Male 55.91513 95 116
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata <- mydata %>% mutate(id = row_number())
mydata <- rename(mydata, gender = sex)
mydata <- rename_with(mydata, toupper)
mydata <- mydata[-2]
head(mydata)
## ID DURATION GENDER AGE PIQ VIQ
## 1 1 4 Male 20.67077 87 89
## 2 2 17 Male 55.28816 95 77
## 3 3 1 Male 55.91513 95 116
## 4 4 10 Male 61.66461 59 73
## 5 5 6 Male 30.12731 67 73
## 6 6 3 Male 57.06229 76 69
- ID: patient ID number
- DAYS: number of days post coma at which IQs were measured (dropped from mydata above)
- DURATION: duration of the coma in days
- GENDER: a factor with levels Female and Male
- AGE: age in years at the time of injury
- PIQ: performance (i.e., mathematical) IQ
- VIQ: verbal IQ
mydataM <- mydata[mydata$GENDER=="Male",]
mydataF <- mydata[mydata$GENDER=="Female",]
#install.packages("psych")
library(psych)
describe(mydata[,c(-1,-3)])
## vars n mean sd median trimmed mad min max range skew
## DURATION 1 331 14.30 26.04 7.00 8.85 8.90 0.00 255.00 255.00 4.80
## AGE 2 331 31.85 13.87 26.88 30.05 10.03 6.51 80.03 73.52 1.05
## PIQ 3 331 87.56 15.13 87.00 87.02 14.83 50.00 133.00 83.00 0.33
## VIQ 4 331 94.96 14.05 94.00 94.58 16.31 64.00 132.00 68.00 0.24
## kurtosis se
## DURATION 30.75 1.43
## AGE 0.19 0.76
## PIQ 0.12 0.83
## VIQ -0.53 0.77
DURATION: The mean coma duration is 14.30 days. The median is 7 days: 50% of the observations have a duration below 7 days and 50% above. The range (difference between the maximum and minimum) is 255 days.
AGE: The mean age at the time of injury is 31.85 years. The median is 26.88 years: 50% of the observations are below this age and 50% above. The range is 73.52 years.
PIQ: The mean performance IQ is 87.56. The median is 87: 50% of the observations have a PIQ below 87 and 50% above. The range is 83 points.
VIQ: The mean verbal IQ is 94.96. The median is 94: 50% of the observations have a VIQ below 94 and 50% above. The range is 68 points.
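As a quick cross-check of the describe() output, the same summary measures can be computed with base R (a minimal sketch, assuming mydata still holds the renamed data prepared above):
# Cross-check: mean, median and range of the numeric variables with base R
sapply(mydata[, c("DURATION", "AGE", "PIQ", "VIQ")], function(x) {
  c(mean = mean(x), median = median(x), range = diff(range(x)))
})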
hist(mydata[,2],
main="Duration of the coma (DURATION)",
xlab="Duration of coma in days",
breaks = seq(from=0, to=300, by=2),
col="darkgray")
The histogram of coma duration is positively skewed. There are a few outliers visible in the right tail.
Removing outliers (DURATION ≥ 75 days):
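Before filtering, it is worth checking how many observations the rule affects (a minimal sketch; the 75-day cutoff matches the subset() call below):
# Count the observations with a coma duration of 75 days or more (to be dropped below)
sum(mydata$DURATION >= 75)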
#install.packages("base")
library(base)
mydata <- subset(mydata, DURATION < 75 )
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(mydata, aes(x = AGE, fill = GENDER)) +
geom_histogram(position=position_dodge(width=3), binwidth = 5, colour="darkgray") +
ggtitle("Age") +
ylab("Frequency") +
xlab("Age") +
labs(fill="Gender") +
scale_fill_brewer(palette="Set1")
The histogram of age is positively skewed and unimodal. The majority of people in the sample are 20-30 years old.
library(ggplot2)
ggplot(mydata, aes(x = PIQ, fill = GENDER)) +
geom_histogram(position=position_dodge(width=3), binwidth = 5, colour="black") +
ggtitle("Performance IQ") +
ylab("Frequency") +
labs(fill="Gender")
The histogram of Performance IQ is unimodal. There are no obvious outliers in the histogram.
ggplot(mydata, aes(x = VIQ, fill = GENDER)) +
geom_histogram(position=position_dodge(width=3), binwidth = 5, colour="black") +
ggtitle("Verbal IQ") +
ylab("Frequency") +
labs(fill="Gender")
The histogram of Verbal IQ is bimodal. There are no obvious outliers in the histogram.
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplotMatrix(mydataF[ ,c(-1, -3)],
smooth = FALSE)
According to the scatterplot matrix above, we can conclude that as the duration of the coma increases, both VIQ and PIQ decrease.
Additionally, individuals with higher VIQ scores also tend to have higher PIQ scores.
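These visual impressions can be quantified with a correlation matrix (a minimal sketch, using the same female subset and columns as the scatterplot matrix):
# Pairwise correlations among the numeric variables in the female subset
round(cor(mydataF[, c("DURATION", "AGE", "PIQ", "VIQ")]), 2)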
ggplot(mydata, aes(x = DURATION, y = PIQ)) +
geom_point() +
ggtitle("Effect of duration of the coma on PIQ") +
xlab("Duration of coma in days") +
ylab("PIQ") +
facet_wrap(~GENDER, ncol = 1) +
geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)
We can conclude from the scatterplot above that males experience a smaller effect of prolonged coma duration on Performance IQ than females. On average, a woman's PIQ decreases by approximately 35 points after 60 days in a coma, while a man's PIQ drops by approximately 15 points.
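The two regression lines drawn by geom_smooth() can be reproduced numerically (a minimal sketch, fitting a separate simple regression per gender on the filtered data):
# Gender-specific slopes of PIQ on coma duration
coef(lm(PIQ ~ DURATION, data = subset(mydata, GENDER == "Female")))
coef(lm(PIQ ~ DURATION, data = subset(mydata, GENDER == "Male")))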
ggplot(mydata, aes(x = DURATION, y = VIQ)) +
geom_point() +
ggtitle("Effect of duration of the coma on VIQ") +
xlab("Duration of coma in days") +
ylab("VIQ") +
facet_wrap(~GENDER, ncol = 1) +
geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)
We can conclude from the scatterplot above that males experience a smaller effect of prolonged coma duration on Verbal IQ than females. On average, a woman's VIQ decreases by approximately 15 points after 60 days in a coma, while a man's VIQ drops by less than 5 points.
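The difference between the two slopes can also be examined directly with an interaction term (a minimal sketch; since Female is the reference level, the interaction coefficient estimates how much the male slope differs from the female slope):
# Interaction model: does the effect of coma duration on VIQ differ by gender?
summary(lm(VIQ ~ DURATION * GENDER, data = mydata))$coefficients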
mydata <- read.table("~/Desktop/R Take Home Exam/Task 2/Body mass.csv",header=TRUE, sep=";", dec=",")
head(mydata,10)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
## 7 7 62.7
## 8 8 64.5
## 9 9 59.5
## 10 10 68.9
Description:
- ID: ID of the ninth grader
- Mass: body weight of the ninth grader in kg
#install.packages("psych")
library(psych)
describe(mydata[-1])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Mass 1 50 62.88 6.01 62.8 62.56 3.34 49.7 83.2 33.5 0.85 2.11 0.85
Comment on the parameter estimates: the average ninth grader in our sample (n = 50, N = N/A) weighs 62.88 kg. The heaviest weighs 83.2 kg and the lightest 49.7 kg. Half of them have a body weight below 62.8 kg.
library(graphics)
hist(mydata[,-1],
main="Histogram of body weight",
xlab="Weight in kg",
breaks = seq(from=40, to=90, by=5),
col="darkgray")
Comment on the histogram of body weight:
- positively skewed
- leptokurtic
- unimodal
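These shape claims can be verified numerically with psych's shape statistics (a minimal sketch; skewness and kurtosis also appear in the describe() output above):
# Numerical check of skewness and kurtosis of body weight
c(skewness = skew(mydata$Mass), kurtosis = kurtosi(mydata$Mass))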
t.test(mydata$Mass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
H0: the true arithmetic mean = 59.5
H1: the true arithmetic mean is not equal to 59.5
(significance level alpha = 5%)
It is extremely unlikely that the average body weight of ninth graders at the beginning of the 2021/2022 school year is the same as it was in the 2018/2019 school year. We reject the null hypothesis at p < 0.001.
We are 95% confident that the true arithmetic mean (the average body weight of ninth graders at the beginning of the 2021/2022 school year) lies in the interval (61.16758, 64.58442). In other words, if we drew 100 different samples, about 5 of the resulting confidence intervals would not contain the true mean.
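The reported confidence interval can be reconstructed by hand from the sample mean, its standard error, and the t quantile (a minimal sketch):
# Manual reconstruction of the 95% confidence interval reported by t.test()
n  <- length(mydata$Mass)
se <- sd(mydata$Mass) / sqrt(n)
mean(mydata$Mass) + c(-1, 1) * qt(0.975, df = n - 1) * se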
t = 3.9711
df = 49
sqrt(t*t/(t*t+df))
## [1] 0.4934295
Effect size interpretation: the calculated value of r ≈ 0.493 suggests a medium to large effect of the period (2018/19 vs. 2021/22 school year) on the body weight of ninth graders.
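For comparison, Cohen's d is another common effect-size measure for a one-sample test (a minimal sketch; values around 0.5 are conventionally read as a medium effect):
# Cohen's d: standardized difference between the sample mean and mu = 59.5
(mean(mydata$Mass) - 59.5) / sd(mydata$Mass)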
#install.packages("car")
library(car)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("reshape2")
library(reshape2)
#install.packages("readxl")
library(readxl)
mydata <- read_xlsx("~/Desktop/R Take Home Exam/Task 3/Apartments.xlsx")
head(mydata)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
mydata$ParkingF <- factor(mydata$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
mydata$BalconyF <- factor(mydata$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
t.test(mydata$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
It is extremely unlikely that the average price per m2 of an apartment is 1900. We reject the null hypothesis at p < 0.005.
We are 95% confident that the true arithmetic mean (the average price per m2 of an apartment) lies in the interval (1937.443, 2100.440). In other words, if we drew 100 different samples, about 5 of the resulting confidence intervals would not contain the true mean.
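The interval and the point estimate can also be extracted from the stored test object instead of being read off the printout (a minimal sketch):
# Store the test result and extract the confidence interval and sample mean
tt <- t.test(mydata$Price, mu = 1900, alternative = "two.sided")
tt$conf.int
tt$estimate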
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
#Dependent variable ~ Explanatory variables
fit1 <- lm(Price ~ Age,
data = mydata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(mydata$Price, mydata$Age)
## [1] -0.230255
The estimated regression coefficient describes the relationship between the explanatory variable and the response. The value -8.975 indicates that as the age of an apartment increases by 1 year, the price per m2 decreases on average by 8.975 eur.
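The slope interpretation can be illustrated with predictions for two hypothetical apartments that differ in age by exactly one year (a minimal sketch; the ages 10 and 11 are arbitrary):
# The two predicted prices differ by the Age coefficient (-8.975)
predict(fit1, newdata = data.frame(Age = c(10, 11)))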
Coefficient of correlation = -0.23 (interpreted by absolute value):
0.0 - 0.1: very weak
0.1 - 0.3: weak <- we have a weak negative relationship
0.3 - 0.7: semi-strong
0.7 - 0.9: strong
0.9 - 1.0: very strong (all points lie on the line)
The coefficient of determination describes the proportion of explained variability of Y. Based on the obtained value (5.302%), we can conclude that the age of an apartment has only a small effect on the apartment price: 5.302% of the variability in price is explained by the linear effect of age.
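The coefficient of determination can also be read directly from the fitted model rather than from the printed summary (a minimal sketch):
# Extract R-squared from the simple regression
summary(fit1)$r.squared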
library(car)
scatterplotMatrix(mydata[,c(1,2,3)], smooth=FALSE)
Based on the scatterplot matrix, there is no apparent multicollinearity problem.
fit2 <- lm(Price ~ Age + Distance,
data = mydata)
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
There is no strong relationship between the explanatory variables: both VIFs are below 5 and the mean VIF is approximately 1.
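As a reminder of what VIF measures, it can be reconstructed from the R-squared of regressing one explanatory variable on the other (a minimal sketch):
# VIF for Age = 1 / (1 - R^2) from regressing Age on Distance
r2_age <- summary(lm(Age ~ Distance, data = mydata))$r.squared
1 / (1 - r2_age)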
#Function rstandard
mydata$StdResid <- round(rstandard(fit2), 3)
hist(mydata$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
Standardized residuals are regarded as outliers if they fall outside the interval (-3, +3). Based on the standardized residuals, we do not need to remove any observations.
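An explicit check of the rule above (a minimal sketch):
# Which standardized residuals fall outside the interval (-3, +3)?
which(abs(mydata$StdResid) > 3)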
mydata$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata$CooksD,
xlab = "Cook's distance",
ylab = "Frequency",
main = "Histogram of Cook's distances")
Units are regarded as highly influential if their Cook's distance is greater than 1 or is substantially higher than the other Cook's distances.
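An explicit check against the first rule of thumb (a minimal sketch):
# Which observations have a Cook's distance greater than 1?
which(mydata$CooksD > 1)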
head(mydata[order(-mydata$CooksD),])
## # A tibble: 6 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32
## 2 43 37 1740 0 0 No No 1.44 0.104
## 3 2 11 2790 1 0 Yes No 2.05 0.069
## 4 7 2 1760 0 1 No Yes -2.15 0.066
## 5 37 3 2540 1 1 Yes Yes 1.58 0.061
## 6 40 2 2400 0 1 No Yes 1.09 0.038
We remove unit #38, the observation with the largest Cook's distance.
mydata2 <- mydata[-38,]
hist(mydata2$CooksD,
xlab = "Cook's distance",
ylab = "Frequency",
main = "Histogram of Cook's distances")
mydata$StdFittedValues <- round(scale(fit2$fitted.values),3)
library(car)
scatterplot(y = mydata$StdResid, x = mydata$StdFittedValues,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = TRUE,
smooth = FALSE)
From the plot we can assume there is no heteroskedasticity, because the variance of the residuals does not change much across the fitted values.
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ---------------------------
## DF = 1
## Chi2 = 0.968106
## Prob > Chi2 = 0.325153
H0: the variance is constant (homoskedasticity) <- p = 32.5% > alpha = 5%, so we cannot reject H0
H1: the variance is not constant (heteroskedasticity)
hist(mydata2$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
From the histogram we can conclude that the standardized residuals are not normally distributed.
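A Q-Q plot gives another quick visual check of normality (a minimal sketch, using the same standardized residuals):
# Q-Q plot of the standardized residuals against the normal distribution
qqnorm(mydata2$StdResid)
qqline(mydata2$StdResid)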
A formal test of the normality of the standardized residuals is the Shapiro-Wilk normality test.
shapiro.test(mydata2$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata2$StdResid
## W = 0.94879, p-value = 0.002187
H0: the variable is normally distributed
H1: the variable is NOT normally distributed <- p = 0.2187% < alpha = 5%, so we reject H0
This suggests that the included variables do not explain all of the trends in the dataset.
I think more variables should be included, such as view, floor level, physical state of the property, etc.
#fit2.2 is fit2 with 1 erased observation
fit2.2 <- lm(Price ~ Age + Distance,
data = mydata2)
summary(fit2.2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
The estimated regression coefficients describe the relationship between each explanatory variable and the response.
- The estimated coefficient of Age, -6.464, indicates that as the age of an apartment increases by 1 year, the price per m2 decreases on average by 6.464 eur, holding distance constant.
- The estimated coefficient of Distance, -22.955, indicates that as the distance from the center increases by 1 km, the price per m2 decreases on average by 22.955 eur, holding age constant.
The coefficient of determination describes the proportion of explained variability of Y. Multiple R-squared = 0.4838 -> 48.38% of the variability in apartment price is explained by the linear effects of distance from the center and age of the apartment.
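The fitted model can be used for prediction; for example, for a hypothetical apartment that is 10 years old and 5 km from the center (a minimal sketch; both values are arbitrary):
# Predicted price per m2 for Age = 10 years and Distance = 5 km
predict(fit2.2, newdata = data.frame(Age = 10, Distance = 5))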
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = mydata2)
anova(fit2.2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: fit2.2 is more appropriate (the added variables do not improve the model)
H1: fit3 is more appropriate
Pr(>F) = 3.05% < alpha = 5%, so we reject H0: fit3 is more appropriate.
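The adjusted R-squared of the two nested models points in the same direction (a minimal sketch):
# Compare adjusted R-squared of the nested models
c(fit2.2 = summary(fit2.2)$adj.r.squared, fit3 = summary(fit3)$adj.r.squared)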
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
Regression coefficients:
Parking: the price per m2 of an apartment with a parking space is on average 167.53 eur higher than that of an apartment without one.
Balcony: the price per m2 of an apartment with a balcony is on average 15.21 eur lower than that of an apartment without one (this coefficient is not statistically significant, p = 0.798).
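The parking coefficient can be illustrated by predicting the price of two otherwise identical hypothetical apartments that differ only in the parking dummy (a minimal sketch; the Age, Distance and Balcony values are arbitrary):
# Two predictions that differ only in ParkingF; their difference equals the ParkingFYes coefficient
newdat <- data.frame(Age = 10, Distance = 5,
                     ParkingF = factor(c("No", "Yes"), levels = c("No", "Yes")),
                     BalconyF = factor("No", levels = c("No", "Yes")))
predict(fit3, newdata = newdat)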
ANOVA / F-test:
H0: ρ² = 0
H1: ρ² is not equal to 0
F = 22.04 with p-value = 3.018e-12 < alpha = 5%, so we reject H0: the model explains a significant share of the variability in price.
mydata2$StdFittedValues <- round(scale(fit3$fitted.values),3)
mydata2$Residuals <- residuals(fit3)
mydata2$Residuals[2]
## 2
## 427.8029