zdravstvo <- readRDS("~/IMB/statistika/R data/Bootcamp IMB/zdravstvo (1).rds")
head(zdravstvo)
## country alko bdppc izdatki pzd tobak deu
## 1 1 0.62 1960 100 70.20 39.0 0
## 2 2 1.68 31317 2038 79.55 29.0 1
## 3 3 6.34 1807 100 68.45 34.3 0
## 4 4 1.46 29311 2263 78.55 27.0 1
## 5 5 2.60 2535 153 72.15 32.7 0
## 6 6 1.09 6369 363 75.50 25.3 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
zdravstvo1 <- rename(zdravstvo,"alcohol"="alko","gdppc"="bdppc","expenditure"="izdatki","life_expectancy"="pzd","Smokers"="tobak","EU_member"="deu")
head(zdravstvo1)
## country alcohol gdppc expenditure life_expectancy Smokers EU_member
## 1 1 0.62 1960 100 70.20 39.0 0
## 2 2 1.68 31317 2038 79.55 29.0 1
## 3 3 6.34 1807 100 68.45 34.3 0
## 4 4 1.46 29311 2263 78.55 27.0 1
## 5 5 2.60 2535 153 72.15 32.7 0
## 6 6 1.09 6369 363 75.50 25.3 0
#changing particular value
zdravstvo1[2,4]<-1500
#creating new variable and informing R that we have non-numerical variable
zdravstvo1$EU_memberF <- factor(zdravstvo1$EU_member,
levels = c(1, 0),
labels = c("EU", "Non_EU"))
head(zdravstvo1)
## country alcohol gdppc expenditure life_expectancy Smokers EU_member
## 1 1 0.62 1960 100 70.20 39.0 0
## 2 2 1.68 31317 1500 79.55 29.0 1
## 3 3 6.34 1807 100 68.45 34.3 0
## 4 4 1.46 29311 2263 78.55 27.0 1
## 5 5 2.60 2535 153 72.15 32.7 0
## 6 6 1.09 6369 363 75.50 25.3 0
## EU_memberF
## 1 Non_EU
## 2 EU
## 3 Non_EU
## 4 EU
## 5 Non_EU
## 6 Non_EU
#I have already renamed the variable names
#deleting forth row
zdravstvo2 <- zdravstvo1[-4,]
#deleting fifth and sixth column
zdravstvo3 <- zdravstvo1[,c(-5,-6)]
#data including only countries, where GDP per capita is higher than 25000 USD
zdravstvo4 <- zdravstvo1[zdravstvo1$gdppc > 25000,]
#creating new variable that doubles the expenditure
zdravstvo4$expenditure_New <- zdravstvo4$expenditure *2
summary(zdravstvo1[,c(-1,-7)]) #summary without country and EU_member
## alcohol gdppc expenditure life_expectancy Smokers
## Min. :0.620 Min. : 1055 Min. : 43 Min. :65.20 Min. :18.00
## 1st Qu.:1.617 1st Qu.: 5893 1st Qu.: 269 1st Qu.:72.79 1st Qu.:25.23
## Median :2.235 Median :14911 Median :1172 Median :77.38 Median :28.00
## Mean :2.863 Mean :20323 Mean :1449 Mean :75.87 Mean :28.65
## 3rd Qu.:3.717 3rd Qu.:31425 3rd Qu.:2470 3rd Qu.:78.75 3rd Qu.:32.77
## Max. :7.640 Max. :59670 Max. :4329 Max. :80.70 Max. :39.00
## EU_memberF
## EU :15
## Non_EU:17
##
##
##
##
library(psych)
describe(zdravstvo1[,c(-1,-7)])
## vars n mean sd median trimmed mad min
## alcohol 1 32 2.86 1.77 2.24 2.63 1.36 0.62
## gdppc 2 32 20322.91 16153.42 14911.00 18965.00 19314.57 1055.00
## expenditure 3 32 1449.44 1243.09 1171.50 1341.69 1457.40 43.00
## life_expectancy 4 32 75.87 4.25 77.38 76.37 3.37 65.20
## Smokers 5 32 28.65 5.20 28.00 28.68 5.93 18.00
## EU_memberF* 6 32 1.53 0.51 2.00 1.54 0.00 1.00
## max range skew kurtosis se
## alcohol 7.64 7.02 1.06 0.23 0.31
## gdppc 59670.00 58615.00 0.51 -0.88 2855.55
## expenditure 4329.00 4286.00 0.53 -0.96 219.75
## life_expectancy 80.70 15.50 -0.90 -0.37 0.75
## Smokers 39.00 21.00 0.02 -0.76 0.92
## EU_memberF* 2.00 1.00 -0.12 -2.05 0.09
Explanations:
hist(zdravstvo1$expenditure,
main ="Health expenditure per capita in USD",
xlab = "Expenditure",
ylab = "Frequency",
col = "blue",
lwd = 2)
- In 15 countries, health expenditure per capita was between $0 and
$1,000. In 5 countries, health expenditure per capita was between $1,000
and $2,000, in 8 countries between $2,000 and $3,000, in three countries
between $3,000 and $4,000. In only one country, health expenditures per
capita was between $4,000 and $5,000.
boxplot(zdravstvo1$gdppc)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplot(y = zdravstvo1$life_expectancy,
x = zdravstvo1$alcohol,
ylab = "Life expectancy in ages)",
xlab = "Consumption of pure alcohol per inhabitant in liters",
smooth = FALSE,
boxplots=FALSE)
- The variables “life expectancy” and “alcohol” are negatively
correlated. When the consumption of pure alcohol per inhabitant in
liters is higher, the life expectancy in years is lower.
mydata <- read.table("./Body mass.csv",
header = TRUE, #you need to tell the R that in the first row there are name of the variables
sep = ";",
dec = ",")
head (mydata)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
summary(mydata[,-1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 49.70 60.23 62.80 62.88 64.50 83.20
hist(mydata[,-1],
main = "Distribution of body mass",
xlab = "Kilogram",
ylab = "Frequency",
lwd = 2,
breaks = seq(from = 40, to = 90, by = 10))
t.test(mydata$Mass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
We reject H0 at p<0.001 and accept H1. Average weight in the year 2021/2022 is different than it was in 2018/2019. The body weight has increased.
95% confident interval for aritmetic mean: 61.2 < Mu < 64.6, we can also reject H0 based on this interval.
Calculating the effect size
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(mydata$Mass, mu=59.5)
## Cohen's d | 95% CI
## ------------------------
## 0.56 | [0.26, 0.86]
##
## - Deviation from a difference of 59.5.
interpret_cohens_d(0.56,rules="sawilowsky2009")
## [1] "medium"
## (Rules: sawilowsky2009)
library(readxl)
apartments <- read_xlsx("./Apartments.xlsx")
head(apartments)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
Age: Age of an apartment in years
Distance: The distance from city center in km
Price: Price per m2
Parking: 0-No, 1-Yes
Balcony: 0-No, 1-Yes
apartments$ParkingF <- factor(apartments$Parking,
levels = c (0, 1),
labels = c ("No", "Yes"))
apartments$BalconyF <- factor(apartments$Balcony,
levels = c (0, 1),
labels = c ("No", "Yes"))
t.test(apartments$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(Price ~ Age,
data = apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(apartments$Price, apartments$Age)
## [1] -0.230255
library(car)
scatterplotMatrix(apartments[ , c(-4, -5,-6,-7)], #Scatterplot matrix, when we have more then two variables matrix
smooth = FALSE)
- Based on the pictures, I don’t see a problem with multicollinearity.
There is no strong correlation between age and distance (the trend line
is horizontal; therefore, I assume there is neither a positive nor a
negative correlation, and we also can’t draw a clear line through the
points).
fit2 <- lm(Price ~ Age + Distance,
data = apartments)
vif(fit2) #Multicolinearity
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
apartments$StdResid <- round(rstandard(fit2), 3) #Standardized residuals
apartments$CooksD <- round(cooks.distance(fit2), 3) #Cooks distances
head(apartments[order(-apartments$CooksD),], 6)
## # A tibble: 6 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32
## 2 43 37 1740 0 0 No No 1.44 0.104
## 3 2 11 2790 1 0 Yes No 2.05 0.069
## 4 7 2 1760 0 1 No Yes -2.15 0.066
## 5 37 3 2540 1 1 Yes Yes 1.58 0.061
## 6 40 2 2400 0 1 No Yes 1.09 0.038
apartments <- apartments[-38, ] #removing the unit with highest st.res & cooks distance
fit2 <- lm(Price ~ Age + Distance,
data = apartments)
apartments$StdFitted <- scale(fit2$fitted.values) #scale function - for standardization
library(car)
scatterplot(y = apartments$StdResid, x = apartments$StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
- Based on the scatterplot, I don’t observe any potential
heteroskedasticity. The variability of the residuals appears to be
constant across all levels of the explanatory variables. If there were
any heteroskedasticity, we would expect to see a clear spread of the
dots.
hist(apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: apartments$StdResid
## W = 0.94879, p-value = 0.002187
H0: Standardized residuals are distributed normally
H1: Standardized residuals are not distributed normally
p= 0.002 ; We can reject H0 at p=0.002, and accept H1. Standardized residuals are not distributed normally
fit2 <- lm(Price ~ Age + Distance,
data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = apartments)
anova(fit2,fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: Model 1 is better
H1: Model 2 is better
p = 0.03 ; We reject the null hypothesis at p = 0.03. Model 2 is better >>> it explains more variability in price of the apartments.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
B3: Assuming that all other variables are the same, the price per m2 of the apartments with a parking on average increase for 167.53 eur, compared to apartments without parking (p = 0.009).
B4: We can not say that Balcony has statistical effect on price per m2 of the apartment (p=0.80)
F-statistic
H0: All explanatory coefficient = 0
H1: All explanatory coefficient != 0
We reject the H0, and accept H1, at least one explanatory variable has significant effect on dependent variable.
apartments$Fitted <- fitted.values(fit3)
apartments$Residuals <- residuals(fit3)
head(apartments[colnames(apartments) %in% c("Fitted", "Residuals")])
## # A tibble: 6 × 2
## Fitted Residuals
## <dbl> <dbl>
## 1 1706. -66.0
## 2 2372. 428.
## 3 1721. -61.2
## 4 1563. 287.
## 5 2012. -372.
## 6 1908. -138.