data()
data(package = .packages(all.available=TRUE))
#install.packages("carData")
library(carData)
mydata <- force(Angell)
head(mydata)
## moral hetero mobility region
## Rochester 19.0 20.6 15.0 E
## Syracuse 17.0 15.6 20.2 E
## Worcester 16.4 22.1 13.6 E
## Erie 16.2 14.0 14.8 E
## Milwaukee 15.8 17.4 17.6 MW
## Bridgeport 15.3 27.9 17.5 E
colnames(mydata) <- c("Moral", "EthDiv", "Mobility", "Region") #Changing name of variables
head(mydata)
## Moral EthDiv Mobility Region
## Rochester 19.0 20.6 15.0 E
## Syracuse 17.0 15.6 20.2 E
## Worcester 16.4 22.1 13.6 E
## Erie 16.2 14.0 14.8 E
## Milwaukee 15.8 17.4 17.6 MW
## Bridgeport 15.3 27.9 17.5 E
colnames(mydata)[2] <- "Ethnic_diversity" #Changing name of the 2nd variable
head(mydata)
## Moral Ethnic_diversity Mobility Region
## Rochester 19.0 20.6 15.0 E
## Syracuse 17.0 15.6 20.2 E
## Worcester 16.4 22.1 13.6 E
## Erie 16.2 14.0 14.8 E
## Milwaukee 15.8 17.4 17.6 MW
## Bridgeport 15.3 27.9 17.5 E
Create new dataset, including only ETHNIC DIVERSITY and MOBILITY
mydata2 <- mydata[ , c(2,3)]
Create new data set with units from Midwest
mydata3 <- mydata [mydata$Region== "MW", ]
Create new data set with the condition: Number of residents moving into and out of the cities in Midwest must be between 20 and 35 percent.
mydata4 <- mydata3[mydata3$Ethnic_diversity >= 20 & mydata3$Ethnic_diversity <= 35, ]
head(mydata4)
## Moral Ethnic_diversity Mobility Region
## Dayton 14.3 23.7 23.8 MW
## Akron 11.3 20.4 22.1 MW
## Indianapolis 8.8 29.2 23.1 MW
## Columbus 8.0 27.4 25.0 MW
Change Mobility for Trenton to 15.7 and show the new number.
mydata["Trenton","Mobility"] <- 15.7
print(mydata["Trenton","Mobility"])
## [1] 15.7
Create a new variable called Mobility_scaled, that changes the scale of Mobility variable from 0 to 1.
mydata$Mobility_scaled <- mydata$Mobility / 100
head(mydata)
## Moral Ethnic_diversity Mobility Region Mobility_scaled
## Rochester 19.0 20.6 15.0 E 0.150
## Syracuse 17.0 15.6 20.2 E 0.202
## Worcester 16.4 22.1 13.6 E 0.136
## Erie 16.2 14.0 14.8 E 0.148
## Milwaukee 15.8 17.4 17.6 MW 0.176
## Bridgeport 15.3 27.9 17.5 E 0.175
summary(mydata [ , c(-4, -5)]) #Summary excluding categorical variable and new variable
## Moral Ethnic_diversity Mobility
## Min. : 4.20 Min. :10.60 Min. :12.10
## 1st Qu.: 8.70 1st Qu.:16.90 1st Qu.:19.45
## Median :11.10 Median :23.70 Median :25.90
## Mean :11.20 Mean :31.37 Mean :27.60
## 3rd Qu.:13.95 3rd Qu.:39.00 3rd Qu.:34.80
## Max. :19.00 Max. :84.50 Max. :49.80
library(psych)
describe(mydata [ , c(-4, -5)]) #Describing data excluding categorical and new variable
## vars n mean sd median trimmed mad min max
## Moral 1 43 11.20 3.57 11.1 11.23 4.15 4.2 19.0
## Ethnic_diversity 2 43 31.37 20.41 23.7 28.33 12.01 10.6 84.5
## Mobility 3 43 27.60 9.79 25.9 27.05 10.82 12.1 49.8
## range skew kurtosis se
## Moral 14.8 -0.02 -0.81 0.54
## Ethnic_diversity 73.9 1.18 0.30 3.11
## Mobility 37.7 0.40 -0.80 1.49
library(pastecs)
stat.desc(mydata [ , c(-4, -5)])
## Moral Ethnic_diversity Mobility
## nbr.val 43.0000000 43.0000000 43.0000000
## nbr.null 0.0000000 0.0000000 0.0000000
## nbr.na 0.0000000 0.0000000 0.0000000
## min 4.2000000 10.6000000 12.1000000
## max 19.0000000 84.5000000 49.8000000
## range 14.8000000 73.9000000 37.7000000
## sum 481.6000000 1349.0000000 1186.6000000
## median 11.1000000 23.7000000 25.9000000
## mean 11.2000000 31.3720930 27.5953488
## SE.mean 0.5446915 3.1127223 1.4932219
## CI.mean.0.95 1.0992319 6.2817279 3.0134437
## var 12.7576190 416.6287265 95.8775969
## std.dev 3.5717809 20.4114852 9.7917106
## coef.var 0.3189090 0.6506255 0.3548319
round(stat.desc(mydata [ , c(-4, -5)]), 2) #Rounding to two decimals
## Moral Ethnic_diversity Mobility
## nbr.val 43.00 43.00 43.00
## nbr.null 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00
## min 4.20 10.60 12.10
## max 19.00 84.50 49.80
## range 14.80 73.90 37.70
## sum 481.60 1349.00 1186.60
## median 11.10 23.70 25.90
## mean 11.20 31.37 27.60
## SE.mean 0.54 3.11 1.49
## CI.mean.0.95 1.10 6.28 3.01
## var 12.76 416.63 95.88
## std.dev 3.57 20.41 9.79
## coef.var 0.32 0.65 0.35
hist (mydata$Ethnic_diversity,
main = "Distribution of Ethnic diversity in cities",
xlab = "Heterogenity of of nonwhite and foreign-born white residents",
breaks = seq(from = 0, to = 100, by = 10),
col = "orange" , # Color of the bars
border = "blue") # Color of the border around the bars
Positive skewness (to the right): Majority of cities tend to have lower levels of ethnic diversity among nonwhite and foreign-born white residents.
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot (y = mydata$Moral,
x = mydata$Mobility,
ylab = "Moral Integration",
xlab = "Residents moving into and out of the city (in %)",
smooth = FALSE,
boxplot= FALSE,
col = "orange",#Color of the points
pch = 19) #Point type
Explanation of results:
massdata <- read.table("~/Bootcamp R/Bootcamp23_working/Task 2/Body mass.csv", header= TRUE , sep = ";", dec= ",")
head(massdata)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
library(psych)
describe(massdata$Mass)
## vars n mean sd median trimmed mad min max range skew
## X1 1 50 62.88 6.01 62.8 62.56 3.34 49.7 83.2 33.5 0.85
## kurtosis se
## X1 2.11 0.85
hist(massdata$Mass,
main = "Distribution of Mass",
xlab = "Mass",
col = "orange", # Color of the bars
border = "blue") # Color of the border around the bars
The graph shows normal distribution of mass between the students.
H0: Mu = 59.5 H1: Mu != 59.5
t.test(massdata$Mass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: massdata$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
Estimate of arithmetic mean = 62.876
We compare the p-value to alfa = 5% (0.05) p-value=0.000234 <0.001
We reject H0 and accept H1, at p<0.001 The average mass in 2021/2022 is different than in 2018/2019. The average mass increased.
95% CI (confidence interval) for arithmetic mean is 61.16758 < Mu < 64.58442. We reject H0 at 5%, because 59.9 is not included in the interval.
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(massdata$Mass, mu=59.5) #Calculate Cohen's d effect size for the 'Mass' variable
## Cohen's d | 95% CI
## ------------------------
## 0.56 | [0.26, 0.86]
##
## - Deviation from a difference of 59.5.
0.5 - standardized measure of the effect size
interpret_cohens_d(0.5, rules = "sawilowsky2009") #Choosing sawilowsky2009 for interpretation
## [1] "medium"
## (Rules: sawilowsky2009)
Medium change in mass.
library(readxl)
aprdata <- read_xlsx("~/Bootcamp R/Bootcamp23_working/Task 3/Apartments.xlsx")
aprdata <- as.data.frame(aprdata)
head(aprdata)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
aprdata$ParkingCat <- factor(aprdata$Parking,
levels = c(0,1),
labels = c("No", "Yes"))
aprdata$BalconyCat <- factor(aprdata$Balcony,
levels = c(0,1),
labels = c("No", "Yes"))
head(aprdata)
## Age Distance Price Parking Balcony ParkingCat BalconyCat
## 1 7 28 1640 0 1 No Yes
## 2 18 1 2800 1 0 Yes No
## 3 7 28 1660 0 0 No No
## 4 28 29 1850 0 1 No Yes
## 5 18 18 1640 1 1 Yes Yes
## 6 28 12 1770 0 1 No Yes
library(pastecs)
round(stat.desc(aprdata[c(1,2,3)]), 2) #Statistical description of data, needed for later analysis.
## Age Distance Price
## nbr.val 85.00 85.00 85.00
## nbr.null 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00
## min 1.00 1.00 1400.00
## max 45.00 45.00 2820.00
## range 44.00 44.00 1420.00
## sum 1577.00 1209.00 171610.00
## median 18.00 12.00 1950.00
## mean 18.55 14.22 2018.94
## SE.mean 1.05 1.23 40.98
## CI.mean.0.95 2.09 2.45 81.50
## var 93.96 129.44 142764.34
## std.dev 9.69 11.38 377.84
## coef.var 0.52 0.80 0.19
H0: Mu = 1900 H1: Mu =/ 1900
t.test(aprdata$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: aprdata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Estimate mean for Price = 2018.941
We compare the p-value to alfa = 5% (0.05) p-value < 0.05
We reject H0 and accept H1
95% CI (confidence interval) for arithmetic mean is 1937.443 < Mu < 2100.440. We reject H0 at 5%, because 1900 is not included in the interval.
fit1 <- lm(Price ~ Age,
data = aprdata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = aprdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(aprdata$Price, aprdata$Age)
## [1] -0.230255
Intercept 2185.455 with y-axis represents the price of new apartment, where age is equal to 0. Estimate -8.975 is b1 coefficient, which shows us that if the age of the apartment increases for 1 year, the price of m2 on average is decreased by -8.975 eur (p=0.034).
Coefficient of correlation -0.23: Correlation between price per m2 and age is negative linear relationship and it is weak.
Coefficient of determination: R-squared explains that 5.3% of variability in price of m2 of the apartment is explained linear effect age of the apartment.
library(car)
scatterplotMatrix(aprdata[ , c(1,2,3)],
smooth = FALSE,
col = "orange")
Based on the observing the scatterplot, I would say that there is not a potential problem with multicolinearity. Explanatory variables, age and distance, seem to have a weak positive correlation.
cor(aprdata$Distance, aprdata$Age) #Checking intensity of correlation between distance and age as explanatory variables.
## [1] 0.04290813
This mild correlation 0.04 would not represent a problem in this analysis.
fit2 <- lm(Price ~ Age + Distance,
data = aprdata)
vif(fit2)
## Age Distance
## 1.001845 1.001845
Variance Inflation Factors under 5, no problematic colinearity.
aprdata$StdResid <- round(rstandard(fit2), 3) #Standardized residuals
aprdata$CooksD <- round(cooks.distance(fit2), 3) #Cooks distances
Check standardized residuals with histogram.
hist(aprdata$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Distribution of standardized residuals",
breaks = seq(from = -3, to = 3, by = 0.5),
col = "orange",
border = "blue")
There are no values below -3 or above 3, meaning that we do not have to remove any units from our data.
hist(aprdata$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Distribution of Cooks distances",
col = "orange",
border = "blue")
Although with cooks distance we would typically remove units with values above 1, from this histogram we can conclude that there are some outliers that could impact our data.
head(aprdata[order(-aprdata$CooksD),]) #Showing data with descending order of cooks distance value.
## Age Distance Price Parking Balcony ParkingCat BalconyCat StdResid
## 38 5 45 2180 1 1 Yes Yes 2.577
## 55 43 37 1740 0 0 No No 1.445
## 33 2 11 2790 1 0 Yes No 2.051
## 53 7 2 1760 0 1 No Yes -2.152
## 22 37 3 2540 1 1 Yes Yes 1.576
## 39 40 2 2400 0 1 No Yes 1.091
## CooksD
## 38 0.320
## 55 0.104
## 33 0.069
## 53 0.066
## 22 0.061
## 39 0.038
Since apartments with with ID 38 and ID 55 seem to have cooks distance values relatively large compared with the rest of the data, they will be removed as outliers.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
aprdata <- aprdata %>%
filter(!(Price %in% c("2180", "1740"))) #Filter out influencing values
fit2 <- lm(Price ~ Age + Distance,
data = aprdata)
aprdata$StdFitted <- scale(fit2$fitted.values)
library(car)
scatterplot(y = aprdata$StdResid, x = aprdata$StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = TRUE,
smooth = FALSE,
col = "orange",
pch = 19)
Potential heteroskedasticity can be observed in the scatterplot. We can run Breusch-Pagan test to check.
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 3.775135
## Prob > Chi2 = 0.05201969
p value 0.052 is larger than 5%
We cannot reject H0. Variance is constant.
No heteroskedaticity.
hist(aprdata$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Distribution of standardized residuals",
col = "orange",
border = "blue")
shapiro.test(aprdata$StdResid) #Checking the distribution of standardized residuals.
##
## Shapiro-Wilk normality test
##
## data: aprdata$StdResid
## W = 0.94963, p-value = 0.002636
H0: Variables are normally distributed H1: Variables are not normally distributed
p-value=0.0026 , smaller than 5%
We reject H0 and accept H1.
Variables are not normally distributed.
However, since we have sample larger than 30 units, we can neglect this distribution.
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = aprdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -627.27 -212.96 -46.23 205.05 578.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2490.112 76.189 32.684 < 2e-16 ***
## Age -7.850 3.244 -2.420 0.0178 *
## Distance -23.945 2.826 -8.473 9.53e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.5 on 80 degrees of freedom
## Multiple R-squared: 0.4968, Adjusted R-squared: 0.4842
## F-statistic: 39.49 on 2 and 80 DF, p-value: 1.173e-12
Explanation of coefficient after excluding 2 units.
Both Age and Distance have statistically significant effect on price of the apartment per m2, which we can conclude from both p-values being below 0.05.
R-squared: 0.4968 - 49.68% of variability of price by m2 is explained by linear effect of Age and Distance.
fit3 <- lm(Price ~ Age + Distance + ParkingCat + BalconyCat,
data = aprdata)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingCat + BalconyCat
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 80 5982100
## 2 78 5458696 2 523404 3.7395 0.02813 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0 = fit2 is better H1 = fit3 is better
p-value=0.03 which is lower than 5%.
We reject H0 and accept H1.
This means that the Model 2 - fit3 fits data better than fit2. It explains more data than fit2.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingCat + BalconyCat,
## data = aprdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -499.06 -194.33 -32.04 219.03 544.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2358.900 93.664 25.185 < 2e-16 ***
## Age -7.197 3.148 -2.286 0.02499 *
## Distance -21.241 2.911 -7.296 2.14e-10 ***
## ParkingCatYes 168.921 62.166 2.717 0.00811 **
## BalconyCatYes -6.985 58.745 -0.119 0.90566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264.5 on 78 degrees of freedom
## Multiple R-squared: 0.5408, Adjusted R-squared: 0.5173
## F-statistic: 22.97 on 4 and 78 DF, p-value: 1.449e-12
It can be observed that age, distance, and parking have statistically significant effect on price of the apartment per m2, which we can conclude from both p-values being below 0.05. However the balcony is not statistically significant with p-value more than 0.05 (P=0.91, so it will not be explained.
Explanation of b1=-7.197 If the age of the apartment increases for 1 year, the price of m2 on average is decreased by -7.197 eur (p=0.025), assuming that all other explanatory variables included in the model are constant.
Explanation of b2=-21.241 If the distance from city center increases for 1 km, the price per m2 decreases by 21.24 EUR on average (p < 0.001), assuming that all other explanatory variables included in the model are constant.
Explanation of b3= 168.921 Apartments with parking slot have on average by 168.9 EUR higher price compared to apartments without parking (p=0.008), assuming that all other explanatory variables included in the model are constant.
F-statistics: Test of significance of regression
H0: Ro2 = 0 H1: Ro2 > 0
F= 22.97, p<0.001
We reject the null hypothesis at p<0.001. We can say that the determination coefficient is more than 0 meaning that we found linear relationship between the price per m2 of an apartment and the explanatory variables included in the model.
aprdata$Fitted <- fitted.values(fit3)
aprdata$Residuals <- residuals(fit3)
head(aprdata[colnames(aprdata) %in% c("Price", "Fitted", "Residuals")], 3)
## Price Fitted Residuals
## 1 1640 1706.795 -66.79523
## 2 2800 2377.043 422.95718
## 3 1660 1713.780 -53.78016
ID2 apartment has a fitted value of 2377.0423 compared to the actual value of 2800. Residual must be 422.957 (actual value - fitted value), as proven above.