#data(package = .packages(all.available = TRUE))
mydata <- Angell %>%
mutate(City = rownames(Angell)) %>%
`rownames<-`( NULL )
str(mydata)
## 'data.frame': 43 obs. of 5 variables:
## $ moral : num 19 17 16.4 16.2 15.8 15.3 15.2 14.3 14.2 14.1 ...
## $ hetero : num 20.6 15.6 22.1 14 17.4 27.9 22.3 23.7 10.6 12.7 ...
## $ mobility: num 15 20.2 13.6 14.8 17.6 17.5 14.7 23.8 19.4 31.9 ...
## $ region : Factor w/ 4 levels "E","MW","S","W": 1 1 1 1 2 1 1 2 1 2 ...
## $ City : chr "Rochester" "Syracuse" "Worcester" "Erie" ...
mydata <- mydata %>%
mutate(Region=factor(mydata$region,
levels = c("E","MW","S", "W"),
labels = c("Northeast","Midwest" , "Southeast", "West")),
Moral_Integration = moral,
Ethnic_Heterogeneity = hetero,
Geographic_Mobility = mobility) %>%
select(-c(moral,hetero,mobility, region))
#kable(
head(mydata[order(-mydata$Moral_Integration),])
## City Region Moral_Integration Ethnic_Heterogeneity Geographic_Mobility
## 1 Rochester Northeast 19.0 20.6 15.0
## 2 Syracuse Northeast 17.0 15.6 20.2
## 3 Worcester Northeast 16.4 22.1 13.6
## 4 Erie Northeast 16.2 14.0 14.8
## 5 Milwaukee Midwest 15.8 17.4 17.6
## 6 Bridgeport Northeast 15.3 27.9 17.5
#, caption = "The first six rows of mydata table")
New data frames based on conditions
mydata2 <- mydata %>% filter(Region == "Midwest")
Moral integration:
– minimum is 4.20
– maximum is 19.00
– range is 14.80
– median is 11.10
– mean is 12.20
Ethnic Heterogeneity:
– minimum is 10.60
– maximum is 84.50
– range is 73.90
– median is 23.70
– mean is 31.37
Geographic Mobility:
– minimum is 12.10
– maximum is 49.80
– range is 37.70
– median is 25.90
– mean is 27.60
Ethnic Heterogeneity has the highest coefficient of variation, meaning that the dispersion of data points in a data series around the mean is the largest.
#describe(mydata[,-c(1,2)])
round(stat.desc(mydata[ , -c(1,2)]), 2)
## Moral_Integration Ethnic_Heterogeneity Geographic_Mobility
## nbr.val 43.00 43.00 43.00
## nbr.null 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00
## min 4.20 10.60 12.10
## max 19.00 84.50 49.80
## range 14.80 73.90 37.70
## sum 481.60 1349.00 1186.70
## median 11.10 23.70 25.90
## mean 11.20 31.37 27.60
## SE.mean 0.54 3.11 1.49
## CI.mean.0.95 1.10 6.28 3.01
## var 12.76 416.63 95.82
## std.dev 3.57 20.41 9.79
## coef.var 0.32 0.65 0.35
#summary(mydata[,-c(1,2)])
#sapply(mydata[ , -c(1,2)], FUN = mean)
In the Midwest cities on average 26.06 percentages of residents are moving into and out of the city. The median is 24 percentages and the standard deviation is 7.11 percentages.
#round(stat.desc(mydata2[ , -c(1,2)]), 2)
round(mean(mydata2[ , 5]), 2)
## [1] 26.06
round(median(mydata2[ , 5]), 2)
## [1] 24
round(sd(mydata2[ , 5]), 2)
## [1] 7.11
mydata2 %>% summary()
## City Region Moral_Integration Ethnic_Heterogeneity Geographic_Mobility
## Length:14 Northeast: 0 Min. : 8.00 Min. :10.70 Min. :17.60
## Class :character Midwest :14 1st Qu.:11.15 1st Qu.:16.12 1st Qu.:21.73
## Mode :character Southeast: 0 Median :12.75 Median :19.25 Median :24.00
## West : 0 Mean :12.28 Mean :21.68 Mean :26.06
## 3rd Qu.:13.95 3rd Qu.:26.48 3rd Qu.:30.77
## Max. :15.80 Max. :39.70 Max. :42.70
#describeBy(mydata2$Ethnic_Heterogeneity, mydata2$Region)
In the Southeast cities where ethnic heterogeneity is above 25%, the mean of moral integration is 13.44. In the cities where ethnic heterogeneity is under 25%, the mean of moral integration is 16.34
mydata %>% filter(Region == "Northeast", Ethnic_Heterogeneity >= 25 ) %>%
select("Moral_Integration") %>% pull() %>% mean()
## [1] 13.43333
mydata %>% filter(Region == "Northeast", Ethnic_Heterogeneity <= 25 ) %>%
select("Moral_Integration") %>% pull() %>% mean()
## [1] 16.33333
More sample statistics
Statistics <- summarySE(mydata,
measurevar="Geographic_Mobility",
groupvars=c("Region"),
conf.interval=0.95)
Statistics
## Region N Geographic_Mobility sd se ci
## 1 Northeast 9 15.90000 2.657536 0.8858455 2.042763
## 2 Midwest 14 26.05714 7.106304 1.8992397 4.103058
## 3 Southeast 14 32.45714 8.430596 2.2531715 4.867681
## 4 West 6 37.40000 6.567496 2.6811689 6.892164
hist(mydata$Ethnic_Heterogeneity,
main = "Distribution of ethnic heterogeneity in the US",
xlab = "Percentages",
ylab = "Frequency",
breaks = seq(from = 1, to = 100, by = 2))
As expected mean of moral integration in the Southeast cities is the lowest and in the Northeast cities is the highest.
ggplot(mydata, aes(x=Region, y= Moral_Integration))+
geom_boxplot()+
theme_bw()
Ethnic heterogeneity is by far the highest in the Southeast cities. Correlation between ethnic heterogeneity and moral integration can be seen from these 2 box plots.
ggplot(mydata, aes(x=Region, y= Ethnic_Heterogeneity))+
geom_boxplot()+
theme_bw()
ggplot(mydata, aes(x=Region, y= Geographic_Mobility))+
geom_boxplot()+
theme_bw()
As seen in the first box plot highest moral integration is in Northeast and lowest in the Southeast.
ggplot(mydata, aes(x=Moral_Integration)) +
geom_histogram(binwidth = 5, colour="gray") +
facet_wrap(~Region, ncol = 1) +
ylab("Frequency")+
theme_bw()
Graph in the first column and second row shows negative correlation between moral integration and ethnic heterogeneity.
scatterplotMatrix(mydata[ , c(-1, -2)],
smooth = FALSE)
rcorr(as.matrix(mydata[ , c(-1, -2)]))
## Moral_Integration Ethnic_Heterogeneity Geographic_Mobility
## Moral_Integration 1.00 -0.59 -0.49
## Ethnic_Heterogeneity -0.59 1.00 -0.06
## Geographic_Mobility -0.49 -0.06 1.00
##
## n= 43
##
##
## P
## Moral_Integration Ethnic_Heterogeneity Geographic_Mobility
## Moral_Integration 0.0000 0.0008
## Ethnic_Heterogeneity 0.0000 0.6898
## Geographic_Mobility 0.0008 0.6898
mydata3 <- read.table("/Users/zanmikola/Documents/IMB/Bootcmap R/R Take Home Exam/Task 2/Body mass.csv",
header=TRUE,
sep=";",
dec=",")
head(mydata3)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
Description:
mean(mydata3$Mass)
## [1] 62.876
sd(mydata3$Mass)
## [1] 6.011403
hist(mydata3$Mass,
main = "Distribution of Mass",
xlab = "Kilograms",
ylab = "Frequency",
breaks = seq(from = 40, to = 90, by = 1))
library(ggplot2)
ggplot(NULL, aes(c(-4, 4))) +
geom_line(stat = "function", fun = dt, args = list (df = 49)) +
ylab("Density") +
xlab("Sample estimates") +
labs(title="Distribution of sample estimates")
qt(p = 0.025, df = 49, lower.tail = FALSE)
## [1] 2.009575
qt(p = 0.025, df = 49, lower.tail = TRUE)
## [1] -2.009575
Statistics <- summarySE(mydata3,
measurevar="Mass",
conf.interval=0.95)
Statistics$se
## [1] 0.8501407
t <- (mean(mydata3$Mass)- 59.5) / Statistics$se
t is higher than 2.009575, so we reject H_0.
OR
We reject H_0 at p = 0.000234. True mean in not equal to 59.5.
t.test(mydata3$Mass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata3$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
r = 0.4934302, so the effect is medium.
r <- sqrt((t^2)/(t^2 + 49))
#install.packages("readxl")
library(readxl)
mydata4 <- data.frame(read_xlsx("/Users/zanmikola/Documents/IMB/Bootcmap R/R Take Home Exam/Task 3/Apartments.xlsx"))
head(mydata4)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
mydata4 <- mydata4 %>%
mutate(Parking_factor=factor(mydata4$Parking,
levels = c(0,1),
labels = c("No","Yes")),
Balcony_factor=factor(mydata4$Balcony,
levels = c(0,1),
labels = c("No","Yes")))
mydata5<- mydata4
We reject H_0 at p = 0.5% and conclude that true price mean in not equal to 1900.
t.test(mydata4$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata4$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(formula = Price ~ Age, data=mydata4 )
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# scatterplot(y = mydata4$Price,
# x = mydata4$Age,
# ylab = "Price",
# xlab = "Age",
# smooth = FALSE)
cor(mydata4$Price, mydata4$Age)
## [1] -0.230255
Based on the matrix there is not problem with multicolinearity. Age and distance are not correlated (line is hotizontal)
scatterplotMatrix(mydata4[ , c(3,1,2)],
smooth = FALSE)
fit2 <- lm(formula = Price ~ Age + Distance, data = mydata4)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
There is no multicolinearity because VIF is lower than 5 for all the variables.
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
mydata4$StdResid <- round(rstandard(fit2), 3)
mydata4$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata4$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydata4$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata4$StdResid
## W = 0.95303, p-value = 0.003645
#Much less than 5%, it is problem
hist(mydata4$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
head(mydata4[order(mydata4$StdResid),], 3)
## Age Distance Price Parking Balcony Parking_factor Balcony_factor StdResid CooksD
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 13 12 14 1650 0 1 No Yes -1.499 0.013
## 72 12 14 1650 0 0 No No -1.499 0.013
head(mydata4[order(-mydata4$CooksD),], 6)
## Age Distance Price Parking Balcony Parking_factor Balcony_factor StdResid CooksD
## 38 5 45 2180 1 1 Yes Yes 2.577 0.320
## 55 43 37 1740 0 0 No No 1.445 0.104
## 33 2 11 2790 1 0 Yes No 2.051 0.069
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 22 37 3 2540 1 1 Yes Yes 1.576 0.061
## 39 40 2 2400 0 1 No Yes 1.091 0.038
mydata4 <- mydata4[c(-53),]
#Unit less, so we must repeat lm
fit2 <- lm(formula = Price ~ Age + Distance, data = mydata4)
mydata4$StdResid <- round(rstandard(fit2), 3)
mydata4$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata4$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydata4$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata4$StdResid
## W = 0.93901, p-value = 0.0006104
#Much less than 5%, it is problem
hist(mydata4$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
head(mydata4[order(mydata4$StdResid),], 3)
## Age Distance Price Parking Balcony Parking_factor Balcony_factor StdResid CooksD
## 13 12 14 1650 0 1 No Yes -1.583 0.015
## 72 12 14 1650 0 0 No No -1.583 0.015
## 20 13 8 1800 0 0 No No -1.473 0.014
head(mydata4[order(-mydata4$CooksD),], 6)
## Age Distance Price Parking Balcony Parking_factor Balcony_factor StdResid CooksD
## 38 5 45 2180 1 1 Yes Yes 2.642 0.336
## 55 43 37 1740 0 0 No No 1.594 0.129
## 33 2 11 2790 1 0 Yes No 2.011 0.069
## 22 37 3 2540 1 1 Yes Yes 1.618 0.064
## 39 40 2 2400 0 1 No Yes 1.129 0.040
## 31 45 21 1910 0 1 No Yes 0.988 0.038
We check the assumption with the help of a scatter plot between standardized residuals and (standardized) fitted values. Variance is not growing, so we expect that assumption of homoskedasticity is not violated.
mydata4$StdFittedValues <- scale(fit2$fitted.values)
scatterplot(y = mydata4$StdResid, x = mydata4$StdFittedValues,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplot = FALSE,
regLine = FALSE,
smooth = FALSE)
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 0.1801242
## Prob > Chi2 = 0.6712665
shapiro.test(mydata4$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata4$StdResid
## W = 0.93901, p-value = 0.0006104
#ols_plot_resid_hist(fit2)
hist(mydata4$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
fit2 <- lm(formula = Price ~ Age + Distance, data = mydata5)
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
fit3 <- lm(formula =Price ~ Age + Distance + Parking_factor + Balcony_factor,
data = mydata5)
Yes, fit3 is better than fit2 (p=0.05).
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking_factor + Balcony_factor
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Explanation of the coefficients:
b1 is estimated -6.799 With every additional year on average apartment’s price per m2 will decrease by 6.799 (p=0.05), if everything else remains unchanged
b2 is estimated -18.045 With every additional km from city centre on average apartment’s price per m2 will decrease by 18.045 (p=0.001), if everything else remains unchanged
Parking_factory: if apartment in city centre has parking the price is on average higher by 196.17 EUR than without it
Balcony_factory: it is not significant since p-value is 0.97
coefficient of correlation: r= 0.71, there is a strong positive correlation between price and age
coefficient of determination: R-squared = 50% of variability of price is explained by linear effect of age and distance of the apartment
Categorical variables:
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking_factor + Balcony_factor,
## data = mydata5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## Parking_factorYes 196.168 62.868 3.120 0.00251 **
## Balcony_factorYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
sqrt(0.5004)
## [1] 0.7073896
Price - Fitted values = 2800 - 2357.411 = 442.6
mydata5 <- mydata5 %>% mutate(Fittedvalues = fitted.values(fit3))
mydata5$Residuals <- residuals(fit3)
head(mydata5)
## Age Distance Price Parking Balcony Parking_factor Balcony_factor Fittedvalues Residuals
## 1 7 28 1640 0 1 No Yes 1750.741 -110.74150
## 2 18 1 2800 1 0 Yes No 2357.411 442.58893
## 3 7 28 1660 0 0 No No 1748.807 -88.80674
## 4 28 29 1850 0 1 No Yes 1589.921 260.07897
## 5 18 18 1640 1 1 Yes Yes 2052.576 -412.57579
## 6 28 12 1770 0 1 No Yes 1896.691 -126.69107