mydata <- read.table("~/Program R/IMB Bootcamp/BostonHousing.csv",
header = TRUE,
sep = ",",
dec = ".")
options(width = 150)
head(mydata)
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 28.7
options(width = 100)
summary(mydata)
## CRIM ZN INDUS CHAS NOX
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710
##
## RM AGE DIS RAD TAX PTRATIO
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0 Median :19.05
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2 Mean :18.46
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00
##
## B LSTAT MEDV
## Min. : 0.32 Min. : 1.73 Min. : 5.00
## 1st Qu.:375.38 1st Qu.: 6.95 1st Qu.:17.00
## Median :391.44 Median :11.36 Median :21.20
## Mean :356.67 Mean :12.65 Mean :22.55
## 3rd Qu.:396.23 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :396.90 Max. :37.97 Max. :50.00
## NA's :5
This data set explains housing situation in Boston, it has 506 observations and 14 variables (13 numeric and 1 categorical)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydataR <- rename(mydata,
"Crime" = "CRIM",
"Residential land" = "ZN",
"Non-retail business" = "INDUS",
"River proximity" = "CHAS",
"Nitric oxide" = "NOX",
"Number of rooms" = "RM",
"Age" = "AGE",
"Distance to employement centres" = "DIS",
"Highway accesibility" = "RAD",
"Property tax" = "TAX",
"Child to teacher ratio" = "PTRATIO",
"Black population" = "B",
"Lower status population" = "LSTAT",
"Owner occupied homes" = "MEDV")
mydataR$RiverProximityF <- factor(mydataR$`River proximity`,
levels = c(0,1),
labels = c("not river bound", "river bound"))
library(tidyr)
mydataR_Clean <- drop_na(mydataR)
options(width = 100)
summary(mydataR_Clean)
## Crime Residential land Non-retail business River proximity Nitric oxide
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850
## 1st Qu.: 0.08199 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490
## Median : 0.25387 Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380
## Mean : 3.64417 Mean : 11.33 Mean :11.18 Mean :0.06986 Mean :0.5553
## 3rd Qu.: 3.69311 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710
## Number of rooms Age Distance to employement centres Highway accesibility
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.885 1st Qu.: 45.10 1st Qu.: 2.088 1st Qu.: 4.000
## Median :6.209 Median : 77.70 Median : 3.152 Median : 5.000
## Mean :6.287 Mean : 68.66 Mean : 3.785 Mean : 9.609
## 3rd Qu.:6.629 3rd Qu.: 94.10 3rd Qu.: 5.118 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## Property tax Child to teacher ratio Black population Lower status population
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.30 1st Qu.:375.33 1st Qu.: 6.93
## Median :330.0 Median :19.00 Median :391.45 Median :11.38
## Mean :409.5 Mean :18.44 Mean :356.39 Mean :12.67
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.96
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## Owner occupied homes RiverProximityF
## Min. : 5.00 not river bound:466
## 1st Qu.:17.00 river bound : 35
## Median :21.20
## Mean :22.55
## 3rd Qu.:25.00
## Max. :50.00
mydataR_Room <- mydataR_Clean %>% filter(`Number of rooms`>5 & `Number of rooms`<7)
mydataR_without <- mydataR_Clean[,-c(12,13)]
mydataR_new <- mydataR_Clean[-c(10,11,12,13,14,15,16,17,18,19,20),-7]
summary(mydataR_Clean)
## Crime Residential land Non-retail business River proximity Nitric oxide
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850
## 1st Qu.: 0.08199 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490
## Median : 0.25387 Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380
## Mean : 3.64417 Mean : 11.33 Mean :11.18 Mean :0.06986 Mean :0.5553
## 3rd Qu.: 3.69311 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710
## Number of rooms Age Distance to employement centres Highway accesibility
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.885 1st Qu.: 45.10 1st Qu.: 2.088 1st Qu.: 4.000
## Median :6.209 Median : 77.70 Median : 3.152 Median : 5.000
## Mean :6.287 Mean : 68.66 Mean : 3.785 Mean : 9.609
## 3rd Qu.:6.629 3rd Qu.: 94.10 3rd Qu.: 5.118 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## Property tax Child to teacher ratio Black population Lower status population
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.30 1st Qu.:375.33 1st Qu.: 6.93
## Median :330.0 Median :19.00 Median :391.45 Median :11.38
## Mean :409.5 Mean :18.44 Mean :356.39 Mean :12.67
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.96
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## Owner occupied homes RiverProximityF
## Min. : 5.00 not river bound:466
## 1st Qu.:17.00 river bound : 35
## Median :21.20
## Mean :22.55
## 3rd Qu.:25.00
## Max. :50.00
mean(mydataR_Clean$`Owner occupied homes`)
## [1] 22.5521
mean(mydataR_Clean$`Property tax`)
## [1] 409.505
max(mydataR_Clean$`Number of rooms`)
## [1] 8.78
hist(mydataR_Clean$`Distance to employement centres`,
main = "Distance to employement centres in kilometres",
xlab = "Distance",
ylab = "Frequency",
col = "dodgerblue")
boxplot(mydataR_Clean$`Property tax`,
main = "Property tax in 10.000$",
col = "red")
library(ggplot2)
ggplot(mydataR_Clean,
aes(y=`Number of rooms`, x=Age)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
library(readxl)
MBAraw <- read_excel("~/Program R/IMB Bootcamp/R Take Home Exam 2024/Task 2/Business School.xlsx")
head(MBAraw)
## # A tibble: 6 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade` `Work Experience`
## <dbl> <chr> <dbl> <dbl> <chr>
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## # ℹ 4 more variables: `Employability (Before)` <dbl>, `Employability (After)` <dbl>, Status <chr>,
## # `Annual Salary` <dbl>
MBA <- as.data.frame(MBAraw)
head(MBA)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience Employability (Before)
## 1 1 Business 68.4 90.2 No 252
## 2 2 Computer Science 70.2 68.7 Yes 101
## 3 3 Finance 76.4 83.3 No 401
## 4 4 Business 82.6 88.7 No 287
## 5 5 Finance 76.9 75.4 No 275
## 6 6 Computer Science 83.3 82.1 No 254
## Employability (After) Status Annual Salary
## 1 276 Placed 111000
## 2 119 Placed 107000
## 3 462 Placed 109000
## 4 342 Placed 148000
## 5 347 Placed 255500
## 6 313 Placed 103500
MBA$`Undergrad Degree` <- factor(MBA$`Undergrad Degree`,
levels = c("Art", "Business", "Computer Science", "Engineering", "Finance"),
labels = c("Art", "Business", "Computer Science", "Engineering", "Finance"))
ggplot(MBA, aes(x = `Undergrad Degree`)) +
geom_bar(fill = "green") +
labs(title = "Undergrad degree",
x = "Undergrad degree",
y = "Frequency")
summary(MBA)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## Min. : 1.00 Art : 6 Min. : 61.20 Min. :58.14 Length:100
## 1st Qu.: 25.75 Business :35 1st Qu.: 71.47 1st Qu.:71.14 Class :character
## Median : 50.50 Computer Science:25 Median : 76.65 Median :76.38 Mode :character
## Mean : 50.50 Engineering : 9 Mean : 76.90 Mean :76.04
## 3rd Qu.: 75.25 Finance :25 3rd Qu.: 81.70 3rd Qu.:82.15
## Max. :100.00 Max. :100.00 Max. :95.00
## Employability (Before) Employability (After) Status Annual Salary
## Min. :101.0 Min. :119.0 Length:100 Min. : 20000
## 1st Qu.:245.8 1st Qu.:312.0 Class :character 1st Qu.: 87125
## Median :256.8 Median :435.6 Mode :character Median :103500
## Mean :257.9 Mean :422.7 Mean :109058
## 3rd Qu.:261.0 3rd Qu.:529.0 3rd Qu.:124000
## Max. :421.0 Max. :631.0 Max. :340000
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(MBA$`Annual Salary`)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 109058 41501.49 103500 104600.2 25945.5 20000 340000 320000 2.22 9.41 4150.15
ggplot(MBA,
aes(x = `Annual Salary`)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
scale_x_continuous(labels = scales ::comma) +
labs(title = "Annual salary distribution",
x = "Annual salary",
y = "Frequency")
shapiro.test(MBA$`Annual Salary`)
##
## Shapiro-Wilk normality test
##
## data: MBA$`Annual Salary`
## W = 0.81808, p-value = 9.253e-10
t.test(MBA$`MBA Grade`,
mu = 74,
alternative = "two.sided")
##
## One Sample t-test
##
## data: MBA$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(MBA$`MBA Grade`,
mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
interpret_cohens_d(0.27, rules = "sawilowsky2009")
## [1] "small"
## (Rules: sawilowsky2009)
library(readxl)
ApartmentsRaw <- read_excel("~/Program R/IMB Bootcamp/R Take Home Exam 2024/Task 3/Apartments.xlsx")
head(ApartmentsRaw)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Apartments <- as.data.frame(ApartmentsRaw)
head(Apartments)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
Apartments$ParkingF <- factor(Apartments$Parking,
levels = c(0,1),
labels = c("No", "Yes"))
Apartments$BalconyF <- factor(Apartments$Balcony,
levels = c(0,1),
labels = c("No", "Yes"))
t.test(Apartments$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: Apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
We reject H0 at p=0,005 and accept H1. Average price of the apartment is different than 1.900€.
fit1 <- lm(Price ~ Age,
data = Apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(Apartments$Price, Apartments$Age)
## [1] -0.230255
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplotMatrix(Apartments[,c(-4,-5,-6,-7)],
smooth = FALSE)
From the graphs we don’t see multicolinearity. Multicolinearity means that all observations need to be close to regression function and it must be steep, in our case all trendlines are horitzontal and we can’t draw clear line through the observations.
fit2 <- lm(Price ~ Age + Distance,
data = Apartments)
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
From vif statistics we see there is no problem with multicolinearity since vif of both age and distance is less than 5 and mean of both vif-s is very close to 1.
Apartments$StdResiduals <- round(rstandard(fit2), 3)
Apartments$CooksDistances <- round(cooks.distance(fit2), 3)
hist(Apartments$StdResiduals,
main = "Distribution of standardized residuals",
xlab = "Standardized residuals",
ylab = "Frequency")
hist(Apartments$CooksDistances,
main = "Distribution of Cooks distances",
xlab = "Cooks distances",
ylab = "Frequency")
head(Apartments[order(-Apartments$CooksDistances),])
## Age Distance Price Parking Balcony ParkingF BalconyF StdResiduals CooksDistances
## 38 5 45 2180 1 1 Yes Yes 2.577 0.320
## 55 43 37 1740 0 0 No No 1.445 0.104
## 33 2 11 2790 1 0 Yes No 2.051 0.069
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 22 37 3 2540 1 1 Yes Yes 1.576 0.061
## 39 40 2 2400 0 1 No Yes 1.091 0.038
Apartments <- Apartments[-38,]
We removed just unit 38 because it had highest Cooks distance (gap on histogram), we didn’t remove any units due to standardized residuals because all units were between -3 and 3.
fit2 <- lm(Price ~ Age + Distance,
data = Apartments)
Apartments$StdFittedValues <-scale(fit2$fitted.values)
ggplot(Apartments,
aes(x=StdFittedValues, y=StdResiduals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 2.927455
## Prob > Chi2 = 0.08708469
Based on the graph we didn’t see any heteroskedasticity, we also checked with Breusch Pagan test, where we cannot reject H0 at p=0,087, we concude that variance is constant and we have homoskedasticity which is not a problem.
ggplot(Apartments,
aes(x=StdResiduals)) +
geom_histogram(fill = "brown") +
labs(title = "Histogram of standardized residuals",
x = "Standardized residuals",
y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
shapiro.test(Apartments$StdResiduals)
##
## Shapiro-Wilk normality test
##
## data: Apartments$StdResiduals
## W = 0.94879, p-value = 0.002187
Our histogram looks all over the place and not really normally distributed so we conducted shapiro wilk test. Where H0 means that standardized resicuals are normally distributed and H1 means they are not. We reject H0 at p=0,002 and accept H1, we conduct that standardized residuals are not normally distributed which could potentialy be a problem. However we have in our sample more than 30 observartions so we assume t-distribution (CLT)
fit2 <- lm(Price ~ Age + Distance,
data = Apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = Apartments)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
Apartments$FittedValues <-fitted.values(fit3)
Apartments$Residuals <- residuals(fit3)
head(Apartments[colnames(Apartments) %in% c("FittedValues", "Residuals")])
## FittedValues Residuals
## 1 1705.952 -65.95206
## 2 2372.197 427.80292
## 3 1721.159 -61.15894
## 4 1563.431 286.56890
## 5 2012.244 -372.24396
## 6 1908.177 -138.17733
Residual for ID 2 is equal to 427,8 meaning that difference between true ID 2 and fitted value is equal to residual.