mydata <- airquality #Copy of the built-in airquality dataset
#First look at the data: some values are missing; we decide below whether to remove them.
head(mydata)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
Explanation of variables:
- Ozone: mean ozone level [ppb]
- Solar.R: solar radiation [lang]
- Wind: average wind speed [mph]
- Temp: maximum daily temperature [°F]
- Month: month of observation (5-9)
- Day: day of the month (1-31)
#Renaming the second column so it displays better in the data frame
colnames(mydata)[2]<-"SolarRadiation"
#Recoding the month number into a factor
mydata$MonthN <- factor(mydata$Month,
levels = c(5, 6, 7, 8, 9),
labels = c("May", "June", "July", "August", "September"))
library(pastecs)
round(stat.desc(mydata[,c(-5,-6,-7)]),2)
## Ozone SolarRadiation Wind Temp
## nbr.val 116.00 146.00 153.00 153.00
## nbr.null 0.00 0.00 0.00 0.00
## nbr.na 37.00 7.00 0.00 0.00
## min 1.00 7.00 1.70 56.00
## max 168.00 334.00 20.70 97.00
## range 167.00 327.00 19.00 41.00
## sum 4887.00 27146.00 1523.50 11916.00
## median 31.50 205.00 9.70 79.00
## mean 42.13 185.93 9.96 77.88
## SE.mean 3.06 7.45 0.28 0.77
## CI.mean.0.95 6.07 14.73 0.56 1.51
## var 1088.20 8110.52 12.41 89.59
## std.dev 32.99 90.06 3.52 9.47
## coef.var 0.78 0.48 0.35 0.12
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:pastecs':
##
## extract
mydata1 <- drop_na(mydata) #Removing units with missing values
library(pastecs)
round(stat.desc(mydata1[,c(-5,-6,-7)]),2)
## Ozone SolarRadiation Wind Temp
## nbr.val 111.00 111.00 111.00 111.00
## nbr.null 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00
## min 1.00 7.00 2.30 57.00
## max 168.00 334.00 20.70 97.00
## range 167.00 327.00 18.40 40.00
## sum 4673.00 20513.00 1103.30 8635.00
## median 31.00 207.00 9.70 79.00
## mean 42.10 184.80 9.94 77.79
## SE.mean 3.16 8.65 0.34 0.90
## CI.mean.0.95 6.26 17.15 0.67 1.79
## var 1107.29 8308.74 12.66 90.82
## std.dev 33.28 91.15 3.56 9.53
## coef.var 0.79 0.49 0.36 0.12
As we can see, the descriptive statistics barely change when the missing values are removed, so we keep them in order to retain a better representation of the months (for example, with the missing data removed, only 9 days of June remain).
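The June example can be verified by counting the complete cases per month (mydata1 already has the rows with missing values removed):
table(mydata1$MonthN) #June should show only 9 complete days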
From May to the end of September, the mean ozone level is 42.13 ppb, mean solar radiation 185.93 lang, mean wind speed 9.96 mph and mean temperature 77.88 °F.
The standard deviations show that solar radiation oscillates the most around its mean (90.06), partly because its values are on a much larger scale, while wind oscillates the least (3.52).
The range is largest for solar radiation (327 lang) and smallest for wind (19 mph).
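Because the standard deviation depends on the scale of a variable, the coefficient of variation (std.dev/mean, reported as coef.var above) is the fairer comparison of variability; it can be reproduced directly:
sapply(mydata[, c("Ozone", "SolarRadiation", "Wind", "Temp")],
       function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)) #should match coef.var above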
library(ggplot2)
library(gridExtra) # For arranging multiple plots
# Histogram for Ozone
p1 <- ggplot(mydata, aes(x = Ozone)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Ozone frequency", x = "Ozone Levels", y = "Frequency") +
  scale_x_continuous(breaks = seq(0, 170, 30)) +
  theme_minimal()
# Histogram for Solar Radiation
p2 <- ggplot(mydata, aes(x = SolarRadiation)) +
  geom_histogram(binwidth = 10, fill = "lightgreen", color = "black") +
  labs(title = "Solar Radiation frequency", x = "Solar Radiation", y = "Frequency") +
  scale_x_continuous(breaks = seq(0, 300, 50)) +
  theme_minimal()
# Histogram for Wind speed
p3 <- ggplot(mydata, aes(x = Wind)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(title = "Wind speed frequency", x = "Wind Speed", y = "Frequency") +
  theme_minimal()
# Histogram for Temperature
p4 <- ggplot(mydata, aes(x = Temp)) +
  geom_histogram(binwidth = 2, fill = "lightcoral", color = "black") +
  labs(title = "Temperature frequency", x = "Temperature", y = "Frequency") +
  theme_minimal()
# Arrange all histograms in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
From the set of histograms we can identify the most frequent value ranges of each variable throughout these five months.
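The modal bin can also be read off numerically; a sketch using the same 10-unit bins as the Ozone histogram (cut() assigns NA to missing values and table() drops them):
OzoneBins <- table(cut(mydata$Ozone, breaks = seq(0, 170, by = 10)))
names(OzoneBins)[which.max(OzoneBins)] #label of the most frequent Ozone bin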
library(ggplot2)
library(tidyr)
# Create scatter plots for each variable
p1 <- ggplot(mydata, aes(x = Day, y = Ozone, color = MonthN)) +
geom_point(size = 1) +
labs(title = "Ozone Levels (Month)", x = "Day of the month", y = "Ozone Levels [ppb]") +
theme_minimal()
p2 <- ggplot(mydata, aes(x = Day, y = SolarRadiation, color = MonthN)) +
geom_point(size = 1) +
labs(title = "Solar Radiation (Month)", x = "Day of the month", y = "Solar Radiation [lang]") +
theme_minimal()
p3 <- ggplot(mydata, aes(x = Day, y = Wind, color = MonthN)) +
geom_point(size = 1) +
labs(title = "Wind speed (Month)", x = "Day of the month", y = "Wind speed [mph]") +
theme_minimal()
p4 <- ggplot(mydata, aes(x = Day, y = Temp, color = MonthN)) +
geom_point(size = 1) +
labs(title = "Temperature (Month)", x = "Day of the month", y = "Temperature [°F]") +
theme_minimal()
# Arrange all scatter plots in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
From the graphs above we can see how each variable evolves over the days of each month and how the months compare to one another.
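A numeric complement to the scatter plots are the monthly averages; a minimal sketch (the formula interface of aggregate() drops rows with missing values):
aggregate(cbind(Ozone, SolarRadiation, Wind, Temp) ~ MonthN,
          data = mydata, FUN = mean) #average of each variable per month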
#Importing the dataset Business School.xlsx
library(readxl)
mydata <- read_xlsx("./Business School.xlsx")
mydata <- as.data.frame(mydata)
#head(mydata)
colnames(mydata) <- gsub(" ", "", colnames(mydata)) #remove spaces from column names for easier referencing
head(mydata)
## StudentID UndergradDegree UndergradGrade MBAGrade WorkExperience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability(Before) Employability(After) Status AnnualSalary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
library(ggplot2)
ggplot(mydata, aes(x = UndergradDegree)) +
geom_bar(colour = "black") +
ylab("Frequency")+
xlab("Undergrad Degree")+
theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Frequency distribution of Undergrad Degree")
The most common undergraduate degree is Business.
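The claim can be backed numerically by sorting the frequency table (output not shown):
sort(table(mydata$UndergradDegree), decreasing = TRUE) #most common degree first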
library(pastecs)
round(stat.desc(mydata$AnnualSalary))
## nbr.val nbr.null nbr.na min max range
## 100 0 0 20000 340000 320000
## sum median mean SE.mean CI.mean.0.95 var
## 10905800 103500 109058 4150 8235 1722373475
## std.dev coef.var
## 41501 0
library(ggplot2)
ggplot(mydata, aes(x = AnnualSalary)) +
  geom_histogram(colour = "black") +
  scale_x_continuous(labels = scales::label_number(),
                     breaks = seq(0, 400000, 50000)) + #one scale with both labels and breaks avoids the "scale replaced" warning
  ylab("Frequency") +
  xlab("Annual salary") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Frequency distribution of annual salary")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution is skewed to the right (positive skewness coefficient). The most frequent annual salary is around 100,000 € per year.
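The positive skewness can be verified numerically: stat.desc() also reports skewness and kurtosis when called with norm = TRUE (a sketch, output not shown):
round(stat.desc(mydata$AnnualSalary, norm = TRUE), 2) #skewness should be positive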
t.test(mydata$UndergradGrade,
mu = 74,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$UndergradGrade
## t = 3.8851, df = 99, p-value = 0.0001849
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 75.41841 78.37959
## sample estimates:
## mean of x
## 76.899
library(effectsize)
effectsize::cohens_d(mydata$UndergradGrade,mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.39 | [0.18, 0.59]
##
## - Deviation from a difference of 74.
interpret_cohens_d(0.39,rules = "sawilowsky2009")
## [1] "small"
## (Rules: sawilowsky2009)
We can conclude that we must reject the null hypothesis, since p < 0.001, which is smaller than 0.05.
The average undergraduate grade is significantly higher than 74; the size of the increase is "small" according to Sawilowsky's (2009) rules.
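For a one-sample test, Cohen's d is the mean difference divided by the sample standard deviation, so the reported effect size can be reproduced by hand:
(mean(mydata$UndergradGrade) - 74) / sd(mydata$UndergradGrade) #should give about 0.39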
library(readxl)
mydata <- read_xlsx("./Apartments.xlsx")
mydata <- as.data.frame(mydata)
head(mydata)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description of variables:
- Age: age of the apartment [years]
- Distance: distance from the city center [km]
- Price: price of the apartment [€ per m²]
- Parking: parking availability (0 = no, 1 = yes)
- Balcony: balcony (0 = no, 1 = yes)
mydata$ParkingN <- factor(mydata$Parking,
levels = c (0, 1),
labels = c ("No", "Yes"))
mydata$BalconyN <- factor(mydata$Balcony,
levels = c (0, 1),
labels = c ("No", "Yes"))
summary(mydata[c(6,7)])
## ParkingN BalconyN
## No :42 No :48
## Yes:43 Yes:37
t.test(mydata$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
library(effectsize)
effectsize::cohens_d(mydata$Price,mu = 1900)
## Cohen's d | 95% CI
## ------------------------
## 0.31 | [0.10, 0.53]
##
## - Deviation from a difference of 1900.
interpret_cohens_d(0.31,rules = "sawilowsky2009")
## [1] "small"
## (Rules: sawilowsky2009)
We can conclude that we must reject the null hypothesis, since p = 0.0047, which is smaller than 0.05.
The average price is significantly higher than 1,900 € per m²; the size of the difference is "small".
library(car)
## Loading required package: carData
scatterplot(x=mydata$Age,
y=mydata$Price,
xlab="Age of the apartment",
ylab = "Price of the apartment",
smooth = FALSE,
boxplot = FALSE)
fit1 <- lm(Price ~ Age,
data = mydata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(mydata$Price, mydata$Age)
## [1] -0.230255
Price_estimated = 2185.455 - 8.975 * Age
If age goes up by one year, the price of the apartment decreases by 8.975 € per m² on average (p = 0.034).
The linear relationship between age and price of the apartment is negative and weak (|r| = 0.23).
5.3% of the variability in price is explained by the linear effect of the age of the apartment.
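In simple regression the coefficient of determination is just the squared correlation, which we can confirm:
cor(mydata$Price, mydata$Age)^2 #should equal the Multiple R-squared of 0.053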
library(car)
scatterplotMatrix(mydata[ , c(1,2,3)], #Scatterplot matrix
smooth = FALSE)
Based on the dispersion of the values in the scatterplot matrix we can assume that there are no problems with multicollinearity.
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(mydata[ , c(1,2,3)])) #Correlation matrix
## Age Distance Price
## Age 1.00 0.04 -0.23
## Distance 0.04 1.00 -0.63
## Price -0.23 -0.63 1.00
##
## n= 85
##
##
## P
## Age Distance Price
## Age 0.6966 0.0340
## Distance 0.6966 0.0000
## Price 0.0340 0.0000
fit2 <- lm(Price ~ Age + Distance, #Dependent and explanatory variables
data = mydata) #Name of the data frame
fit4 <- lm(Price ~ Age + Distance, #for later
data = mydata)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
vif(fit2) #Multicolinearity
## Age Distance
## 1.001845 1.001845
Since the VIF values of both explanatory variables are close to 1, there is no strong multicollinearity between them (they barely influence each other).
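The VIF of a predictor equals 1/(1 - R²) of the auxiliary regression of that predictor on the remaining ones, so with two predictors it can be reproduced by hand:
1 / (1 - summary(lm(Age ~ Distance, data = mydata))$r.squared) #should give about 1.0018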
mydata$StdResid <- round(rstandard(fit2), 3) #Standardized residuals
mydata$CooksD <- round(cooks.distance(fit2), 3) #Cook's distances
hist(mydata$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
hist(mydata$CooksD,
xlab = "Cook's distance",
ylab = "Frequency",
main = "Histogram of Cook's distances")
head(mydata[order(-mydata$CooksD),c(-4,-5,-6,-7)], 6) #Six units with the highest Cook's distance
## Age Distance Price StdResid CooksD
## 38 5 45 2180 2.577 0.320
## 55 43 37 1740 1.445 0.104
## 33 2 11 2790 2.051 0.069
## 53 7 2 1760 -2.152 0.066
## 22 37 3 2540 1.576 0.061
## 39 40 2 2400 1.091 0.038
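A common rule of thumb (not the criterion used below, where the units are picked from the histogram and the table above) flags units whose Cook's distance exceeds 4/n — a sketch:
mydata[mydata$CooksD > 4 / nrow(mydata), c("Age", "Distance", "Price", "CooksD")] #candidate influential units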
mydata$ID <- 1:nrow(mydata) #added ID variable
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata <- mydata %>%
filter(!ID %in% c(33, 38, 53, 55)) #Removing the units with the largest influence
fit2 <- lm(Price ~ Age + Distance,
data = mydata)
#repeating the calculations of Residuals and Cooks Distances
mydata$StdResid <- round(rstandard(fit2), 3)
mydata$CooksD <- round(cooks.distance(fit2), 3)
#repeating the graphical interpretation
hist(mydata$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
hist(mydata$CooksD,
xlab = "Cook's distance",
ylab = "Frequency",
main = "Histogram of Cook's distances")
mydata$StdFitted <- scale(fit2$fitted.values)
library(car)
scatterplot(y = mydata$StdResid, x = mydata$StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 1.64762
## Prob > Chi2 = 0.1992832
The p-value of the Breusch-Pagan test (p = 0.199, which is greater than 0.05) indicates that we cannot reject the null hypothesis, therefore we can assume homoskedasticity.
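The same test is also available as bptest() in the lmtest package; it studentizes by default, so the statistic can differ slightly from the olsrr version (a sketch, output not shown):
library(lmtest)
bptest(fit2)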
shapiro.test(mydata$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata$StdResid
## W = 0.94297, p-value = 0.001286
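The Shapiro-Wilk test rejects normality of the standardized residuals (p = 0.0013 < 0.05); a QQ-plot helps to see where the deviation occurs (a minimal sketch):
qqnorm(mydata$StdResid, main = "Normal Q-Q plot of standardized residuals")
qqline(mydata$StdResid)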
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -408.72 -217.93 -38.86 220.60 506.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2492.715 75.544 32.997 < 2e-16 ***
## Age -7.496 3.169 -2.365 0.0205 *
## Distance -24.574 2.700 -9.100 6.9e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 259.4 on 78 degrees of freedom
## Multiple R-squared: 0.5324, Adjusted R-squared: 0.5204
## F-statistic: 44.4 on 2 and 78 DF, p-value: 1.335e-13
If the age of the apartment increases by 1 year, the price of the apartment decreases by 7.496 € per m² on average (p = 0.021), assuming the other explanatory variable remains unchanged.
If the distance of the apartment from the city center increases by 1 km, the price of the apartment decreases by 24.574 € per m² on average (p < 0.001), assuming the other explanatory variable remains unchanged.
53.24% of the variability of the price per m² is explained by the linear effect of age and distance of the apartment.
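Confidence intervals for the coefficients can be obtained with confint() (output not shown):
round(confint(fit2), 3) #95% confidence intervals for the intercept and slopes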
fit3 <- lm(Price ~ Age + Distance + ParkingN + BalconyN,
data = mydata)
anova(fit2, fit3) #Comparison of two regression models that differ in the number of expl. variables
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingN + BalconyN
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 78 5248925
## 2 76 4915944 2 332981 2.5739 0.08287 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value of the partial F-test (p = 0.083, which is greater than 0.05) shows that adding ParkingN and BalconyN does not significantly improve the model, so fit2 and fit3 explain the price equally well.
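The partial F-statistic can be reproduced from the table above as the drop in RSS per added parameter, divided by the residual mean square of the larger model:
((5248925 - 4915944) / 2) / (4915944 / 76) #should give about 2.574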
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingN + BalconyN, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -403.16 -204.08 -41.24 239.52 528.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2372.693 93.344 25.419 < 2e-16 ***
## Age -6.906 3.118 -2.215 0.0298 *
## Distance -22.254 2.839 -7.837 2.25e-11 ***
## ParkingNYes 137.479 60.854 2.259 0.0267 *
## BalconyNYes 17.234 57.099 0.302 0.7636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 254.3 on 76 degrees of freedom
## Multiple R-squared: 0.5621, Adjusted R-squared: 0.539
## F-statistic: 24.38 on 4 and 76 DF, p-value: 5.29e-13
Given the age, distance and availability of a balcony, apartments with parking are on average 137.479 € per m² more expensive than apartments without parking (p = 0.027).
Balcony: due to the high p-value (p = 0.764), the variable is not statistically significant.
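Since BalconyN is not significant, a more parsimonious model could drop it; update() makes this convenient (a sketch; fit5 is a hypothetical name and the refit is not shown here):
fit5 <- update(fit3, . ~ . - BalconyN) #hypothetical model without BalconyN
summary(fit5)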
Hypotheses of the overall F-test:
H0: ρ² = 0
H1: ρ² > 0
The F-statistic (F = 24.38, p < 0.001) shows that we can reject the null hypothesis: the model explains at least some of the variability of the price.
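The overall F-statistic can be reproduced from R² as (R²/k) / ((1 - R²)/(n - k - 1)), with k = 4 explanatory variables and n - k - 1 = 76 residual degrees of freedom:
(0.5621 / 4) / ((1 - 0.5621) / 76) #should give about 24.4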
mydata$Fitted <- fitted.values(fit3)
mydata$Residuals <- residuals(fit3)
head(mydata[ ,colnames(mydata)%in% c("Price", "Fitted", "Residuals")],3)
## Price Fitted Residuals
## 1 1640 1718.475 -78.47475
## 2 2800 2363.611 436.38945
## 3 1660 1701.241 -41.24101
Based on the estimated regression function, we would expect the price of apartment ID 2 to be 2363.611 € per m², while its actual price is 2800 € per m². The residual is the difference between these two values (436.389).
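The fitted value of apartment ID 2 can also be reproduced from the coefficients of fit3 (Age = 18, Distance = 1, Parking = Yes, Balcony = No):
sum(coef(fit3) * c(1, 18, 1, 1, 0)) #should give about 2363.611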