# Load the car sales data set (CSV file in the working directory).
mydata <- read.csv(file = "./car_sales_data.csv")
We have 50,000 observations (cars) and 7 variables.
Unit: 1 car
We have 7 variables, numerical and categorical:
# Keep four columns, selected by name rather than by position
# (positions 1, 2, 3 and 7 of the data frame).
mydata1.1 <- mydata[c("Manufacturer", "Model", "Engine.size", "Price")]
# Cars whose year of manufacture is later than 2015.
mydata_cars_after_2015 <- subset(mydata, subset = Year.of.manufacture > 2015)
summary(mydata_cars_after_2015[["Year.of.manufacture"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2016 2017 2018 2018 2019 2022
dim(mydata_cars_after_2015)[1]
## [1] 7811
# Store the fuel type as a factor with one-letter labels
# (Diesel -> D, Petrol -> P, Hybrid -> H) in a new column.
mydata[["Fuel.type.example"]] <- factor(
  x = mydata[["Fuel.type"]],
  levels = c("Diesel", "Petrol", "Hybrid"),
  labels = c("D", "P", "H")
)
str(mydata)
## 'data.frame': 50000 obs. of 8 variables:
## $ Manufacturer : chr "Ford" "Porsche" "Ford" "Toyota" ...
## $ Model : chr "Fiesta" "718 Cayman" "Mondeo" "RAV4" ...
## $ Engine.size : num 1 4 1.6 1.8 1 1.4 1.8 1.4 1.2 2 ...
## $ Fuel.type : chr "Petrol" "Petrol" "Diesel" "Hybrid" ...
## $ Year.of.manufacture: int 2002 2016 2014 1988 2006 2018 2010 2015 2012 1992 ...
## $ Mileage : int 127300 57850 39190 210814 127869 33603 86686 30663 73470 262514 ...
## $ Price : int 3074 49704 24072 1705 4101 29204 14350 30297 9977 1049 ...
## $ Fuel.type.example : Factor w/ 3 levels "D","P","H": 2 2 1 3 2 2 1 3 2 1 ...
# Copy of the data without seven specific rows (identified by position).
mydata_cars_removed <- mydata[
  !(seq_len(nrow(mydata)) %in% c(2, 7, 10, 100, 5000, 10000, 15555)),
]
library(psych)
# Descriptive statistics for three numeric variables.
describe(mydata[, c("Price", "Mileage", "Engine.size")])
## vars n mean sd median trimmed mad min
## Price 1 50000 13828.90 16416.68 7971.5 10735.19 8851.86 76
## Mileage 2 50000 112497.32 71632.52 100987.5 106408.91 75488.06 630
## Engine.size 3 50000 1.77 0.73 1.6 1.65 0.59 1
## max range skew kurtosis se
## Price 168081 168005 2.85 12.81 73.42
## Mileage 453537 452907 0.73 0.08 320.35
## Engine.size 5 4 2.08 5.32 0.00
Mean = it shows the central tendency of the data
Standard deviation (sd) = a measure of variability (how spread out the values are around the mean)
Median = half of the units have a value equal to or lower than the median
HISTOGRAM
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Histogram of engine sizes using bins 0.2 wide.
ggplot(data = mydata, mapping = aes(x = Engine.size)) +
  geom_histogram(binwidth = 0.2, colour = "black", fill = "lightcoral") +
  ggtitle("Distribution of Engine Sizes") +
  xlab("Engine Size (L)") +
  ylab("Frequency")
Explanation:
# Keep only cars with an engine size below 3 and redraw the histogram.
mydata_no_outliers <- subset(mydata, subset = Engine.size < 3)
library(ggplot2)
ggplot(data = mydata_no_outliers, mapping = aes(x = Engine.size)) +
  geom_histogram(binwidth = 0.2, colour = "black", fill = "lightcoral") +
  ggtitle("Distribution of Engine Sizes") +
  xlab("Engine Size (L)") +
  ylab("Frequency")
Explanation:
SCATTERPLOT
# Scatterplot of price against mileage; points are semi-transparent
# (alpha = 0.3).
ggplot(data = mydata, mapping = aes(x = Mileage, y = Price)) +
  geom_point(colour = "pink2", alpha = 0.3) +
  ggtitle("Relationship between Mileage and Price") +
  xlab("Mileage (km)") +
  ylab("Price (EUR)")
Explanation:
BAR CHART
# Bar chart with the number of cars per fuel type.
ggplot(data = mydata, mapping = aes(x = Fuel.type)) +
  geom_bar(fill = "darkblue") +
  ggtitle("Distribution of Fuel Types") +
  xlab("Fuel Type") +
  ylab("Frequency")
Explanation:
BOXPLOT
# Box plots of price for each engine size.
# Engine.size is numeric, so it is converted to a factor here: with a
# continuous x aesthetic geom_boxplot() draws one single box for all cars and
# emits the "Continuous x aesthetic" warning instead of one box per size.
ggplot(mydata_no_outliers, aes(x = factor(Engine.size), y = Price)) +
  geom_boxplot(fill = "lightyellow") +
  labs(
    title = "Car Prices by Engine Size",
    x = "Engine Size (L)",
    y = "Price (EUR)"
  )
Explanation:
# Keep only cars priced below 50000 and draw price box plots per engine size.
mydata_no_outliers2 <- subset(mydata, Price < 50000)
# Engine.size is numeric, so it is converted to a factor here: with a
# continuous x aesthetic geom_boxplot() draws one single box for all cars and
# emits the "Continuous x aesthetic" warning instead of one box per size.
ggplot(mydata_no_outliers2, aes(x = factor(Engine.size), y = Price)) +
  geom_boxplot(fill = "lightyellow") +
  labs(
    title = "Car Prices by Engine Size",
    x = "Engine Size (L)",
    y = "Price (EUR)"
  )
Explanation:
library(readxl)
# Load the business school data set (Excel file in the working directory).
mydata_Business_School <- read_excel("./Business School.xlsx")
library(ggplot2)
# Bar chart of undergraduate degrees with the count printed above each bar.
# after_stat(count) replaces the `..count..` notation, which was deprecated
# in ggplot2 3.4.0 and triggered a deprecation warning.
ggplot(mydata_Business_School, aes(x = `Undergrad Degree`)) +
  geom_bar(fill = "darkviolet") +
  ylab("Frequency") +
  theme_minimal() +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.3
  )
Explanation:
# Five-number summary and mean of the annual salary.
summary(mydata_Business_School$`Annual Salary`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
library(ggplot2)
# Histogram of annual salary (20 bins) with a density curve drawn on top.
# after_stat() and `linewidth` replace the `..count..` notation and the
# `size` argument for lines, both deprecated in ggplot2 3.4.0.
# NOTE(review): the density is scaled by a fixed factor of 1000; for an exact
# overlay on the counts the factor would need to equal the histogram bin
# width - confirm the intended scaling.
ggplot(mydata_Business_School, aes(x = `Annual Salary`)) +
  geom_histogram(bins = 20, fill = "pink", color = "black", alpha = 0.7) +
  geom_density(aes(y = after_stat(count * 1000)), color = "red2", linewidth = 1) +
  theme_minimal() +
  labs(
    title = "Distribution of Annual Salary",
    x = "Annual Salary",
    y = "Frequency"
  )
Explanation:
H₀: 𝜇MBA Grade = 74
# One-sample, two-sided t-test of the mean MBA grade against the value 74.
t.test(mydata_Business_School$`MBA Grade`, mu = 74)
##
## One Sample t-test
##
## data: mydata_Business_School$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
# Cohen's d for the deviation of the mean MBA grade from 74.
# NOTE(review): `na.rm` is presumably not a named argument of cohens_d() and
# is absorbed by `...` - confirm against the effectsize documentation.
cohens_d(mydata_Business_School$`MBA Grade`, mu = 74, na.rm = TRUE)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
Explanation:
We reject the null hypothesis because the p-value (p = 0.009) is smaller than 0.05. The average MBA grade this year is significantly different from 74; since the sample mean (76.04) is higher than 74, students performed better on average than in the previous year. According to Cohen’s guidelines (d ≈ 0.2 small, d ≈ 0.5 medium, d ≈ 0.8 large):
which means that 0.27 is a small effect size. The difference (between 76.04 and 74) is statistically significant, but the magnitude of the difference is small.
library(readxl)
# Load the apartments data set (Excel file in the working directory).
mydata_Apartments <- read_excel("./Apartments.xlsx")
Description:
# Turn the 0/1 indicator columns Parking and Balcony into No/Yes factors.
mydata_Apartments[c("Parking", "Balcony")] <- lapply(
  mydata_Apartments[c("Parking", "Balcony")],
  factor,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
# One-sample t-test of the mean price against the value 1900.
t.test(mydata_Apartments$Price, mu = 1900)
##
## One Sample t-test
##
## data: mydata_Apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Conclusion:
# Simple linear regression of price on age.
# The formula uses bare column names with `data =` (the same form as the
# later models in this file) instead of `mydata_Apartments$...` terms, so the
# coefficients get plain names and predict(newdata = ...) works.
fit1 <- lm(Price ~ Age, data = mydata_Apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata_Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# Pearson correlation between price and age.
cor(mydata_Apartments$Price, mydata_Apartments$Age)
## [1] -0.230255
Explanation:
Estimate of regression coefficient: On average, for each additional year of an apartment’s age the price decreases by about 8.98 units.
Coefficient of correlation: As age increases, prices tend to decrease, but the relationship is not very strong (weak negative linear relationship).
Coefficient of determination: Only 5.3 % of price variation is explained by Age, therefore, Age alone is not a strong predictor of price of the apartments.
library(GGally)
# Scatterplot matrix of price, age and distance: linear trend lines in the
# lower panels and density curves on the diagonal.
ggpairs(
  data = mydata_Apartments[c("Price", "Age", "Distance")],
  title = "Scatterplot Matrix with Trend Lines",
  diag = list(continuous = "densityDiag"),
  lower = list(continuous = wrap("smooth_lm", color = "darkblue"))
)
Explanation:
# Multiple linear regression of price on age and distance.
# The formula uses bare column names with `data =` (the same form as the
# later models in this file) instead of `mydata_Apartments$...` terms, so the
# coefficients get plain names and predict(newdata = ...) works.
fit2 <- lm(Price ~ Age + Distance, data = mydata_Apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata_Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Variance inflation factors of the two predictors (multicollinearity check).
vif(fit2)
## Age Distance
## 1.001845 1.001845
Explanation:
# Standardized residuals of fit2 (rounded to three decimals) and their
# histogram.
mydata_Apartments[["StdResid"]] <- round(rstandard(fit2), digits = 3)
hist(
  mydata_Apartments[["StdResid"]],
  main = "Histogram of standardized residuals",
  xlab = "Standardized residuals",
  ylab = "Frequency"
)
# Cook's distances of fit2 (rounded to three decimals) and their histogram.
mydata_Apartments[["CooksD"]] <- round(cooks.distance(fit2), digits = 3)
hist(
  mydata_Apartments[["CooksD"]],
  main = "Histogram of Cook's distances",
  xlab = "Cooks distance",
  ylab = "Frequency"
)
# Six observations with the largest Cook's distance.
head(mydata_Apartments[order(-mydata_Apartments[["CooksD"]]), ], n = 6)
## # A tibble: 6 × 7
## Age Distance Price Parking Balcony StdResid CooksD
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 Yes Yes 2.58 0.32
## 2 43 37 1740 No No 1.44 0.104
## 3 2 11 2790 Yes No 2.05 0.069
## 4 7 2 1760 No Yes -2.15 0.066
## 5 37 3 2540 Yes Yes 1.58 0.061
## 6 40 2 2400 No Yes 1.09 0.038
# Keep only the observations whose Cook's distance does not exceed the
# cut-off; the data frame is overwritten with the reduced set.
threshold <- 0.06
mydata_Apartments <- mydata_Apartments[
  mydata_Apartments[["CooksD"]] <= threshold,
]
Explanation: 0.06 is based on the visible gaps in the distribution
# Histogram of the Cook's distances that remain in the data frame.
hist(
  mydata_Apartments[["CooksD"]],
  main = "Histogram of Cook's distances",
  xlab = "Cook's distance",
  ylab = "Frequency"
)
Explanation:
# Refit the model on the current (filtered) data and store the standardized
# residuals and the standardized fitted values for diagnostics.
fit2 <- lm(Price ~ Age + Distance, data = mydata_Apartments)
mydata_Apartments$StdResid <- rstandard(fit2)
mydata_Apartments$StdFitted <- as.numeric(scale(fit2$fitted.values))
library(car)
# Standardized residuals against standardized fitted values, without marginal
# box plots, regression line or smoother.
# The argument is spelled `regLine`; the lower-case `regline` is not
# recognised by scatterplot(), is passed on as a graphical parameter (which
# produced the "not a graphical parameter" warnings) and does not switch the
# regression line off.
scatterplot(
  y = mydata_Apartments$StdResid, x = mydata_Apartments$StdFitted,
  ylab = "Standardized residuals",
  xlab = "Standardized fitted values",
  boxplots = FALSE, regLine = FALSE, smooth = FALSE
)
Explanation:
# Store the standardized residuals of fit2 rounded to three decimals, plot
# their histogram and test them for normality with the Shapiro-Wilk test.
# NOTE(review): the test runs on the rounded values - confirm that rounding
# before testing is intended.
mydata_Apartments$StdResid <- round(rstandard(fit2), 3)
hist(mydata_Apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydata_Apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata_Apartments$StdResid
## W = 0.94154, p-value = 0.001166
Explanation:
# Fit price on age and distance using the current contents of
# mydata_Apartments and print the model summary.
fit2 <- lm(Price ~ Age + Distance, data = mydata_Apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata_Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
Explanation:
# Extended model: adds Parking and Balcony to the predictors of fit2.
# anova() then compares the two nested models (fit2 vs. fit3).
fit3 <- lm(Price ~ Age + Distance + factor(Parking) + factor(Balcony),
data = mydata_Apartments)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + factor(Parking) + factor(Balcony)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
Explanation:
# Coefficients, R-squared and overall F-test of the extended model.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + factor(Parking) + factor(Balcony),
## data = mydata_Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## factor(Parking)Yes 128.700 60.801 2.117 0.0376 *
## factor(Balcony)Yes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
Explanation:
Regression coefficient:
H₀: None of the predictors (Age, Distance, Parking, Balcony) have any effect on the apartment price.
βAge = βDistance = βParking = βBalcony = 0
H₁: At least one of the predictors (Age, Distance, Parking, Balcony) has a significant effect on the apartment price.
At least one βi≠0
# Fitted values and residuals of fit3; print the residual of the second
# observation.
fitted_values <- fitted.values(fit3)
residuals_fit3 <- residuals(fit3)
residuals_fit3[2]
## 2
## 443.4026
Explanation: