data(mtcars)
attach(mtcars) # this attach the data to the current enviroment
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
##
## mpg
?mtcars
## starting httpd help server ... done
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
Visualizations with ggplot
ggplot(mtcars, aes(x=mpg)) +
geom_histogram(binwidth=4, fill="blue", alpha=0.7) +
ggtitle("Distribution of Miles/(US) gallon") +
xlab("Miles/(US) gallon") +
ylab("Value")
ggplot(mtcars, aes(x = hp, y = cyl)) +
geom_point(color='blue', size=3) +
ggtitle("Gross Horespower vs. Number of Cylinders") +
xlab("Gross horsepower") +
ylab("Number of cylinders")
Trend identified: (1.2.) There is a positive
correlation between Number of cylinders and Gross Horsepower. Cars with
more cylinders tend to have more horsepower.
ggplot(mtcars, aes(x=hp, y=qsec))+
geom_point(color='blue',size=3) +
ggtitle("Gross Horsepower vs. 1/4 mile time") +
xlab("Horse Power") +
ylab("1/4 mile time")
Trend identified: (1.2.) There is a negative
correlation between Horse Power and time driven in a quarter mile. Cars
with more Horse Power are generally faster.
corr_matrix <- cor(mtcars)
print(corr_matrix)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
Correlation identified: According to the correlation
matrix, wt is most strongly correlated with
mpg, with correlation value equal to -0.87. This means as
weight goes up, the speed of a car will to go down.
library(corrplot)
## corrplot 0.95 loaded
corrplot(corr_matrix, method="circle", type="upper", order="hclust",
tl.col="black", tl.srt=40)
## 2. Data Preprocessing
colSums(is.na(mtcars))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
The dataset has no missing data.
table(mtcars$mpg)
##
## 10.4 13.3 14.3 14.7 15 15.2 15.5 15.8 16.4 17.3 17.8 18.1 18.7 19.2 19.7 21
## 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2
## 21.4 21.5 22.8 24.4 26 27.3 30.4 32.4 33.9
## 2 1 2 1 1 1 2 1 1
unique(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
mtcars_lm <- lm(mpg ~ wt + hp, data = mtcars)
summary(mtcars_lm)
##
## Call:
## lm(formula = mpg ~ wt + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.941 -1.600 -0.182 1.050 5.854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.22727 1.59879 23.285 < 2e-16 ***
## wt -3.87783 0.63273 -6.129 1.12e-06 ***
## hp -0.03177 0.00903 -3.519 0.00145 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.593 on 29 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8148
## F-statistic: 69.21 on 2 and 29 DF, p-value: 9.109e-12
Model:
The p-value is < 0.05, meaning the model is significant,
wt and hp have significant effect on
mpg.
The residuals value show that the predictions are not too far off from
the observed values, ranging from (-3.941 ; 5.854).
The adjusted R-squared means 81.48% of the variance of mpg
is explained by wt and hp, adjusted for the
number of the independent variables.
Coefficients:
The coefficents of wt and hp show that though
having the same direction (negative), wt has a greater
effect on mpg because the coefficent of wt is
farther from 0 than that of hp.
Averagely, with hp held constant, when one unit increase in
wt, mpg decreases by 3.88.
Similarly, when wt is controlled, with every unit increase
in hp, 0.031 unit decreased observed in
mpg.
The intercept value shows that when wt and hp
equal to 0, the mpg = 37.23.
We can check whether the assumptions of linear regression are met using diagnostics plots:
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(mtcars_lm)
lm_mse <- mean((mtcars_lm$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for Linear Model:", round(lm_mse, 2)))
## [1] "Mean Squared Error for Linear Model: 6.1"
The mean squared error represents the average squared residual. The MSE of 6.1 is relatively small, meaning the data points fall closer to the regression line.
mtcars_lm_interaction <- lm(mpg ~ wt + hp + wt*hp, data = mtcars)
summary(mtcars_lm_interaction)
##
## Call:
## lm(formula = mpg ~ wt + hp + wt * hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0632 -1.6491 -0.7362 1.4211 4.5513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.80842 3.60516 13.816 5.01e-14 ***
## wt -8.21662 1.26971 -6.471 5.20e-07 ***
## hp -0.12010 0.02470 -4.863 4.04e-05 ***
## wt:hp 0.02785 0.00742 3.753 0.000811 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared: 0.8848, Adjusted R-squared: 0.8724
## F-statistic: 71.66 on 3 and 28 DF, p-value: 2.981e-13
With the p-value < 0.05, the new model is significant. The
adjusted R-squared 0.87 for this model is greater than that of the
mtcars_lm, meaning more variance of mpg is
explained in this model.
boxplot(mtcars, las=2, cex.axis=0.6)
# Calculate the 1st and 99th percentiles for 'hp'
lower_bound_hp <- quantile(mtcars$hp, 0.05, na.rm = TRUE)
upper_bound_hp <- quantile(mtcars$hp, 0.95, na.rm = TRUE)
#create a mtcars_winsorized df
mtcars_winsorized <- mtcars
# Winsorize the data, replacing value that are greater than the threshold with the threshold.
mtcars_winsorized$hp[mtcars$hp < lower_bound_hp] <- lower_bound_hp
mtcars_winsorized$hp[mtcars$hp > upper_bound_hp] <- upper_bound_hp
library(car)
## Loading required package: carData
car::vif(mtcars_lm) # usually 5 or 10 is considered as high.
## wt hp
## 1.766625 1.766625
With the value = 1.76, this model is not concerning in terms of Multicollinearity.