library(knitr )
library(psych)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(pastecs)
##
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
##
## extract
## The following objects are masked from 'package:dplyr':
##
## first, last
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(lm.beta)
library(naniar)
library(Hmisc)
library(estimatr)
Used data set regarding prices of used Audi cars in the UK
##2:Import, description and manipulation
set.seed(1)
mydata1 <- read.table("~/Downloads/archive/audi.csv",
header=TRUE,
sep=",",
dec=",") %>% sample_n(400)
mydata1 <- mydata1[ , c(-1, -2, -7,-9) ]
colnames(mydata1) <- c("Price", "Transmission", "Mileage", "FuelType","MilesPerGallon")
#I renamed the variables for easier examination
mydata2 <- transform(mydata1, Price=as.numeric(Price),Mileage=as.numeric(Mileage), MilesPerGallon=as.numeric(MilesPerGallon))
#Because some of the variables were characters I transformed them to numeric so I could properly conduct the analysis
mydata2 <- mydata2[c(-204,-304), ]
#For more precise results I will throw away Hybrid fuel type, as there are only 2 cars that are Hybrid among 2500 observations
head(mydata2)
## Price Transmission Mileage FuelType MilesPerGallon
## 1 11495 Manual 33397 Petrol 67.3
## 2 17336 Automatic 62118 Diesel 56.5
## 3 19490 Manual 23800 Petrol 49.6
## 4 20000 Automatic 8500 Petrol 47.9
## 5 14000 Semi-Auto 56441 Petrol 36.7
## 6 13250 Automatic 40000 Petrol 58.9
Description of used variables:
Price: Price of a car in £ Transmission: What kind of a gearbox does the car contain Mileage: Distance alearady travelled with the car in km FuelType: Engine fuel that the car uses MilesPerGallon: Shows how far you can travel for every gallon of fuel used.
mydata2$FuelTypeF <- factor(mydata2$FuelType,
levels = c("Petrol", "Diesel"),
labels = c("P", "D"))
mydata2$TransmissionF <- factor(mydata2$Transmission,
levels = c("Manual", "Semi-Auto", "Automatic"),
labels = c("M", "SA", "A"))
I threw fuel type and transmission into factors in order to have a better representation of petrol diesel and manual, semiauto, autmoatic
summary(mydata2[ , c(-2, -4)])
## Price Mileage MilesPerGallon FuelTypeF TransmissionF
## Min. : 3495 Min. : 10 Min. :22.10 P:201 M :171
## 1st Qu.: 14995 1st Qu.: 5372 1st Qu.:39.80 D:197 SA:108
## Median : 20422 Median : 19982 Median :49.60 A :119
## Mean : 23227 Mean : 25122 Mean :49.58
## 3rd Qu.: 28980 3rd Qu.: 37338 3rd Qu.:57.60
## Max. :102544 Max. :138649 Max. :83.10
Average price for an audi car was 23227 pounds. Highest price was 102544 pounds. Average mileage of a car was 25122 km The highest mileage on a car was 138649 km On aveage for every gallon of fuel used an audi car could run for 49.58 miles
We can also calculate the probability that a randomly selected car in a sample will work on petrol.
201/398 -> 0,505. This means that there is a 0.505 percent probability that a randomly selected used audi car will run on petrol
##3:Binary Logistic Regression Model
fit1 <- glm(FuelTypeF ~ 1,
family = binomial,
data = mydata2)
summary(fit1) #FIRST LOGISTIC REGRESSION MODEL
##
## Call:
## glm(formula = FuelTypeF ~ 1, family = binomial, data = mydata2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.169 -1.169 -1.169 1.186 1.186
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0201 0.1003 -0.2 0.841
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 551.7 on 397 degrees of freedom
## Residual deviance: 551.7 on 397 degrees of freedom
## AIC: 553.7
##
## Number of Fisher Scoring iterations: 3
I used the generalized linear model.
I was able to conduct a test that tells the difference of of used audi cars running on petrol or diesel. This was done with the 2 functions below which are exp and pseudo r squared
exp(cbind(odds = fit1$coefficients, confint.default(fit1)))
## odds 2.5 % 97.5 %
## (Intercept) 0.9800995 0.8052525 1.192912
head(fitted(fit1))
## 1 2 3 4 5 6
## 0.4949749 0.4949749 0.4949749 0.4949749 0.4949749 0.4949749
Pseudo_R2_fit2 <- 201/398
Pseudo_R2_fit2
## [1] 0.5050251
Firstly we can see the odds and the confidence interval. Afterwards we can inspect that for each car the probabilities are the same, cause we didnt include any explanatory variables. They are all 0.4949. We got this number with odds(y=1)/1+odds(y=1). 0.98/1.98 -> 0.4949
We form 2 groups. One is 0,5 and lower. And the other is 0,5 and higher. For each car in my sample the probability is 0.49. So we classify all the units into group 0. If we do that than 201 cars will run on petrol. So we will correclty classify 0.505 percent by chance. This can also be explained that there is a 0.49 percent probability that a car will work on Petrol.
fit2 <- glm(FuelTypeF ~ Mileage,
family = binomial,
data = mydata2)
summary(fit2)
##
## Call:
## glm(formula = FuelTypeF ~ Mileage, family = binomial, data = mydata2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.856 -1.108 -1.003 1.182 1.365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.321e-01 1.491e-01 -2.897 0.003763 **
## Mileage 1.663e-05 4.505e-06 3.691 0.000224 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 551.70 on 397 degrees of freedom
## Residual deviance: 536.94 on 396 degrees of freedom
## AIC: 540.94
##
## Number of Fisher Scoring iterations: 4
Now we also add the Mileage as a explanatory variable. We can see some changes in deviances. Atop of that we can see that mileage is statistically significant at p<0.001.
After adding an explanatory variable we can compate the simple model (fit1) with the new model fit2 with anova.
anova(fit1, fit2, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: FuelTypeF ~ 1
## Model 2: FuelTypeF ~ Mileage
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 397 551.70
## 2 396 536.94 1 14.76 0.0001221 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model 1 is fit1, Model 2 is fit2, as test we use Chi!
Deviance -> difference of (-2ll.SIMPLE )-(-2LL.COMPLEX)= 551,7-536,94 = 14,76
h0: Simple model is better h1: Complex model is better
We can conlcude here and reject the null hypthesis and say that the complex model is better
exp(cbind(OR = fit2$coefficients, confint.default(fit2))) #Odds ratio for Y=1 (with 95% CI)
## OR 2.5 % 97.5 %
## (Intercept) 0.6491769 0.4846559 0.8695459
## Mileage 1.0000166 1.0000078 1.0000255
fit3 <- glm(FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + Price,
family = binomial,
data = mydata2)
vif(fit3)
## GVIF Df GVIF^(1/(2*Df))
## Mileage 1.686580 1 1.298684
## TransmissionF 1.630273 2 1.129965
## MilesPerGallon 2.514019 1 1.585566
## Price 3.677014 1 1.917554
mydata2$StdResid <- rstandard(fit3)
mydata2$Cook <- cooks.distance(fit3)
This helpes us with multicolinearity and we can see that there may be some concern about multicollinearity between “MilesPerGallon” and the other predictor variables, and a more serious issue with multicollinearity between “Price” and the other predictor variables. But given thath they are all bellow 5, it will do ok. But to be sure I will also check outliers and units with high impact.
hist(mydata2$StdResid,
main = "Histogram of standardized residuals",
ylab = "Frequency",
xlab = "Standardized residuals")
head(mydata2[order(mydata2$StdResid),], 10)
## Price Transmission Mileage FuelType MilesPerGallon FuelTypeF TransmissionF
## 166 102544 Semi-Auto 2000 Petrol 22.1 P SA
## 104 9500 Manual 117740 Petrol 57.6 P M
## 290 50888 Automatic 12500 Petrol 32.1 P A
## 284 54990 Automatic 4474 Petrol 29.7 P A
## 33 56990 Semi-Auto 20814 Petrol 29.4 P SA
## 324 17999 Manual 50450 Petrol 60.1 P M
## 192 9912 Manual 50000 Petrol 67.3 P M
## 240 16991 Semi-Auto 60892 Petrol 56.5 P SA
## 6 13250 Automatic 40000 Petrol 58.9 P A
## 116 13995 Automatic 18833 Petrol 62.8 P A
## StdResid Cook
## 166 -3.768107 0.332071963
## 104 -2.253542 0.053960706
## 290 -2.014753 0.028004560
## 284 -1.999206 0.032988940
## 33 -1.968007 0.043204765
## 324 -1.908212 0.010578115
## 192 -1.861454 0.009323132
## 240 -1.786779 0.013194773
## 6 -1.785366 0.011808125
## 116 -1.780448 0.015918891
We need to remove the standard residual that is over -3. Number 166 We need to also mention here that we always see bimodal distribution. There is no requirements. If this would be normal, we would use classical OLS regression.
mydata2 <- mydata2[-166, ]
We removed it Now for the high impact units
head(mydata2[order(-mydata2$Cook),], 10)
## Price Transmission Mileage FuelType MilesPerGallon FuelTypeF TransmissionF
## 242 3495 Manual 138649 Diesel 33.2 D M
## 104 9500 Manual 117740 Petrol 57.6 P M
## 33 56990 Semi-Auto 20814 Petrol 29.4 P SA
## 284 54990 Automatic 4474 Petrol 29.7 P A
## 334 8995 Automatic 95000 Diesel 37.6 D A
## 290 50888 Automatic 12500 Petrol 32.1 P A
## 291 54995 Semi-Auto 2000 Petrol 30.1 P SA
## 163 52950 Semi-Auto 5000 Petrol 30.7 P SA
## 116 13995 Automatic 18833 Petrol 62.8 P A
## 150 46465 Automatic 19963 Petrol 29.4 P A
## StdResid Cook
## 242 2.113275 0.10287877
## 104 -2.253542 0.05396071
## 33 -1.968007 0.04320476
## 284 -1.999206 0.03298894
## 334 1.613527 0.02818114
## 290 -2.014753 0.02800456
## 291 -1.552404 0.01978059
## 163 -1.496867 0.01636506
## 116 -1.780448 0.01591889
## 150 -1.621615 0.01541537
hist(mydata2$Cook,
main = "Histogram of cooks distances",
ylab = "Frequency",
xlab = "Cooks distance")
Because 242, 104, 33, 284, 334, 290 are all units with high impact we
need to remove them
mydata2 <- mydata2[c(-242,-104, -33, -284, -334, -290), ]
Now we have to estimate again. And so we compare it to fit 2. Cause we removed 3 units, and we need to check again.
fit2 <- glm(FuelTypeF ~ Mileage,
family = binomial,
data = mydata2)
fit3 <- glm(FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + Price,
family = binomial,
data = mydata2)
We must figure out which model fits our data better Mileage or Mileage + TransmissionF + MilesPerGallon + Price
H0:Mileage H1:Mileage + TransmissionF + MilesPerGallon + Price
Again this will be done with anova
anova(fit2, fit3, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: FuelTypeF ~ Mileage
## Model 2: FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + Price
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 389 525.24
## 2 385 365.02 4 160.22 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the results we can conclude that model 2 in this case fits the data better and we reject the h0 at 0<0.001. Now we go on to check about the significance of the variables used.
summary(fit3)
##
## Call:
## glm(formula = FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon +
## Price, family = binomial, data = mydata2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.42128 -0.75763 0.02143 0.76140 2.01666
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.573e+01 1.654e+00 -9.512 < 2e-16 ***
## Mileage 5.172e-05 8.245e-06 6.273 3.54e-10 ***
## TransmissionFSA -1.702e-01 3.774e-01 -0.451 0.6519
## TransmissionFA 7.764e-01 3.667e-01 2.117 0.0342 *
## MilesPerGallon 1.891e-01 2.133e-02 8.864 < 2e-16 ***
## Price 2.180e-04 2.858e-05 7.629 2.36e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 542.04 on 390 degrees of freedom
## Residual deviance: 365.02 on 385 degrees of freedom
## AIC: 377.02
##
## Number of Fisher Scoring iterations: 5
From the summary we can see that almost all the variables are ok, with the exception of Semi-Automatic Transmissions. Furthermore we can say that this variable is insignificant.
exp(cbind(OR = fit3$coefficients, confint.default(fit3)))
## OR 2.5 % 97.5 %
## (Intercept) 1.468077e-07 5.738054e-09 3.756064e-06
## Mileage 1.000052e+00 1.000036e+00 1.000068e+00
## TransmissionFSA 8.434754e-01 4.025696e-01 1.767274e+00
## TransmissionFA 2.173552e+00 1.059336e+00 4.459705e+00
## MilesPerGallon 1.208115e+00 1.158652e+00 1.259689e+00
## Price 1.000218e+00 1.000162e+00 1.000274e+00
####Mileage If mileage increases by 1 km, than the odds of picking a used audi car that runs on petrol is equal to 1.0000052 times initial odds, under the assumption that the remaining explanatory variables do not change.
####TransmissionFA If we pick a car that is an automatic, than the odds of picking a used audi car that runs on petrol is equal to 2.17 times initial odds, under the assumption that the remaining explanatory variables do not change.
####MilesPerGallon If miles per gallon increases by 1 gallon per mile, than the odds of picking a used audi car that runs on petrol is equal to 1.2 times initial odds, under the assumption that the remaining explanatory variables do not change.
####Price If we the price of a car changes by one pound, than the odds of picking a used audi car that runs on petrol is equal to 1.00021 times initial odds, under the assumption that the remaining explanatory variables do not change.
mydata2$EstProb <- fitted(fit3) #Estimates of probabilities for Y=1: P(Y=1)
head(mydata2)
## Price Transmission Mileage FuelType MilesPerGallon FuelTypeF TransmissionF
## 1 11495 Manual 33397 Petrol 67.3 P M
## 2 17336 Automatic 62118 Diesel 56.5 D A
## 3 19490 Manual 23800 Petrol 49.6 P M
## 4 20000 Automatic 8500 Petrol 47.9 P A
## 5 14000 Semi-Auto 56441 Petrol 36.7 P SA
## 6 13250 Automatic 40000 Petrol 58.9 P A
## StdResid Cook EstProb
## 1 -1.6934574 0.0068758884 0.77258778
## 2 0.4139692 0.0001537643 0.93798600
## 3 -0.8187360 0.0006007681 0.29382185
## 4 -0.9306278 0.0018695895 0.24934089
## 5 -0.3677133 0.0001508213 0.04767494
## 6 -1.7853661 0.0118081245 0.75682508
Here we can se the probabilities that a used audi car will run on petrol or diesel.
Probability that the first car (first row) will run on petrol, is 77 percents. Probability that the sixth car (6th row) will run on petrol is 75 percents.
Now we also want to see the classification table
mydata2$Classification <- ifelse(test = mydata2$EstProb < 0.50,
yes = "Petrol",
no = "Diesel")
#if else how to read -> if estimated probabilty is less than 0.5, than the car gets the value no. Otherwise gets value yes.
#If estimated probability is below 0.50, a used audi is classified into a car that uses petrol. If it is over 0.50 than it is classified into a car that uses diesel.
mydata2$Classification <- factor(mydata2$Classification,
levels = c("Petrol", "Diesel"),
labels = c("Petrol", "Diesel"))
ClassificationTable <- table(mydata2$FuelTypeF, mydata2$Classification) #Classification table based on 2 categorical variables.
ClassificationTable
##
## Petrol Diesel
## P 154 41
## D 47 149
Pseudo_R2_fit3 <- (ClassificationTable[1, 1] + ClassificationTable[2, 2] )/ nrow(mydata2)
Pseudo_R2_fit3
## [1] 0.7749361
The model implies that 88 used audi cars were wrongly classified. And 303 used audi cars were correctly classified We can see that 77 percent units were correctly classified.