library(knitr )
library(psych)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(lm.beta)
library(naniar)
library(Hmisc)
library(estimatr)

Homework 4 - TINE DOBNIK

1: Source

Used data set regarding prices of used Audi cars in the UK

##2:Import, description and manipulation

set.seed(1) 
mydata1 <- read.table("~/Downloads/archive/audi.csv", 
                     header=TRUE, 
                     sep=",", 
                     dec=",") %>% sample_n(400)

mydata1 <- mydata1[ , c(-1, -2, -7,-9) ] 

colnames(mydata1) <- c("Price", "Transmission", "Mileage", "FuelType","MilesPerGallon") 
#I renamed the variables for easier examination

mydata2 <- transform(mydata1, Price=as.numeric(Price),Mileage=as.numeric(Mileage), MilesPerGallon=as.numeric(MilesPerGallon)) 
#Because some of the variables were characters I transformed them to numeric so I could properly conduct the analysis

mydata2 <- mydata2[c(-204,-304), ] 
#For more precise results I will throw away Hybrid fuel type, as there are only 2 cars that are Hybrid among 2500 observations


head(mydata2)
##   Price Transmission Mileage FuelType MilesPerGallon
## 1 11495       Manual   33397   Petrol           67.3
## 2 17336    Automatic   62118   Diesel           56.5
## 3 19490       Manual   23800   Petrol           49.6
## 4 20000    Automatic    8500   Petrol           47.9
## 5 14000    Semi-Auto   56441   Petrol           36.7
## 6 13250    Automatic   40000   Petrol           58.9

Description of used variables:

Price: Price of a car in £ Transmission: What kind of a gearbox does the car contain Mileage: Distance alearady travelled with the car in km FuelType: Engine fuel that the car uses MilesPerGallon: Shows how far you can travel for every gallon of fuel used.

mydata2$FuelTypeF <- factor(mydata2$FuelType,  
                          levels = c("Petrol", "Diesel"), 
                          labels = c("P", "D"))

mydata2$TransmissionF <- factor(mydata2$Transmission, 
                            levels = c("Manual", "Semi-Auto", "Automatic"),
                            labels = c("M", "SA", "A"))

I threw fuel type and transmission into factors in order to have a better representation of petrol diesel and manual, semiauto, autmoatic

summary(mydata2[ , c(-2, -4)])
##      Price           Mileage       MilesPerGallon  FuelTypeF TransmissionF
##  Min.   :  3495   Min.   :    10   Min.   :22.10   P:201     M :171       
##  1st Qu.: 14995   1st Qu.:  5372   1st Qu.:39.80   D:197     SA:108       
##  Median : 20422   Median : 19982   Median :49.60             A :119       
##  Mean   : 23227   Mean   : 25122   Mean   :49.58                          
##  3rd Qu.: 28980   3rd Qu.: 37338   3rd Qu.:57.60                          
##  Max.   :102544   Max.   :138649   Max.   :83.10

Average price for an audi car was 23227 pounds. Highest price was 102544 pounds. Average mileage of a car was 25122 km The highest mileage on a car was 138649 km On aveage for every gallon of fuel used an audi car could run for 49.58 miles

We can also calculate the probability that a randomly selected car in a sample will work on petrol.

201/398 -> 0,505. This means that there is a 0.505 percent probability that a randomly selected used audi car will run on petrol

##3:Binary Logistic Regression Model

fit1 <- glm(FuelTypeF ~ 1, 
            family = binomial, 
            data = mydata2)

summary(fit1) #FIRST LOGISTIC REGRESSION MODEL
## 
## Call:
## glm(formula = FuelTypeF ~ 1, family = binomial, data = mydata2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.169  -1.169  -1.169   1.186   1.186  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.0201     0.1003    -0.2    0.841
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 551.7  on 397  degrees of freedom
## Residual deviance: 551.7  on 397  degrees of freedom
## AIC: 553.7
## 
## Number of Fisher Scoring iterations: 3

I used the generalized linear model.

I was able to conduct a test that tells the difference of of used audi cars running on petrol or diesel. This was done with the 2 functions below which are exp and pseudo r squared

exp(cbind(odds = fit1$coefficients, confint.default(fit1))) 
##                  odds     2.5 %   97.5 %
## (Intercept) 0.9800995 0.8052525 1.192912
head(fitted(fit1)) 
##         1         2         3         4         5         6 
## 0.4949749 0.4949749 0.4949749 0.4949749 0.4949749 0.4949749
Pseudo_R2_fit2 <- 201/398 

Pseudo_R2_fit2
## [1] 0.5050251

Firstly we can see the odds and the confidence interval. Afterwards we can inspect that for each car the probabilities are the same, cause we didnt include any explanatory variables. They are all 0.4949. We got this number with odds(y=1)/1+odds(y=1). 0.98/1.98 -> 0.4949

We form 2 groups. One is 0,5 and lower. And the other is 0,5 and higher. For each car in my sample the probability is 0.49. So we classify all the units into group 0. If we do that than 201 cars will run on petrol. So we will correclty classify 0.505 percent by chance. This can also be explained that there is a 0.49 percent probability that a car will work on Petrol.

fit2 <- glm(FuelTypeF ~ Mileage,  
            family = binomial, 
            data = mydata2)

summary(fit2)
## 
## Call:
## glm(formula = FuelTypeF ~ Mileage, family = binomial, data = mydata2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.856  -1.108  -1.003   1.182   1.365  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.321e-01  1.491e-01  -2.897 0.003763 ** 
## Mileage      1.663e-05  4.505e-06   3.691 0.000224 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 551.70  on 397  degrees of freedom
## Residual deviance: 536.94  on 396  degrees of freedom
## AIC: 540.94
## 
## Number of Fisher Scoring iterations: 4

Now we also add the Mileage as a explanatory variable. We can see some changes in deviances. Atop of that we can see that mileage is statistically significant at p<0.001.

After adding an explanatory variable we can compate the simple model (fit1) with the new model fit2 with anova.

anova(fit1, fit2, test = "Chi") 
## Analysis of Deviance Table
## 
## Model 1: FuelTypeF ~ 1
## Model 2: FuelTypeF ~ Mileage
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       397     551.70                          
## 2       396     536.94  1    14.76 0.0001221 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 1 is fit1, Model 2 is fit2, as test we use Chi!

Deviance -> difference of (-2ll.SIMPLE )-(-2LL.COMPLEX)= 551,7-536,94 = 14,76

h0: Simple model is better h1: Complex model is better

We can conlcude here and reject the null hypthesis and say that the complex model is better

exp(cbind(OR = fit2$coefficients, confint.default(fit2))) #Odds ratio for Y=1 (with 95% CI)
##                    OR     2.5 %    97.5 %
## (Intercept) 0.6491769 0.4846559 0.8695459
## Mileage     1.0000166 1.0000078 1.0000255
fit3 <- glm(FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + Price,  
            family = binomial, 
            data = mydata2)


vif(fit3)
##                    GVIF Df GVIF^(1/(2*Df))
## Mileage        1.686580  1        1.298684
## TransmissionF  1.630273  2        1.129965
## MilesPerGallon 2.514019  1        1.585566
## Price          3.677014  1        1.917554
mydata2$StdResid <- rstandard(fit3)
mydata2$Cook <- cooks.distance(fit3)

This helpes us with multicolinearity and we can see that there may be some concern about multicollinearity between “MilesPerGallon” and the other predictor variables, and a more serious issue with multicollinearity between “Price” and the other predictor variables. But given thath they are all bellow 5, it will do ok. But to be sure I will also check outliers and units with high impact.

hist(mydata2$StdResid,
     main = "Histogram of standardized residuals",
     ylab = "Frequency",
     xlab = "Standardized residuals")

head(mydata2[order(mydata2$StdResid),], 10)
##      Price Transmission Mileage FuelType MilesPerGallon FuelTypeF TransmissionF
## 166 102544    Semi-Auto    2000   Petrol           22.1         P            SA
## 104   9500       Manual  117740   Petrol           57.6         P             M
## 290  50888    Automatic   12500   Petrol           32.1         P             A
## 284  54990    Automatic    4474   Petrol           29.7         P             A
## 33   56990    Semi-Auto   20814   Petrol           29.4         P            SA
## 324  17999       Manual   50450   Petrol           60.1         P             M
## 192   9912       Manual   50000   Petrol           67.3         P             M
## 240  16991    Semi-Auto   60892   Petrol           56.5         P            SA
## 6    13250    Automatic   40000   Petrol           58.9         P             A
## 116  13995    Automatic   18833   Petrol           62.8         P             A
##      StdResid        Cook
## 166 -3.768107 0.332071963
## 104 -2.253542 0.053960706
## 290 -2.014753 0.028004560
## 284 -1.999206 0.032988940
## 33  -1.968007 0.043204765
## 324 -1.908212 0.010578115
## 192 -1.861454 0.009323132
## 240 -1.786779 0.013194773
## 6   -1.785366 0.011808125
## 116 -1.780448 0.015918891

We need to remove the standard residual that is over -3. Number 166 We need to also mention here that we always see bimodal distribution. There is no requirements. If this would be normal, we would use classical OLS regression.

mydata2 <- mydata2[-166, ] 

We removed it Now for the high impact units

head(mydata2[order(-mydata2$Cook),], 10)
##     Price Transmission Mileage FuelType MilesPerGallon FuelTypeF TransmissionF
## 242  3495       Manual  138649   Diesel           33.2         D             M
## 104  9500       Manual  117740   Petrol           57.6         P             M
## 33  56990    Semi-Auto   20814   Petrol           29.4         P            SA
## 284 54990    Automatic    4474   Petrol           29.7         P             A
## 334  8995    Automatic   95000   Diesel           37.6         D             A
## 290 50888    Automatic   12500   Petrol           32.1         P             A
## 291 54995    Semi-Auto    2000   Petrol           30.1         P            SA
## 163 52950    Semi-Auto    5000   Petrol           30.7         P            SA
## 116 13995    Automatic   18833   Petrol           62.8         P             A
## 150 46465    Automatic   19963   Petrol           29.4         P             A
##      StdResid       Cook
## 242  2.113275 0.10287877
## 104 -2.253542 0.05396071
## 33  -1.968007 0.04320476
## 284 -1.999206 0.03298894
## 334  1.613527 0.02818114
## 290 -2.014753 0.02800456
## 291 -1.552404 0.01978059
## 163 -1.496867 0.01636506
## 116 -1.780448 0.01591889
## 150 -1.621615 0.01541537
hist(mydata2$Cook,
     main = "Histogram of cooks distances",
     ylab = "Frequency",
     xlab = "Cooks distance")

Because 242, 104, 33, 284, 334, 290 are all units with high impact we need to remove them

mydata2 <- mydata2[c(-242,-104, -33, -284, -334, -290), ] 

Now we have to estimate again. And so we compare it to fit 2. Cause we removed 3 units, and we need to check again.

fit2 <- glm(FuelTypeF ~ Mileage,  
            family = binomial, 
            data = mydata2)

fit3 <- glm(FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + Price,  
            family = binomial, 
            data = mydata2)

We must figure out which model fits our data better Mileage or Mileage + TransmissionF + MilesPerGallon + Price

H0:Mileage H1:Mileage + TransmissionF + MilesPerGallon + Price

Again this will be done with anova

anova(fit2, fit3, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: FuelTypeF ~ Mileage
## Model 2: FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + Price
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       389     525.24                          
## 2       385     365.02  4   160.22 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the results we can conclude that model 2 in this case fits the data better and we reject the h0 at 0<0.001. Now we go on to check about the significance of the variables used.

summary(fit3)
## 
## Call:
## glm(formula = FuelTypeF ~ Mileage + TransmissionF + MilesPerGallon + 
##     Price, family = binomial, data = mydata2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.42128  -0.75763   0.02143   0.76140   2.01666  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.573e+01  1.654e+00  -9.512  < 2e-16 ***
## Mileage          5.172e-05  8.245e-06   6.273 3.54e-10 ***
## TransmissionFSA -1.702e-01  3.774e-01  -0.451   0.6519    
## TransmissionFA   7.764e-01  3.667e-01   2.117   0.0342 *  
## MilesPerGallon   1.891e-01  2.133e-02   8.864  < 2e-16 ***
## Price            2.180e-04  2.858e-05   7.629 2.36e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 542.04  on 390  degrees of freedom
## Residual deviance: 365.02  on 385  degrees of freedom
## AIC: 377.02
## 
## Number of Fisher Scoring iterations: 5

From the summary we can see that almost all the variables are ok, with the exception of Semi-Automatic Transmissions. Furthermore we can say that this variable is insignificant.

exp(cbind(OR = fit3$coefficients, confint.default(fit3)))
##                           OR        2.5 %       97.5 %
## (Intercept)     1.468077e-07 5.738054e-09 3.756064e-06
## Mileage         1.000052e+00 1.000036e+00 1.000068e+00
## TransmissionFSA 8.434754e-01 4.025696e-01 1.767274e+00
## TransmissionFA  2.173552e+00 1.059336e+00 4.459705e+00
## MilesPerGallon  1.208115e+00 1.158652e+00 1.259689e+00
## Price           1.000218e+00 1.000162e+00 1.000274e+00

####Mileage If mileage increases by 1 km, than the odds of picking a used audi car that runs on petrol is equal to 1.0000052 times initial odds, under the assumption that the remaining explanatory variables do not change.

####TransmissionFA If we pick a car that is an automatic, than the odds of picking a used audi car that runs on petrol is equal to 2.17 times initial odds, under the assumption that the remaining explanatory variables do not change.

####MilesPerGallon If miles per gallon increases by 1 gallon per mile, than the odds of picking a used audi car that runs on petrol is equal to 1.2 times initial odds, under the assumption that the remaining explanatory variables do not change.

####Price If we the price of a car changes by one pound, than the odds of picking a used audi car that runs on petrol is equal to 1.00021 times initial odds, under the assumption that the remaining explanatory variables do not change.

mydata2$EstProb <- fitted(fit3) #Estimates of probabilities for Y=1: P(Y=1)
head(mydata2)
##   Price Transmission Mileage FuelType MilesPerGallon FuelTypeF TransmissionF
## 1 11495       Manual   33397   Petrol           67.3         P             M
## 2 17336    Automatic   62118   Diesel           56.5         D             A
## 3 19490       Manual   23800   Petrol           49.6         P             M
## 4 20000    Automatic    8500   Petrol           47.9         P             A
## 5 14000    Semi-Auto   56441   Petrol           36.7         P            SA
## 6 13250    Automatic   40000   Petrol           58.9         P             A
##     StdResid         Cook    EstProb
## 1 -1.6934574 0.0068758884 0.77258778
## 2  0.4139692 0.0001537643 0.93798600
## 3 -0.8187360 0.0006007681 0.29382185
## 4 -0.9306278 0.0018695895 0.24934089
## 5 -0.3677133 0.0001508213 0.04767494
## 6 -1.7853661 0.0118081245 0.75682508

Here we can se the probabilities that a used audi car will run on petrol or diesel.

Probability that the first car (first row) will run on petrol, is 77 percents. Probability that the sixth car (6th row) will run on petrol is 75 percents.

Now we also want to see the classification table

mydata2$Classification <- ifelse(test = mydata2$EstProb < 0.50, 
                                yes = "Petrol", 
                                no = "Diesel")
#if else how to read -> if estimated probabilty is less than 0.5, than the car gets the value no. Otherwise gets value yes.
#If estimated probability is below 0.50, a used audi is classified into a car that uses petrol. If it is over 0.50 than it is classified into a car that uses diesel.

mydata2$Classification <- factor(mydata2$Classification,
                                 levels = c("Petrol", "Diesel"), 
                                 labels = c("Petrol", "Diesel"))


ClassificationTable <- table(mydata2$FuelTypeF, mydata2$Classification) #Classification table based on 2 categorical variables.

ClassificationTable
##    
##     Petrol Diesel
##   P    154     41
##   D     47    149
Pseudo_R2_fit3 <- (ClassificationTable[1, 1] + ClassificationTable[2, 2] )/ nrow(mydata2)
Pseudo_R2_fit3
## [1] 0.7749361

The model implies that 88 used audi cars were wrongly classified. And 303 used audi cars were correctly classified We can see that 77 percent units were correctly classified.