LUKA ČERNILA

Resarch question:

Can we exlain the Curb weight of a car using the five explanatory variables?

mydata <- read.table("./auto_clean_1985_dataset.csv", header=TRUE, sep=",")

head(mydata)
##   symboling normalized_losses     CarName wheelbase carlength carwidth carheight curbweight enginesize boreratio stroke
## 1         3               115 alfa-romero      88.6     168.8     64.1      48.8       2548        130      3.47   2.68
## 2         3               115 alfa-romero      88.6     168.8     64.1      48.8       2548        130      3.47   2.68
## 3         1               115 alfa-romero      94.5     171.2     65.5      52.4       2823        152      2.68   3.47
## 4         2               164        audi      99.8     176.6     66.2      54.3       2337        109      3.19   3.40
## 5         2               164        audi      99.4     176.6     66.4      54.3       2824        136      3.19   3.40
## 6         2               115        audi      99.8     177.3     66.3      53.1       2507        136      3.19   3.40
##   compressionratio horsepower peakrpm citympg highwaympg price fueltype_gas aspiration_turbo doornumber_doortwo
## 1              9.0        111    5000      21         27 13495            1                0                  1
## 2              9.0        111    5000      21         27 16500            1                0                  1
## 3              9.0        154    5000      19         26 16500            1                0                  1
## 4             10.0        102    5500      24         30 13950            1                0                  0
## 5              8.0        115    5500      18         22 17450            1                0                  0
## 6              8.5        110    5500      19         25 15250            1                0                  1
##   carbody_hardtop carbody_hatchback carbody_sedan carbody_wagon drivewheel_fwd drivewheel_rwd enginelocation_rear
## 1               0                 0             0             0              0              1                   0
## 2               0                 0             0             0              0              1                   0
## 3               0                 1             0             0              0              1                   0
## 4               0                 0             1             0              1              0                   0
## 5               0                 0             1             0              0              0                   0
## 6               0                 0             1             0              1              0                   0
##   enginetype_dohcv enginetype_l enginetype_ohc enginetype_ohcf enginetype_ohcv enginetype_rotor cylindernumber_five
## 1                0            0              0               0               0                0                   0
## 2                0            0              0               0               0                0                   0
## 3                0            0              0               0               1                0                   0
## 4                0            0              1               0               0                0                   0
## 5                0            0              1               0               0                0                   1
## 6                0            0              1               0               0                0                   1
##   cylindernumber_four cylindernumber_six cylindernumber_three cylindernumber_twelve cylindernumber_two fuelsystem_2bbl
## 1                   1                  0                    0                     0                  0               0
## 2                   1                  0                    0                     0                  0               0
## 3                   0                  1                    0                     0                  0               0
## 4                   1                  0                    0                     0                  0               0
## 5                   0                  0                    0                     0                  0               0
## 6                   0                  0                    0                     0                  0               0
##   fuelsystem_4bbl fuelsystem_idi fuelsystem_mfi fuelsystem_mpfi fuelsystem_spdi fuelsystem_spfi
## 1               0              0              0               1               0               0
## 2               0              0              0               1               0               0
## 3               0              0              0               1               0               0
## 4               0              0              0               1               0               0
## 5               0              0              0               1               0               0
## 6               0              0              0               1               0               0
mydata$symboling <- NULL

mydata$normalized_losses <- NULL

mydata$enginesize <- NULL

mydata$boreratio <- NULL

mydata$stroke <- NULL

mydata$compressionratio <- NULL

mydata$wheelbase <- NULL

mydata$peakrpm <- NULL

mydata$citympg <- NULL

mydata$highwaympg <- NULL

mydata$carlength <- NULL

mydata$aspiration_turbo <- NULL

mydata$doornumber_doortwo <- NULL

mydata$carbody_hardtop <- NULL

mydata$carbody_hatchback <- NULL

mydata$carbody_sedan <- NULL

mydata$carbody_wagon <- NULL

mydata$drivewheel_fwd <- NULL

mydata$drivewheel_rwd <- NULL

mydata$enginelocation_rear <- NULL

mydata$enginetype_dohcv <- NULL

mydata$enginetype_l <- NULL

mydata$enginetype_ohc <- NULL

mydata$enginetype_ohcf <- NULL

mydata$enginetype_ohcv <- NULL

mydata$enginetype_rotor <- NULL

mydata$cylindernumber_five <- NULL

mydata$cylindernumber_four <- NULL

mydata$cylindernumber_six <- NULL

mydata$cylindernumber_three <- NULL

mydata$cylindernumber_twelve <- NULL

mydata$cylindernumber_two <- NULL

mydata$fuelsystem_2bbl <- NULL

mydata$fuelsystem_4bbl <- NULL

mydata$fuelsystem_idi <- NULL

mydata$fuelsystem_mfi <- NULL

mydata$fuelsystem_mpfi <- NULL

mydata$fuelsystem_spdi <- NULL

mydata$fuelsystem_spfi <- NULL

mydata$CarName <- NULL

head(mydata)
##   carwidth carheight curbweight horsepower price fueltype_gas
## 1     64.1      48.8       2548        111 13495            1
## 2     64.1      48.8       2548        111 16500            1
## 3     65.5      52.4       2823        154 16500            1
## 4     66.2      54.3       2337        102 13950            1
## 5     66.4      54.3       2824        115 17450            1
## 6     66.3      53.1       2507        110 15250            1
tail(mydata)
##     carwidth carheight curbweight horsepower price fueltype_gas
## 200     67.2      57.5       3157        162 18950            1
## 201     68.9      55.5       2952        114 16845            1
## 202     68.8      55.5       3049        160 19045            1
## 203     68.9      55.5       3012        134 21485            1
## 204     68.9      55.5       3217        106 22470            0
## 205     68.9      55.5       3062        114 22625            1

Unit of observation: a car

Fueltpegas: Type of fuel a car uses (0:Diesel, 1: Gasoline).
Price: price of the car in dollars
Horsepower: power of the car in hp.
Carwidth: width of a car in inches.
Carheight: hight of a car in inches.
Curbweight: weight of a car with a full tank and standard equipment in kg.

I have found this dataset on Kaggle.com with the title Auto clean 1985 dataset. In this sample I have 205 units.

mydata$ID <- seq(1, nrow(mydata))
head(mydata)
##   carwidth carheight curbweight horsepower price fueltype_gas ID
## 1     64.1      48.8       2548        111 13495            1  1
## 2     64.1      48.8       2548        111 16500            1  2
## 3     65.5      52.4       2823        154 16500            1  3
## 4     66.2      54.3       2337        102 13950            1  4
## 5     66.4      54.3       2824        115 17450            1  5
## 6     66.3      53.1       2507        110 15250            1  6
library(psych) 
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe(mydata[ , -7]) 
##              vars   n     mean      sd  median  trimmed     mad    min     max range  skew kurtosis     se
## carwidth        1 205    65.91    2.15    65.5    65.66    2.08   60.3    72.3    12  0.89     0.62   0.15
## carheight       2 205    53.72    2.44    54.1    53.70    2.37   47.8    59.8    12  0.06    -0.49   0.17
## curbweight      3 205  2555.57  520.68  2414.0  2513.05  572.28 1488.0  4066.0  2578  0.67    -0.10  36.37
## horsepower      4 205   104.17   39.53    95.0    99.17   37.06   48.0   288.0   240  1.38     2.54   2.76
## price           5 205 13150.31 7879.12 10295.0 11643.44 4750.25 5118.0 45400.0 40282  1.81     3.20 550.30
## fueltype_gas    6 205     0.90    0.30     1.0     1.00    0.00    0.0     1.0     1 -2.69     5.28   0.02

Here we have a short descriptive statistics for this dataset. We can see for instance that in this sample the average width of a car was 65.91 inches, median value of horsepower was 95hp and shorthest car height 47.8 inches.

LM Model

mydata$fueltype_gasF <- factor(mydata$fueltype_gas, 
                         levels = c(0, 1),
                         labels = c("Diesel", "Gasoline"))

fit <- lm(curbweight ~ price + horsepower + carwidth + carheight + fueltype_gasF, 
          data = mydata) 

#install.packages("car")
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(fit) 
##         price    horsepower      carwidth     carheight fueltype_gasF 
##      3.136396      3.412993      2.780248      1.315260      1.336947
mean(vif(fit))
## [1] 2.396369

From the theoretical point of view this vif values are still okay since they are all bellow 5. However we mentioned at lectures that those are still a bit high but we continued regardless and Im going to do the same since time is of the essence.

MGK

mydata_PCA <- mydata[, c("price", "horsepower", "carwidth", "carheight")] 

#install.packages("pastecs")
library(pastecs) 
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:magrittr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first, last
round(stat.desc(mydata_PCA, basic = FALSE), 2) 
##                    price horsepower carwidth carheight
## median          10295.00      95.00    65.50     54.10
## mean            13150.31     104.17    65.91     53.72
## SE.mean           550.30       2.76     0.15      0.17
## CI.mean.0.95     1085.01       5.44     0.30      0.34
## var          62080552.87    1562.60     4.60      5.97
## std.dev          7879.12      39.53     2.15      2.44
## coef.var            0.60       0.38     0.03      0.05

Since price has the largest variance, if we wouldnt standardise, it would capture just that data.

R <- cor(mydata_PCA) 
round(R, 3)
##            price horsepower carwidth carheight
## price      1.000      0.750    0.725     0.140
## horsepower 0.750      1.000    0.641    -0.109
## carwidth   0.725      0.641    1.000     0.279
## carheight  0.140     -0.109    0.279     1.000
library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA)) 
## $chisq
## [1] 381.2312
## 
## $p.value
## [1] 3.023662e-79
## 
## $df
## [1] 6

H0:P=I
H1: not the same

We can reject the H0 at p<o,oo1 and conclude that the population correlation matrix is different to the idenntity matrix.

det(R) 
## [1] 0.1512469
library(psych)
KMO(R) 
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA =  0.65
## MSA for each item = 
##      price horsepower   carwidth  carheight 
##       0.70       0.63       0.72       0.26

BSD on the KMO or MSA criteria, this data is mediocre, but in the PCA we dont have to put that much focus on this its more relevant in FA.

we will lose the most data for car height.

library(FactoMineR) 
components <- PCA(mydata_PCA, 
                  scale.unit = TRUE,
                  graph = FALSE) 

library(factoextra) 
get_eigenvalue(components) 
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.4343836        60.859589                    60.85959
## Dim.2  1.0824703        27.061759                    87.92135
## Dim.3  0.2725821         6.814554                    94.73590
## Dim.4  0.2105639         5.264099                   100.00000

From this table we can see that the first Pc captures 60.86% of all the information.

library(factoextra)
fviz_eig(components,
         choice = "eigenvalue",
         main = "Scree plot",
         ylab = "Eigenvalue",
         xlab = "Principal component",
         addlabels = TRUE)

The most evident break on this graph is at PC3, so the theory suggest we should take one less, therefore two, but I will do another test just to be sure.

library(psych)
fa.parallel(mydata_PCA, 
            sim = FALSE, 
            fa = "pc") 

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  1

Also this test suggest that we should take two principle components, since this is the last number where the empirical are larger than theoretical.

library(FactoMineR)
components <- PCA(mydata_PCA, 
                  scale.unit = TRUE, 
                  graph = FALSE,
                  ncp = 2) 

components
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 205 individuals, described by 4 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
print(components$var$cor) 
##                Dim.1      Dim.2
## price      0.9223757 -0.0430613
## horsepower 0.8657838 -0.3493165
## carwidth   0.8915518  0.1697441
## carheight  0.1978899  0.9642515

Linear relationship between horsepower and PC1 are positive and strong.

The first and second PCs capture 85% info of price.

we can also see that the PC2 represents contrast.

print(components$var$contrib) 
##                Dim.1      Dim.2
## price      34.948352  0.1713004
## horsepower 30.791432 11.2725512
## carwidth   32.651579  2.6617877
## carheight   1.608637 85.8943607

Price represents 34,95% of info in the PC1.

library(factoextra)
fviz_pca_var(components, 
             repel = TRUE)

library(factoextra)
fviz_pca_biplot(components) 

Unit 69 is above average and is high and wide

Unit 1 is bellow average and is high in price and horsepower.

I see here that those are maybe not the best variables since i cant group the two and two into a category like we did at lectures (soft and hard skills), especially the bottom two, but all a man can do is learn.

head(components$ind$coord)
##        Dim.1      Dim.2
## 1 -0.6168804 -2.0703360
## 2 -0.3908630 -2.0861597
## 3  0.7753753 -0.9767696
## 4  0.1376175  0.2551759
## 5  0.6372047  0.1413076
## 6  0.3122337 -0.2684167
components$ind$coord[103, ]
##     Dim.1     Dim.2 
## 1.0487485 0.5343532
mydata_PCA_std <- scale(mydata_PCA) 
mydata_PCA_std[103, ] 
##      price horsepower   carwidth  carheight 
##  0.1584812  1.2100802  0.2760554  0.9720076

This car is above average in all categories, its especially shines in its horsepower.

Conclusion:

Principal component analysis was performed on 4 standardized variables (n = 205). The KMO measure confirms the appropriateness of the variables, KMO = 0.65, so its mediocre. The MSA statistics for the individual variables are above 0.50 for three variables, while for the car height the value is only 0.27, but we retained it in the analysis anyway. Based on the parallel analysis (and other rules as well), it makes most sense to retain the first two principal components, which together summarize almost 88% of the information in the original 4 variables. Based on the component’s loadings, we conclude that PC1 (𝜆 = 2.4) represents car measurements, while PC2 (𝜆 = 1.08) represents the contrast between cars measurements and its power and price.