library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Load the used-BMW listings. The file is comma-separated and its decimal
# values are written with "." (e.g. mpg 68.9), so `dec` must be "."; with the
# previous `dec = ";"` the decimal columns (mpg, engineSize) were not
# recognised as numbers and were read in as text.
CarMarket <- read.table(
  "~/IMB/Multivariate analysis/HOMEWORK MVA/bmw.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)
# Sort the listings by price in descending order (most expensive first).
CarMarket <- CarMarket[order(CarMarket$price, decreasing = TRUE), ]
head(CarMarket)
##          model year  price transmission mileage fuelType tax  mpg engineSize
## 3639  2 Series 2015 123456    Semi-Auto   33419   Diesel  20 68.9        2.0
## 5363        M4 2016  99950    Automatic     771   Petrol 300 33.2        3.0
## 2910        M4 2017  89990    Semi-Auto    1336   Petrol 145 33.2        3.0
## 4777        M5 2019  89900    Semi-Auto    2269   Petrol 145 24.1        4.4
## 1814  8 Series 2019  88980    Semi-Auto      88   Petrol 145 24.4        4.4
## 722   8 Series 2019  84898    Semi-Auto    3185   Petrol 145 24.4        4.4

This data set covers the British used car market, or, to be precise, the used BMW market. The unit of observation is an individual car, and its properties are recorded as the variables of the data set. The sample consists of 10781 observations of 9 variables:

- model: the BMW model of the car, ranging from the 1–8 Series and X1–X8 (M models such as the M4 and M5 also appear in the data)
- year of production: the year in which the car was produced
- price: the price of the car in GBP
- transmission: the kind of transmission used in the vehicle (manual, semi-automatic or automatic)
- mileage: the distance the vehicle has already travelled, in miles
- fuel type: the type of fuel used to power the powertrain (diesel, petrol, hybrid or other)
- tax: the tax payment required upon purchase of the vehicle
- mpg: the fuel economy of the vehicle, expressed as the distance driven (in miles) per gallon of fuel (1 imperial gallon ≈ 4.55 L)
- engine size: the displacement of the vehicle's engine, in litres

This data is sourced from Kaggle: https://www.kaggle.com/datasets/adityadesai13/used-car-dataset-ford-and-mercedes?select=bmw.csv

Using this data, I will try to examine which of the two chosen variables has the larger effect on the price of a vehicle in the British used car market, specifically for BMW vehicles. The variables I have chosen are mileage and fuel economy. I will be using this data in further assignments, and the main purpose of the data set will be developed through those assignments.

For the first assignment, this data will be used to assess the general properties of the data set, specifically by presenting descriptive statistics of prices, namely prices in relation to mileage.

# Recode the two categorical columns as factors. The levels keep the order
# listed here and are labelled with numeric codes:
#   transmission: 1 = Manual, 2 = Semi-Auto, 3 = Automatic
#   fuelType:     1 = Diesel, 2 = Petrol,    3 = Hybrid,   4 = Other
# Any value that is not listed in `levels` becomes NA.
CarMarket[["transmission"]] <- factor(
  CarMarket[["transmission"]],
  levels = c("Manual", "Semi-Auto", "Automatic"),
  labels = c("1", "2", "3")
)
CarMarket[["fuelType"]] <- factor(
  CarMarket[["fuelType"]],
  levels = c("Diesel", "Petrol", "Hybrid", "Other"),
  labels = c("1", "2", "3", "4")
)

Here I’ve converted the transmission and fuel type variables to factors, in order to make further analysis possible.

# Coerce the continuous measures to numeric (double). A factor is converted
# through as.character() first, because as.numeric() on a factor returns the
# internal level codes rather than the recorded values; character and numeric
# columns are converted directly. The former `CarMarket <- CarMarket`
# self-assignment was a no-op and has been dropped.
CarMarket[c("mileage", "mpg", "engineSize")] <- lapply(
  CarMarket[c("mileage", "mpg", "engineSize")],
  function(column) {
    if (is.factor(column)) {
      column <- as.character(column)
    }
    as.numeric(column)
  }
)

Following up, I’ve coerced the variables: mileage, fuel economy and engine size into numeric variables.

# Keep only the numeric variables used for the descriptive statistics (year,
# price, mileage, mpg, engineSize). The dropped columns are identified by name
# rather than by position (previously columns 1, 4, 6 and 7), so the subset
# stays correct even if the column order of the source file changes.
CarMarketfit <- CarMarket[
  ,
  !(names(CarMarket) %in% c("model", "transmission", "fuelType", "tax"))
]

I’ve excluded the variables model, transmission, fuel type and tax because they are not relevant in this phase.

library(ggplot2)
# Histogram of prices across all listings. The stray empty argument in
# geom_histogram() has been removed and the bin count is stated explicitly:
# 30 is the value ggplot2 was already falling back to, so the figure is
# unchanged while the "Pick better value with `binwidth`" message goes away.
ggplot(CarMarket, aes(x = price)) +
  geom_histogram(bins = 30, colour = "black") +
  ylab("Frequency") +
  xlab("price")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using ggplot, I’ve graphically presented the price distribution of BMW vehicles on the used car market in the UK. We can observe the prices are skewed to the right.

library(pastecs)
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first, last
# Descriptive statistics (pastecs::stat.desc) for every column of the numeric
# subset, rounded to whole numbers for display.
round(
  stat.desc(CarMarketfit),
  digits = 0
)
##                  year     price   mileage    mpg engineSize
## nbr.val         10781     10781     10781  10781      10781
## nbr.null            0         0         0      0         47
## nbr.na              0         0         0      0          0
## min              1996      1200         1      6          0
## max              2020    123456    214000    471          7
## range              24    122256    213999    465          7
## sum          21746128 245088881 274883012 608038      23371
## median           2017     20462     18347     53          2
## mean             2017     22733     25497     56          2
## SE.mean             0       110       242      0          0
## CI.mean.0.95        0       216       475      1          0
## var                 6 130314284 632180132    982          0
## std.dev             2     11416     25143     31          1
## coef.var            0         1         1      1          0

Using the stat.desc function, I’ve obtained the basic descriptive statistics of the data set. The year variable ranges from a minimum of 1996 to a maximum of 2020. The price variable ranges from a minimum of 1200 GBP to a maximum of 123456 GBP (a range of 122256 GBP). The mileage variable ranges from a minimum of 1 mile to a maximum of 214000 miles. The MPG variable ranges from a minimum of 6 MPG to a maximum of 471 MPG. The engine size ranges from a minimum of 0 (electric powertrain) to a maximum of 7 liters of displacement. The arithmetic mean and the median of the variables year, price, mileage, MPG and engine size are as follows: Year: mean = 2017, median = 2017; Price: mean = 22733 GBP, median = 20462 GBP; Mileage: mean = 25497 miles, median = 18347 miles; MPG: mean = 56, median = 53; Engine size: mean = 2, median = 2.

The median denotes the value that splits the observations in half: 50% of observations have a higher value and 50% have a lower value. The mean denotes the arithmetic mean, which reflects the central tendency of the data set.

# Restrict the numeric subset to rows whose `year` is 2020 or later
# (dplyr::filter() keeps the rows for which the condition is TRUE).
CarMarketfit2020 <- CarMarketfit %>%
  filter(year >= 2020)

# Preview the first rows of the 2020 subset.
head(CarMarketfit2020)
##   year price mileage  mpg engineSize
## 1 2020 79991     151 26.7          3
## 2 2020 79566    1000 24.8          3
## 3 2020 78490    4919 31.4          3
## 4 2020 78000    5000 31.4          3
## 5 2020 77995    7500 31.4          3
## 6 2020 77990    5656 31.4          3

I’ve filtered the data set to include only vehicles that were produced in the year 2020 or later, in order to see the differences in the observed variables between the whole used BMW market and the 2020 market.

library(ggplot2)
# Histogram of prices for the 2020 subset. The misspelled `bimwidth = 5`
# argument was silently ignored by ggplot2 (with a warning), so the plot was
# actually drawn with the default 30 bins. That bin count is now stated
# explicitly, which keeps the rendered figure and removes both warnings.
# A literal `binwidth = 5` is not used: on prices spanning tens of thousands
# of GBP it would produce thousands of near-empty bins.
ggplot(CarMarketfit2020, aes(x = price)) +
  geom_histogram(bins = 30, colour = "black") +
  ylab("Frequency") +
  xlab("price")
## Warning in geom_histogram(bimwidth = 5, colour = "black"): Ignoring unknown
## parameters: `bimwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Once again we can see that the prices are skewed to the right. That is to be expected, since the vehicles which are priced higher are less attainable for the general consumer.

library(pastecs)

# Descriptive statistics (pastecs::stat.desc) for every column of the 2020
# subset, rounded to whole numbers for display.
round(
  stat.desc(CarMarketfit2020),
  digits = 0
)
##                 year     price mileage   mpg engineSize
## nbr.val          733       733     733   733        733
## nbr.null           0         0       0     0          0
## nbr.na             0         0       0     0          0
## min             2020     11995       1     6          2
## max             2020     79991   11512   188          4
## range              0     67996   11511   183          3
## sum          1480660  25931840 1096086 33646       1612
## median          2020     32583     500    48          2
## mean            2020     35378    1495    46          2
## SE.mean            0       418      74     0          0
## CI.mean.0.95       0       821     146     1          0
## var                0 128228784 4058854   167          0
## std.dev            0     11324    2015    13          1
## coef.var           0         0       1     0          0

Using the stat.desc function, I’ve obtained the basic statistic values of the data set consisting only of vehicles produced in 2020.

The price variable ranges from a minimum of 11995 GBP to a maximum of 79991 GBP. The mileage variable ranges from a minimum of 1 mile to a maximum of 11512 miles. The MPG variable ranges from a minimum of 6 MPG to a maximum of 188 MPG. The engine size ranges from a minimum of 2 to a maximum of 4 liters of displacement. The year is 2020 for every observation in this subset. The arithmetic mean and the median of the variables price, mileage, MPG and engine size are as follows: Price: mean = 35378 GBP, median = 32583 GBP; Mileage: mean = 1495 miles, median = 500 miles; MPG: mean = 46, median = 48; Engine size: mean = 2, median = 2.

The median denotes the value that splits the observations in half: 50% of observations have a higher value and 50% have a lower value. The mean denotes the arithmetic mean, which reflects the central tendency of the data set.

# Same 2020-or-later restriction as for the numeric subset, but applied to the
# full data set so that `model` and the factor columns remain available.
CarMarket2020 <- CarMarket %>%
  filter(year >= 2020)

# Preview the first rows of the filtered data.
head(CarMarket2020)
##       model year price transmission mileage fuelType tax  mpg engineSize
## 1  8 Series 2020 79991            2     151        1 145 26.7          3
## 2        X7 2020 79566            2    1000        2 145 24.8          3
## 3        X7 2020 78490            2    4919        1 145 31.4          3
## 4        X7 2020 78000            2    5000        1 150 31.4          3
## 5        X7 2020 77995            2    7500        1 150 31.4          3
## 6        X7 2020 77990            2    5656        1 150 31.4          3
# Box plots of price for each model among the 2020 listings. The axis text is
# shrunk (cex.axis = 0.5) to leave more room for the model labels.
boxplot(
  price ~ model,
  data = CarMarket2020,
  main = "Price by model",
  xlab = "Model",
  ylab = "Price",
  cex.axis = 0.5
)

In the graphic above, we can observe the distribution of prices of BMW vehicles manufactured in 2020, broken down by model.