library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the used-BMW listings. The file is comma-separated and uses "." as the
# decimal mark (e.g. mpg = 68.9), so `dec` must be "."; with dec = ";" the
# decimal columns (mpg, engineSize) are not recognised as numbers and are
# imported as text instead.
CarMarket <- read.table(
  "~/IMB/Multivariate analysis/HOMEWORK MVA/bmw.csv",
  header = TRUE, sep = ",", dec = "."
)
# Sort the listings by price, most expensive first; indexing with order()
# keeps the original row names, so rows can still be traced back to the file.
CarMarket <- CarMarket[order(CarMarket$price, decreasing = TRUE), ]
# Preview the six most expensive listings.
head(CarMarket)
## model year price transmission mileage fuelType tax mpg engineSize
## 3639 2 Series 2015 123456 Semi-Auto 33419 Diesel 20 68.9 2.0
## 5363 M4 2016 99950 Automatic 771 Petrol 300 33.2 3.0
## 2910 M4 2017 89990 Semi-Auto 1336 Petrol 145 33.2 3.0
## 4777 M5 2019 89900 Semi-Auto 2269 Petrol 145 24.1 4.4
## 1814 8 Series 2019 88980 Semi-Auto 88 Petrol 145 24.4 4.4
## 722 8 Series 2019 84898 Semi-Auto 3185 Petrol 145 24.4 4.4
This data set covers the British used car market, or, to be precise, the used BMW market. The unit of observation is an individual car, and its properties are recorded as the variables of the data set. The sample consists of 10781 observations of 9 variables. The variables are:
- model: the specific model within the BMW line-up, ranging from the 1–8 Series and X1–X8, as well as M models (e.g. M4, M5)
- year: the year in which the specific car was produced
- price: price of the car in GBP
- transmission: the kind of transmission used in the vehicle (manual, semi-automatic or automatic)
- mileage: the distance the vehicle has already travelled, in miles
- fuelType: the type of fuel used to power the powertrain
- tax: required tax payment upon purchase of the vehicle
- mpg: fuel economy of the vehicle, expressed as the distance driven (in miles) per gallon of fuel (one imperial gallon ≈ 4.55 L)
- engineSize: the displacement of the vehicle's engine, in litres
This data is sourced from Kaggle: https://www.kaggle.com/datasets/adityadesai13/used-car-dataset-ford-and-mercedes?select=bmw.csv
Using this data, I will try to examine which of two chosen variables has the larger effect on the price of a vehicle in the British used car market, specifically for BMW vehicles. The variables I have chosen are mileage and fuel economy. I will be using this data in further assignments, and the main purpose of this dataset will be developed through those assignments.
For the first assignment, this data will be used to assess the general properties of the dataset, specifically by presenting descriptive statistics of prices, namely prices in relation to mileage.
# Recode the two categorical columns as factors with numeric-looking labels.
# Values that are not listed in `levels` become NA.
# transmission: 1 = Manual, 2 = Semi-Auto, 3 = Automatic
CarMarket[["transmission"]] <- factor(
  CarMarket[["transmission"]],
  levels = c("Manual", "Semi-Auto", "Automatic"),
  labels = as.character(1:3)
)
# fuelType: 1 = Diesel, 2 = Petrol, 3 = Hybrid, 4 = Other
CarMarket[["fuelType"]] <- factor(
  CarMarket[["fuelType"]],
  levels = c("Diesel", "Petrol", "Hybrid", "Other"),
  labels = as.character(1:4)
)
Here I’ve converted the transmission and fuel type variables to factors, in order to make further analysis possible.
# Make sure the three measurement columns are stored as numeric (double)
# vectors before computing descriptive statistics on them.
# (The former `CarMarket <- CarMarket` self-assignment was a no-op and has
# been removed.)
CarMarket[["mileage"]] <- as.numeric(CarMarket[["mileage"]])
CarMarket[["mpg"]] <- as.numeric(CarMarket[["mpg"]])
CarMarket[["engineSize"]] <- as.numeric(CarMarket[["engineSize"]])
Following up, I’ve coerced the variables: mileage, fuel economy and engine size into numeric variables.
# Keep only the numeric variables (year, price, mileage, mpg, engineSize).
# Columns are dropped by name rather than by position (formerly columns
# 1, 4, 6 and 7) so the selection does not silently change if the column
# order of the input changes; drop = FALSE guarantees a data frame result.
CarMarketfit <- CarMarket[
  ,
  !(names(CarMarket) %in% c("model", "transmission", "fuelType", "tax")),
  drop = FALSE
]
I’ve excluded the variables model, transmission, fuel type and tax because they are not relevant in this phase.
library(ggplot2)
# Histogram of asking prices for all listings.
# The stray empty argument in geom_histogram() is removed, and the number of
# bins is stated explicitly (30, the value stat_bin() falls back to), so the
# figure is unchanged but the "Pick better value" message is no longer emitted.
ggplot(CarMarket, aes(x = price)) +
  geom_histogram(bins = 30, colour = "black") +
  ylab("Frequency") +
  xlab("price")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using ggplot, I’ve graphically presented the price distribution of BMW
vehicles on the used car market in the UK. We can observe the prices are
skewed to the right.
library(pastecs)
##
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
##
## extract
## The following objects are masked from 'package:dplyr':
##
## first, last
# Descriptive statistics for every numeric variable, rounded to whole numbers.
CarMarketfit %>%
  pastecs::stat.desc() %>%
  round()
## year price mileage mpg engineSize
## nbr.val 10781 10781 10781 10781 10781
## nbr.null 0 0 0 0 47
## nbr.na 0 0 0 0 0
## min 1996 1200 1 6 0
## max 2020 123456 214000 471 7
## range 24 122256 213999 465 7
## sum 21746128 245088881 274883012 608038 23371
## median 2017 20462 18347 53 2
## mean 2017 22733 25497 56 2
## SE.mean 0 110 242 0 0
## CI.mean.0.95 0 216 475 1 0
## var 6 130314284 632180132 982 0
## std.dev 2 11416 25143 31 1
## coef.var 0 1 1 1 0
Using the stat.desc function, I’ve obtained the basic descriptive statistics of the data set. The year variable ranges from a minimum of 1996 to a maximum of 2020. The price variable ranges from a minimum of 1200 GBP to a maximum of 123456 GBP (a range of 122256 GBP). The mileage variable ranges from a minimum of 1 mile to a maximum of 214000 miles. The MPG variable ranges from a minimum of 6 MPG to a maximum of 471 MPG. The engine size ranges from a minimum of 0 (electric powertrain) to a maximum of 7 liters of displacement. The arithmetic mean and the median of the variables year, price, mileage, MPG and engine size are as follows: Year: mean = 2017; median = 2017. Price: mean = 22733 GBP; median = 20462 GBP. Mileage: mean = 25497 miles; median = 18347 miles. MPG: mean = 56; median = 53. Engine size: mean = 2; median = 2.
The median is the value that splits the ordered observations in half: 50% of the observations lie at or below it and 50% lie at or above it. The mean is the arithmetic average of the observations; both are measures of the central tendency of the dataset.
# Restrict the numeric data set to cars produced in 2020 or later and
# preview the first rows of the result.
CarMarketfit2020 <- CarMarketfit %>%
  dplyr::filter(year >= 2020)
head(CarMarketfit2020)
## year price mileage mpg engineSize
## 1 2020 79991 151 26.7 3
## 2 2020 79566 1000 24.8 3
## 3 2020 78490 4919 31.4 3
## 4 2020 78000 5000 31.4 3
## 5 2020 77995 7500 31.4 3
## 6 2020 77990 5656 31.4 3
I’ve filtered the dataset, to include only units of vehicles, which were produced in the year 2020 or after in order to see the differences in the observed variables between the whole used BMW market, and the 2020 market.
library(ggplot2)
# Histogram of asking prices for cars produced in 2020 or later.
# The original call passed the misspelt `bimwidth = 5`, which ggplot2 ignores
# with a warning and then falls back to 30 bins. A literal `binwidth = 5`
# would mean 5-GBP-wide bins on a price axis spanning tens of thousands of
# GBP, so the number of bins is stated explicitly instead; this reproduces
# the figure that was actually rendered, without the warning.
ggplot(CarMarketfit2020, aes(x = price)) +
  geom_histogram(bins = 30, colour = "black") +
  ylab("Frequency") +
  xlab("price")
## Warning in geom_histogram(bimwidth = 5, colour = "black"): Ignoring unknown
## parameters: `bimwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Once again we can see that the prices are skewed to the right. This is
to be expected, since the units which are priced higher are less
attainable for the general consumer.
library(pastecs)
# Descriptive statistics for the 2020 subset, rounded to whole numbers.
CarMarketfit2020 %>%
  pastecs::stat.desc() %>%
  round()
## year price mileage mpg engineSize
## nbr.val 733 733 733 733 733
## nbr.null 0 0 0 0 0
## nbr.na 0 0 0 0 0
## min 2020 11995 1 6 2
## max 2020 79991 11512 188 4
## range 0 67996 11511 183 3
## sum 1480660 25931840 1096086 33646 1612
## median 2020 32583 500 48 2
## mean 2020 35378 1495 46 2
## SE.mean 0 418 74 0 0
## CI.mean.0.95 0 821 146 1 0
## var 0 128228784 4058854 167 0
## std.dev 0 11324 2015 13 1
## coef.var 0 0 1 0 0
Using the stat.desc function, I’ve obtained the basic statistic values of the data set consisting only of vehicles produced in 2020.
The price variable ranges from a minimum of 11995 GBP to a maximum of 79991 GBP. The mileage variable ranges from a minimum of 1 mile to a maximum of 11512 miles. The MPG variable ranges from a minimum of 6 MPG to a maximum of 188 MPG. The engine size ranges from a minimum of 2 to a maximum of 4 liters of displacement. The year is 2020 for every vehicle in this subset. The arithmetic mean and the median of the variables price, mileage, MPG and engine size are as follows: Price: mean = 35378 GBP; median = 32583 GBP. Mileage: mean = 1495 miles; median = 500 miles. MPG: mean = 46; median = 48. Engine size: mean = 2; median = 2.
The median is the value that splits the ordered observations in half: 50% of the observations lie at or below it and 50% lie at or above it. The mean is the arithmetic average of the observations; both are measures of the central tendency of the dataset.
# Restrict the full data set (all nine variables) to cars produced in 2020
# or later and preview the first rows of the result.
CarMarket2020 <- CarMarket %>%
  dplyr::filter(year >= 2020)
head(CarMarket2020)
## model year price transmission mileage fuelType tax mpg engineSize
## 1 8 Series 2020 79991 2 151 1 145 26.7 3
## 2 X7 2020 79566 2 1000 2 145 24.8 3
## 3 X7 2020 78490 2 4919 1 145 31.4 3
## 4 X7 2020 78000 2 5000 1 150 31.4 3
## 5 X7 2020 77995 2 7500 1 150 31.4 3
## 6 X7 2020 77990 2 5656 1 150 31.4 3
# Box plot of price for each model among the 2020 cars; the axis text is
# shrunk (cex.axis = 0.5) so that more model names fit along the x axis.
graphics::boxplot(
  price ~ model,
  data = CarMarket2020,
  cex.axis = 0.5,
  xlab = "Model",
  ylab = "Price",
  main = "Price by model"
)
In the boxplot above, we can observe the price distribution of BMW
vehicles manufactured in 2020, by model of the specific vehicle.