library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the used-car listings; the first row of the file supplies column names.
CarSale <- read_csv(file = "ToyotaCorolla.csv", col_names = TRUE)
## Rows: 1436 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): model, fuel_type, color
## dbl (35): price, age_08_04, mfg_month, mfg_year, km, hp, met_color, automati...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of rows and columns.
c(nrow(CarSale), ncol(CarSale))
## [1] 1436 38
# First few records, followed by per-column summary statistics.
head(x = CarSale)
summary(object = CarSale)
## model price age_08_04 mfg_month
## Length:1436 Min. : 4350 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 3.000
## Mode :character Median : 9900 Median :61.00 Median : 5.000
## Mean :10731 Mean :55.95 Mean : 5.549
## 3rd Qu.:11950 3rd Qu.:70.00 3rd Qu.: 8.000
## Max. :32500 Max. :80.00 Max. :12.000
## mfg_year km fuel_type hp
## Min. :1998 Min. : 1 Length:1436 Min. : 69.0
## 1st Qu.:1998 1st Qu.: 43000 Class :character 1st Qu.: 90.0
## Median :1999 Median : 63390 Mode :character Median :110.0
## Mean :2000 Mean : 68533 Mean :101.5
## 3rd Qu.:2001 3rd Qu.: 87021 3rd Qu.:110.0
## Max. :2004 Max. :243000 Max. :192.0
## met_color color automatic cc
## Min. :0.0000 Length:1436 Min. :0.00000 Min. : 1300
## 1st Qu.:0.0000 Class :character 1st Qu.:0.00000 1st Qu.: 1400
## Median :1.0000 Mode :character Median :0.00000 Median : 1600
## Mean :0.6748 Mean :0.05571 Mean : 1577
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.: 1600
## Max. :1.0000 Max. :1.00000 Max. :16000
## doors cylinders gears quarterly_tax weight
## Min. :2.000 Min. :4 Min. :3.000 Min. : 19.00 Min. :1000
## 1st Qu.:3.000 1st Qu.:4 1st Qu.:5.000 1st Qu.: 69.00 1st Qu.:1040
## Median :4.000 Median :4 Median :5.000 Median : 85.00 Median :1070
## Mean :4.033 Mean :4 Mean :5.026 Mean : 87.12 Mean :1072
## 3rd Qu.:5.000 3rd Qu.:4 3rd Qu.:5.000 3rd Qu.: 85.00 3rd Qu.:1085
## Max. :5.000 Max. :4 Max. :6.000 Max. :283.00 Max. :1615
## mfr_guarantee bovag_guarantee guarantee_period abs
## Min. :0.0000 Min. :0.0000 Min. : 3.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.: 3.000 1st Qu.:1.0000
## Median :0.0000 Median :1.0000 Median : 3.000 Median :1.0000
## Mean :0.4095 Mean :0.8955 Mean : 3.815 Mean :0.8134
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :36.000 Max. :1.0000
## airbag_1 airbag_2 airco automatic_airco
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.00000
## Mean :0.9708 Mean :0.7228 Mean :0.5084 Mean :0.05641
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## boardcomputer cd_player central_lock powered_windows
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.2946 Mean :0.2187 Mean :0.5801 Mean :0.562
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000
## power_steering radio mistlamps sport_model
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.000 Median :0.0000
## Mean :0.9777 Mean :0.1462 Mean :0.257 Mean :0.3001
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.0000
## backseat_divider metallic_rim radio_cassette parking_assistant
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.000000
## Mean :0.7702 Mean :0.2047 Mean :0.1455 Mean :0.002786
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000000
## tow_bar
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2779
## 3rd Qu.:1.0000
## Max. :1.0000
2.11 The dataset ToyotaCorolla.csv contains data on used cars on sale during the late summer of 2004 in the Netherlands. It has 1436 records containing details on 38 attributes, including Price, Age, Kilometers, HP, and other specifications. a. Explore the data using the data visualization capabilities of R. Which of the pairs among the variables seem to be correlated?
library(ggplot2)
# Histogram of price using bins of width 1000.
ggplot(data = CarSale, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1000)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Keep only the numeric columns whose pairwise correlations are examined.
df.num <- select(
  CarSale,
  price, age_08_04, km, hp, quarterly_tax, weight
)
# Pairwise correlation matrix, rounded to two decimals for display.
df.cor <- round(cor(df.num), digits = 2)
head(df.cor)
## price age_08_04 km hp quarterly_tax weight
## price 1.00 -0.88 -0.57 0.31 0.22 0.58
## age_08_04 -0.88 1.00 0.51 -0.16 -0.20 -0.47
## km -0.57 0.51 1.00 -0.33 0.28 -0.03
## hp 0.31 -0.16 -0.33 1.00 -0.30 0.09
## quarterly_tax 0.22 -0.20 0.28 -0.30 1.00 0.63
## weight 0.58 -0.47 -0.03 0.09 0.63 1.00
# Reshape the correlation matrix to long form: one row per (Var1, Var2) pair
# with the correlation stored in `value`.
melt_df.cor <- melt(df.cor)
head(melt_df.cor)
# Heatmap: one tile per variable pair, filled by the correlation value.
ggplot(melt_df.cor, aes(Var1, Var2, fill = value)) +
  geom_tile()
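We can see that quarterly_tax and weight have a high correlation of 0.63, and that age_08_04 has a fairly high correlation of 0.51 with km.
The strongest relationships involve price: it is strongly negatively correlated with age_08_04 (-0.88) and km (-0.57), and positively correlated with weight (0.58).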
# Distinct values of the columns that might need recoding.
unique(CarSale[["metallic_rim"]]) # metallic_rim is already binary (0/1)
## [1] 0 1
unique(CarSale[["fuel_type"]])
## [1] "Diesel" "Petrol" "CNG"
unique(CarSale[["color"]]) # color is another categorical column
## [1] "Blue" "Silver" "Black" "White" "Grey" "Red" "Green" "Yellow"
## [9] "Violet" "Beige"
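Convert the categorical data into binary (dummy) variables. The exercise suggests model.matrix; here fastDummies::dummy_cols() is used instead, which appends one binary column per category level.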
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# Append one dummy (binary) column per distinct value of fuel_type.
df.dummies <- CarSale %>%
  dummy_cols(select_columns = "fuel_type")
# Left disabled: df.dummies <- as.data.frame(df.dummies)
# Show the column names as a one-column matrix so each name prints with its
# position.
matrix(names(df.dummies), ncol = 1)
## [,1]
## [1,] "model"
## [2,] "price"
## [3,] "age_08_04"
## [4,] "mfg_month"
## [5,] "mfg_year"
## [6,] "km"
## [7,] "fuel_type"
## [8,] "hp"
## [9,] "met_color"
## [10,] "color"
## [11,] "automatic"
## [12,] "cc"
## [13,] "doors"
## [14,] "cylinders"
## [15,] "gears"
## [16,] "quarterly_tax"
## [17,] "weight"
## [18,] "mfr_guarantee"
## [19,] "bovag_guarantee"
## [20,] "guarantee_period"
## [21,] "abs"
## [22,] "airbag_1"
## [23,] "airbag_2"
## [24,] "airco"
## [25,] "automatic_airco"
## [26,] "boardcomputer"
## [27,] "cd_player"
## [28,] "central_lock"
## [29,] "powered_windows"
## [30,] "power_steering"
## [31,] "radio"
## [32,] "mistlamps"
## [33,] "sport_model"
## [34,] "backseat_divider"
## [35,] "metallic_rim"
## [36,] "radio_cassette"
## [37,] "parking_assistant"
## [38,] "tow_bar"
## [39,] "fuel_type_CNG"
## [40,] "fuel_type_Diesel"
## [41,] "fuel_type_Petrol"
# Seed the RNG so the random split below is reproducible.
set.seed(1)
# Training partition: half of the row names, drawn at random.
train.rows <- sample(
  x = rownames(df.dummies),
  size = nrow(df.dummies) * 0.5
)
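The validation partition (sometimes called the test partition) is used to assess the predictive performance of each model so that you can compare models and choose the best one. In some algorithms (e.g., classification and regression trees, k-nearest neighbors), the validation partition may also be used in an automated fashion to tune and improve the model.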
# Validation partition: 30% of all rows, drawn at random from the row names
# that are not already in the training partition.
# nrow * 0.3 is fractional for this data (1436 * 0.3 = 430.8); floor() states
# the intended whole-number sample size explicitly instead of passing a
# non-integer size to sample().
valid.rows <- sample(
  setdiff(rownames(df.dummies), train.rows),
  floor(nrow(df.dummies) * 0.3)
)
The test partition (sometimes called the holdout or evaluation partition) is used to assess the performance of the chosen model with new data.
# Test partition: every row name not assigned to training or validation.
test.rows <- setdiff(rownames(df.dummies), c(train.rows, valid.rows))
Create the 3 data frames by collecting all columns from the appropriate rows
# Materialise each partition: select rows by row name and keep every column.
train.data <- df.dummies[train.rows, , drop = FALSE]
valid.data <- df.dummies[valid.rows, , drop = FALSE]
test.data <- df.dummies[test.rows, , drop = FALSE]