data(cars,package = "caret")
str(cars) # Data description
## 'data.frame': 804 obs. of 18 variables:
## $ Price : num 22661 21725 29143 30732 33359 ...
## $ Mileage : int 20105 13457 31655 22479 17590 23635 17381 27558 25049 17319 ...
## $ Cylinder : int 6 6 4 4 4 4 4 4 4 4 ...
## $ Doors : int 4 2 2 2 2 2 2 2 2 4 ...
## $ Cruise : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Sound : int 0 1 1 0 1 0 1 0 0 0 ...
## $ Leather : int 0 0 1 0 1 0 1 1 0 1 ...
## $ Buick : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Cadillac : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Chevy : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Pontiac : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Saab : int 0 0 1 1 1 1 1 1 1 1 ...
## $ Saturn : int 0 0 0 0 0 0 0 0 0 0 ...
## $ convertible: int 0 0 1 1 1 1 1 1 1 0 ...
## $ coupe : int 0 1 0 0 0 0 0 0 0 0 ...
## $ hatchback : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sedan : int 1 0 0 0 0 0 0 0 0 1 ...
## $ wagon : int 0 0 0 0 0 0 0 0 0 0 ...
plot_num(cars)
df_status(cars) # Number of NA's
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 Price 0 0.00 0 0 0 0 numeric 798
## 2 Mileage 0 0.00 0 0 0 0 integer 791
## 3 Cylinder 0 0.00 0 0 0 0 integer 3
## 4 Doors 0 0.00 0 0 0 0 integer 2
## 5 Cruise 199 24.75 0 0 0 0 integer 2
## 6 Sound 258 32.09 0 0 0 0 integer 2
## 7 Leather 222 27.61 0 0 0 0 integer 2
## 8 Buick 724 90.05 0 0 0 0 integer 2
## 9 Cadillac 724 90.05 0 0 0 0 integer 2
## 10 Chevy 484 60.20 0 0 0 0 integer 2
## 11 Pontiac 654 81.34 0 0 0 0 integer 2
## 12 Saab 690 85.82 0 0 0 0 integer 2
## 13 Saturn 744 92.54 0 0 0 0 integer 2
## 14 convertible 754 93.78 0 0 0 0 integer 2
## 15 coupe 664 82.59 0 0 0 0 integer 2
## 16 hatchback 744 92.54 0 0 0 0 integer 2
## 17 sedan 314 39.05 0 0 0 0 integer 2
## 18 wagon 740 92.04 0 0 0 0 integer 2
profiling_num(cars) # The mean, standart deviation, ...
mx <- round(cor(cars),2)
sort((abs(mx[,1])),decreasing = TRUE)
## Price Cadillac Cylinder convertible Cruise Chevy
## 1.00 0.66 0.57 0.51 0.43 0.40
## Saab Saturn hatchback coupe Leather Mileage
## 0.34 0.21 0.21 0.17 0.16 0.14
## Doors Pontiac Sound wagon sedan Buick
## 0.14 0.14 0.12 0.05 0.03 0.02
pca1 <- prcomp(cars) # PCA without scaling the data
pca2 <- prcomp(cars, scale = TRUE) # PCA scaling the data
eigVal <- get_eigenvalue(pca1)
eigVal[order(-eigVal$eigenvalue),]
eigVal <- get_eigenvalue(pca2)
eigVal[order(-eigVal$eigenvalue),]
The number of components to be selected depends on the scale of the data. If the scale the data, multiple variables are need, at least some of the most relevant(not including the Price) If we do not scale, apart from the Price, the only variable that affects the data is the Mileage column.
fviz_eig(pca1)
fviz_eig(pca2)
distMtx <- dist(cars)
mds1 <- cmdscale(distMtx, k = 2)
require("ggrepel")
## Loading required package: ggrepel
mds1df <- as.data.frame(mds1)
rownames(mds1df) <- rownames(cars)
mds1df %>% ggplot(aes(x=V1,y=V2))+
geom_point() +
geom_label(label=rownames(mds1df))
It is different, we can see the distance between the rows, as we can see a few outliers at the right
The previous plot, the values at the most right, those are the top 6 vehicles from the nº196 all the way down to the nº199
ds <- arrange(subset(cars), -Price)
head(ds)
# Function that returns Root Mean Squared Error
rmse <- function(error)
{
sqrt(mean(error^2))
}
# Function that returns Mean Absolute Error
mae <- function(error)
{
mean(abs(error))
}
mod31f1 <- glm(Price~Mileage,data=cars) # Simple linear regression
ypred <- predict(mod31f1) # Get the predicted values
yreal <- cars %>% select(Price) %>% pull()# Get the real values
rmse((yreal - ypred))
## [1] 9777.105
mae((yreal - ypred))
## [1] 7596.28
cars %>% ggplot(aes(y=Price,x=Mileage)) + geom_point() + geom_smooth(method='lm',formula=y~x)+theme_bw()
mod31f2 <- glm(Price~Cylinder,data=cars) # Simple linear regression
ypred <- predict(mod31f2) # Get the predicted values
yreal <- cars %>% select(Price) %>% pull()# Get the real values
rmse((yreal - ypred))
## [1] 8123.04
mae((yreal - ypred))
## [1] 6287.96
cars %>% ggplot(aes(y=Price,x=Cylinder)) + geom_point() + geom_smooth(method='lm',formula=y~x)+theme_bw()
mod31f3 <- glm(Price~Cadillac,data=cars) # Simple linear regression
ypred <- predict(mod31f3) # Get the predicted values
yreal <- cars %>% select(Price) %>% pull()# Get the real values
rmse((yreal - ypred))
## [1] 7427.627
mae((yreal - ypred))
## [1] 5779.996
cars %>% ggplot(aes(y=Price,x=Cadillac)) + geom_point() + geom_smooth(method='lm',formula=y~x)+theme_bw()
mod32f1 <- glm(Price~Mileage+Cylinder,data=cars)
ypred <- predict(mod32f1) # Get the predicted values
yreal <- cars %>% select(Price) %>% pull()# Get the real values
rmse((yreal - ypred))
## [1] 8026.587
mae((yreal - ypred))
## [1] 6247.506
mod32f2 <- glm(Price~Cadillac+Cylinder+convertible+Cruise,data=cars)
ypred <- predict(mod32f2) # Get the predicted values
yreal <- cars %>% select(Price) %>% pull()# Get the real values
rmse((yreal - ypred))
## [1] 5079.622
mae((yreal - ypred))
## [1] 3819.142
mod32f3 <- train(Price~.,data = cars, method = "lmStepAIC", trace = FALSE)
ypred <- predict(mod32f3) # Get the predicted values
yreal <- cars %>% select(Price) %>% pull()# Get the real values
rmse((yreal - ypred))
## [1] 2883.754
mae((yreal - ypred))
## [1] 2107.142
training <- cars
xtrain <- training %>% dplyr::select(-Price)
ytrain <- training %>% dplyr::select(Price)
xtrainM <- as.matrix(xtrain)
ytrainM <- as.matrix(ytrain)
mod41f1 <- xgboost(data = xtrainM, nfold = 10, label = ytrainM, nrounds = 10)
## [1] train-rmse:16754.613281
## [2] train-rmse:11996.326172
## [3] train-rmse:8694.697266
## [4] train-rmse:6371.326172
## [5] train-rmse:4802.218262
## [6] train-rmse:3695.559814
## [7] train-rmse:2947.271484
## [8] train-rmse:2458.736816
## [9] train-rmse:2143.066895
## [10] train-rmse:1918.636719
ypred <- predict(mod41f1,xtrainM) # Get the predicted values
yreal <- ytrainM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 1918.637
mae((yreal - ypred))
## [1] 1415.713
dfResidAnal <- data.frame(er=as.double(error),predy=ypred)
dfResidAnal %>% ggplot(aes(x=predy,y=er))+
geom_point() + theme_bw()
mod41f2 <- xgboost(data = xtrainM, nfold = 10, label = ytrainM, nrounds = 50)
## [1] train-rmse:16754.613281
## [2] train-rmse:11996.326172
## [3] train-rmse:8694.698242
## [4] train-rmse:6371.326172
## [5] train-rmse:4802.218262
## [6] train-rmse:3695.559814
## [7] train-rmse:2947.271240
## [8] train-rmse:2458.736816
## [9] train-rmse:2143.066650
## [10] train-rmse:1918.636719
## [11] train-rmse:1789.489014
## [12] train-rmse:1704.964478
## [13] train-rmse:1629.782837
## [14] train-rmse:1582.604126
## [15] train-rmse:1553.871338
## [16] train-rmse:1507.060669
## [17] train-rmse:1480.638672
## [18] train-rmse:1443.814209
## [19] train-rmse:1425.562378
## [20] train-rmse:1412.217285
## [21] train-rmse:1377.383911
## [22] train-rmse:1346.654785
## [23] train-rmse:1331.696045
## [24] train-rmse:1308.482056
## [25] train-rmse:1266.443115
## [26] train-rmse:1258.182495
## [27] train-rmse:1238.344238
## [28] train-rmse:1232.592529
## [29] train-rmse:1218.445190
## [30] train-rmse:1190.526978
## [31] train-rmse:1175.537354
## [32] train-rmse:1171.509033
## [33] train-rmse:1164.208740
## [34] train-rmse:1146.049438
## [35] train-rmse:1104.667847
## [36] train-rmse:1094.211670
## [37] train-rmse:1071.189819
## [38] train-rmse:1062.838867
## [39] train-rmse:1014.900452
## [40] train-rmse:999.406677
## [41] train-rmse:981.311890
## [42] train-rmse:974.878662
## [43] train-rmse:957.185425
## [44] train-rmse:949.822876
## [45] train-rmse:946.817200
## [46] train-rmse:942.984009
## [47] train-rmse:927.162415
## [48] train-rmse:916.901794
## [49] train-rmse:906.028015
## [50] train-rmse:899.349121
ypred <- predict(mod41f2,xtrainM) # Get the predicted values
yreal <- ytrainM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 899.3492
mae((yreal - ypred))
## [1] 615.4764
dfResidAnal <- data.frame(er=as.double(error),predy=ypred)
dfResidAnal %>% ggplot(aes(x=predy,y=er))+
geom_point() + theme_bw()
xtrainM <- scale(xtrain,center= TRUE, scale=TRUE)
ytrainM <- as.matrix(ytrain)
mod41f3 <- xgboost(data = xtrainM, nfold = 10, label = ytrainM, nrounds = 10)
## [1] train-rmse:16754.613281
## [2] train-rmse:11996.326172
## [3] train-rmse:8694.697266
## [4] train-rmse:6371.326172
## [5] train-rmse:4802.218262
## [6] train-rmse:3695.559814
## [7] train-rmse:2947.271484
## [8] train-rmse:2458.736816
## [9] train-rmse:2143.066650
## [10] train-rmse:1918.636719
ypred <- predict(mod41f3,xtrainM) # Get the predicted values
yreal <- ytrainM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 1918.637
mae((yreal - ypred))
## [1] 1415.713
# Data split for training-testing
set.seed(1357)
ixTrain <- createDataPartition(cars$Cylinder,p=0.8,list=FALSE)
training <- cars[ixTrain,]
testing <- cars[-ixTrain,]
# Test set
xtest <- testing %>% dplyr::select(-Price)
ytest <- testing %>% dplyr::select(Price)
# Training set
xtrain <- training %>% dplyr::select(-Price)
ytrain <- training %>% dplyr::select(Price)
# Matrix transformation
xtrainM <- as.matrix(xtrain)
ytrainM <- as.matrix(ytrain)
xtestM <- as.matrix(xtest)
ytestM <- as.matrix(ytest)
mod41f4 <- xgboost(data = xtrainM, nfold = 10, label = ytrainM, nrounds = 50)
## [1] train-rmse:16778.535156
## [2] train-rmse:12058.943359
## [3] train-rmse:8751.042969
## [4] train-rmse:6445.369629
## [5] train-rmse:4847.310059
## [6] train-rmse:3740.356934
## [7] train-rmse:2982.217041
## [8] train-rmse:2477.098633
## [9] train-rmse:2135.483887
## [10] train-rmse:1908.361084
## [11] train-rmse:1765.570801
## [12] train-rmse:1654.715332
## [13] train-rmse:1591.402954
## [14] train-rmse:1529.247070
## [15] train-rmse:1497.849121
## [16] train-rmse:1430.571899
## [17] train-rmse:1380.578491
## [18] train-rmse:1358.297974
## [19] train-rmse:1329.761841
## [20] train-rmse:1313.660034
## [21] train-rmse:1301.674561
## [22] train-rmse:1290.874023
## [23] train-rmse:1281.781250
## [24] train-rmse:1262.485352
## [25] train-rmse:1240.275269
## [26] train-rmse:1208.300415
## [27] train-rmse:1162.070923
## [28] train-rmse:1159.219360
## [29] train-rmse:1134.512329
## [30] train-rmse:1107.663818
## [31] train-rmse:1091.618530
## [32] train-rmse:1074.956665
## [33] train-rmse:1059.479248
## [34] train-rmse:1030.848999
## [35] train-rmse:1017.783997
## [36] train-rmse:992.748169
## [37] train-rmse:973.890320
## [38] train-rmse:962.496948
## [39] train-rmse:947.634338
## [40] train-rmse:923.874390
## [41] train-rmse:906.229370
## [42] train-rmse:903.823730
## [43] train-rmse:885.896667
## [44] train-rmse:871.834656
## [45] train-rmse:860.201172
## [46] train-rmse:850.613831
## [47] train-rmse:842.893250
## [48] train-rmse:826.080383
## [49] train-rmse:816.890320
## [50] train-rmse:805.922119
ypred <- predict(mod41f4,xtestM) # Get the predicted values
yreal <- ytestM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 2443.1
mae((yreal - ypred))
## [1] 1739.462
require(keras)
#install_keras() # ONLY ONCE
training <- cars
xtrain <- training %>% dplyr::select(-Price)
ytrain <- training %>% dplyr::select(Price)
# Matrix transformation
xtrainM <- as.matrix(xtrain)
ytrainM <- as.matrix(ytrain)
mod51f1 <- keras_model_sequential()
mod51f1 %>%
layer_dense(units = 17, activation = 'relu', input_shape = 17) %>%
layer_dense(units = 10, activation = 'relu') %>%
layer_dense(units = 1)
mod51f1 %>% summary
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense (Dense) (None, 17) 306
## ________________________________________________________________________________
## dense_1 (Dense) (None, 10) 180
## ________________________________________________________________________________
## dense_2 (Dense) (None, 1) 11
## ================================================================================
## Total params: 497
## Trainable params: 497
## Non-trainable params: 0
## ________________________________________________________________________________
mod51f1 %>% compile(
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
)
history = mod51f1 %>% fit(
x = xtrainM, y = ytrainM,
epochs = 200,
batch_size = 20,
validation_split = 0
)
plot(history)
ypred <- mod51f1 %>% predict(xtrainM)
yreal <- ytrainM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 23511.07
mae((yreal - ypred))
## [1] 21334.98
xtrainM <- scale(xtrain,center= TRUE, scale=TRUE)
ytrainM <- as.matrix(ytrain)
mod51f2 <- keras_model_sequential()
mod51f2 %>%
layer_dense(units = 17, activation = 'relu', input_shape = 17) %>%
layer_dense(units = 10, activation = 'relu') %>%
layer_dense(units = 1)
mod51f1 %>% summary
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense (Dense) (None, 17) 306
## ________________________________________________________________________________
## dense_1 (Dense) (None, 10) 180
## ________________________________________________________________________________
## dense_2 (Dense) (None, 1) 11
## ================================================================================
## Total params: 497
## Trainable params: 497
## Non-trainable params: 0
## ________________________________________________________________________________
mod51f2 %>% compile(
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
)
history = mod51f2 %>% fit(
x = xtrainM, y = ytrainM,
epochs = 50,
batch_size = 20,
validation_split = 0
)
plot(history)
history = mod51f2 %>% fit(
x = xtrainM, y = ytrainM,
epochs = 500,
batch_size = 20,
validation_split = 0
)
plot(history)
#### The model continues improving for many epochs, in this one 300 would be enough, by as we see it doesn’t stop improving
ypred <- mod51f2 %>% predict(xtrainM)
yreal <- ytrainM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 2150.205
mae((yreal - ypred))
## [1] 1574.713
# data split for trainning-testing
set.seed(1357)
ixTrain <- createDataPartition(cars$Cylinder,p=0.8,list=FALSE)
training <- cars[ixTrain,]
testing <- cars[-ixTrain,]
# Test set
xtest <- testing %>% dplyr::select(-Price)
ytest <- testing %>% dplyr::select(Price)
xtestM <- scale(xtest,center= TRUE, scale=TRUE) # Scaling already converts to a matrix
ytestM <- as.matrix(ytest)
# Training set
xtrain <- training %>% dplyr::select(-Price)
ytrain <- training %>% dplyr::select(Price)
xtrainM <- scale(xtrain,center= TRUE, scale=TRUE)
ytrainM <- as.matrix(ytrain)
mod51f3 <- keras_model_sequential()
mod51f3 %>%
layer_dense(units = 17, activation = 'relu', input_shape = 17) %>%
layer_dense(units = 10, activation = 'relu') %>%
layer_dense(units = 1)
mod51f1 %>% summary
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense (Dense) (None, 17) 306
## ________________________________________________________________________________
## dense_1 (Dense) (None, 10) 180
## ________________________________________________________________________________
## dense_2 (Dense) (None, 1) 11
## ================================================================================
## Total params: 497
## Trainable params: 497
## Non-trainable params: 0
## ________________________________________________________________________________
mod51f3 %>% compile(
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
)
history = mod51f3 %>% fit(
x = xtrainM, y = ytrainM,
epochs = 500,
batch_size = 20,
validation_split = 0.25
)
plot(history)
ypred <- mod51f3 %>% predict(xtestM)
yreal <- ytestM # Get the real values
error <- yreal - ypred
rmse((yreal - ypred))
## [1] 2705.337
mae((yreal - ypred))
## [1] 2054.201
The models have improved, but there’s a limit in the case of this data set, this could be the result of a lack of information for the interpretation of the outliers, as there is some columns related to the Price, but not many of them are good a correlation or not always. At the end, without tuning too much the models the predictions have improved by scaling the data or using multiple columns in the case of the linear regression.