What is supervised machine learning?
References: What is Supervised Machine Learning? IBM. Source: https://www.ibm.com/topics/supervised-learning
Variable descriptions can be found at the following link: https://search.r-project.org/CRAN/refmans/RgoogleMaps/html/columbus.html
# data manipulation & data visualization
library(foreign) # Read Data Stored by 'Minitab', 'S', 'SAS', 'SPSS', 'Stata', 'Systat', 'Weka', 'dBase'
library(ggplot2) # It is a system for creating graphics
library(dplyr) # A fast, consistent tool for working with data frame like objects
library(mapview) # Quickly and conveniently create interactive visualizations of spatial data with or without background maps
library(naniar) # Provides data structures and functions that facilitate the plotting of missing values and examination of imputations.
library(tmaptools) # A collection of functions to create spatial weights matrix objects from polygon 'contiguities', for summarizing these objects, and for permitting their use in spatial data analysis
library(tmap) # For drawing thematic maps
library(RColorBrewer) # It offers several color palettes
library(dlookr) # A collection of tools that support data diagnosis, exploration, and transformation
# predictive modeling
library(regclass) # Contains basic tools for visualizing, interpreting, and building regression models
library(mctest) # Multicollinearity diagnostics
library(lmtest) # Testing linear regression models
library(spdep) # A collection of functions to create spatial weights matrix objects from polygon 'contiguities', for summarizing these objects, and for permitting their use in spatial data analysis
library(sf) # A standardized way to encode spatial vector data
library(spData) # Diverse spatial datasets for demonstrating, benchmarking and teaching spatial data analysis
library(spatialreg) # A collection of all the estimation functions for spatial cross-sectional models
library(caret) # The caret package (short for Classification And Rgression Training) contains functions to streamline the model training process for complex regression and classification problems.
library(e1071) # Functions for latent class analysis, short time Fourier transform, fuzzy clustering, support vector machines, shortest path computation, bagged clustering, naive Bayes classifier, generalized k-nearest neighbor.
library(SparseM) # Provides some basic R functionality for linear algebra with sparse matrices
library(Metrics) # An implementation of evaluation metrics in R that are commonly used in supervised machine learning
library(randomForest) # Classification and regression based on a forest of trees using random inputs
library(jtools) # This is a collection of tools for more efficiently understanding and sharing the results of (primarily) regression analyses
library(xgboost) # The package includes efficient linear model solver and tree learning algorithms
library(DiagrammeR) # Build graph/network structures using functions for stepwise addition and deletion of nodes and edges
library(effects) # Graphical and tabular effect displays, e.g., of interactions, for various statistical models with linear predictors
library(rpart.plot) # Displays a tree diagram that shows the decision rules of the model
library(shinyjs)
library(sp)
#library(geoR)
library(gstat)
library(caret)
library(st)
library(entropy)
library(corpcor)
library(fdrtool)
library(sda)
library(corrplot)
library(lattice)
library(datasets)
library(DataExplorer)
library(car)
# Load the Columbus, Ohio neighbourhood polygons that ship with spdep.
# mustWork = TRUE makes system.file() stop with a clear error when a file is
# not installed, instead of returning "" and failing later inside the reader.
columbus <- st_read(
  system.file("etc/shapes/columbus.shp", package = "spdep", mustWork = TRUE)
)
## Reading layer `columbus' from data source
## `/Library/Frameworks/R.framework/Versions/4.2/Resources/library/spdep/etc/shapes/columbus.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 49 features and 20 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 5.874907 ymin: 10.78863 xmax: 11.28742 ymax: 14.74245
## CRS: NA
# Neighbour list (GAL format) describing which polygons are contiguous.
col.gal.nb <- read.gal(
  system.file("etc/weights/columbus.gal", package = "spdep", mustWork = TRUE)
)
# Second copy of the same shapefile, read quietly; used by the ggplot2 map.
columbus_sf <- read_sf(
  system.file("etc/shapes/columbus.shp", package = "spdep", mustWork = TRUE)
)
# Reference map: every neighbourhood polygon labelled with its POLYID.
tm_shape(columbus) +
  tm_polygons(col = "wheat") +
  tm_style("classic") +
  tm_text(text = "POLYID", size = 0.7)
## Warning: Currect projection of shape columbus unknown. Long-lat (WGS84) is
## assumed.
# map option # 1: interactive quantile map of house values
tmap_mode("view")
## tmap mode set to interactive viewing
# legend.position inside tm_layout() only applies in plot mode, so in view
# mode the legend placement is requested through tm_view() instead.
tm_shape(columbus) +
  tm_fill("HOVAL", style = "quantile", title = "House Prices (Quantile)") +
  tm_layout(main.title = "Columbus, Ohio",
            legend.title.size = 0.8, legend.text.size = 0.7) +
  tm_view(view.legend.position = c("left", "top"))
## Warning: Currect projection of shape columbus unknown. Long-lat (WGS84) is
## assumed.
# map option # 2: static choropleth of house values drawn with ggplot2
ggplot(columbus_sf) +
  geom_sf(mapping = aes(fill = HOVAL)) +
  ggtitle("Columbus, Ohio", subtitle = "House Prices in $1,000")
tmap_mode("plot")
## tmap mode set to plotting
# Uncomment to browse the available colour palettes interactively.
# tmaptools::palette_explorer()

# Income by neighbourhood, coloured by quantile class.
income_map <- tm_shape(columbus) +
  tm_fill(
    col = "INC", palette = "Blues", style = "quantile", title = "Income"
  ) +
  tm_borders(alpha = 0.4) +
  tm_layout(legend.text.size = 0.8, legend.title.size = 1.1, frame = FALSE)

# Distance to the central business district, coloured by quantile class.
distance_map <- tm_shape(columbus) +
  tm_fill(
    col = "DISCBD", palette = "BuPu", style = "quantile",
    title = "Distance to CBD"
  ) +
  tm_borders(alpha = 0.4) +
  tm_layout(legend.text.size = 0.8, legend.title.size = 1.1, frame = FALSE)

# Show both maps side by side.
tmap_arrange(income_map, distance_map, nrow = 1)
## Warning: Currect projection of shape columbus unknown. Long-lat (WGS84) is
## assumed.
## Warning: Currect projection of shape columbus unknown. Long-lat (WGS84) is
## assumed.
# A spatial regression needs a spatial weights matrix that connects the
# neighbourhoods of Columbus, Ohio. style = "W" row-standardises the weights.
map.linkW <- nb2listw(col.gal.nb, style = "W")

# Polygon centroids serve as the node positions of the connectivity graph.
map_centroid <- st_coordinates(st_centroid(st_geometry(columbus)))

# Plot only the geometry: plotting the full sf object draws one panel per
# attribute rather than the connectivity map named in the title.
plot(st_geometry(columbus), border = "blue", axes = FALSE,
     main = "Columbus Ohio - Spatial Connectivity Matrix")
# Overlay the neighbour links between centroids.
plot(col.gal.nb, coords = map_centroid, pch = 19, cex = 0.1, col = "red",
     add = TRUE)
# Is a spatial regression model required? Test HOVAL for global spatial
# autocorrelation with Moran's I, using the row-standardised weights.
# In the output below the statistic is 0.1736 against an expectation of
# -0.0208 (one-sided "greater" alternative), with p-value = 0.01786.
moran.test(columbus$HOVAL, listw = map.linkW, zero.policy = TRUE, na.action = na.omit)
##
## Moran I test under randomisation
##
## data: columbus$HOVAL
## weights: map.linkW
##
## Moran I statistic standard deviate = 2.1001, p-value = 0.01786
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.173645208 -0.020833333 0.008575953
# H0: HOVAL values are randomly distributed across space (no spatial autocorrelation)
# Ha: similar HOVAL values cluster in space (positive spatial autocorrelation)
What is cross-validation? It is a statistical method for evaluating and comparing learning algorithms by dividing the data into two segments: one used to learn or train a model and the other used to validate or test the model (Refaeilzadeh, Tang, and Liu, 2009).
# Work on a plain data frame: drop the polygon geometry column.
columbus_data <- st_drop_geometry(columbus)
# The training set is used to build the model and the test set to evaluate
# its predictive accuracy.
set.seed(123) # fix the RNG state so the random split is reproducible
# NOTE(review): the split is stratified on INC, while the models below predict
# HOVAL; confirm that stratifying on a predictor is intended.
partition <- createDataPartition(y = columbus_data$INC, p = 0.7, list = FALSE)
train <- columbus_data[partition, ]
test <- columbus_data[-partition, ]
# Baseline OLS: house value in levels regressed on six neighbourhood
# characteristics. It is fitted on the full data set (columbus_data), not on
# the training split.
ols_model <- lm(HOVAL ~ INC + CRIME + OPEN + PLUMB + DISCBD + EW, data = columbus_data)
summary(ols_model)
##
## Call:
## lm(formula = HOVAL ~ INC + CRIME + OPEN + PLUMB + DISCBD + EW,
## data = columbus_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.528 -7.594 -3.516 4.516 54.171
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.3415 15.7111 2.122 0.0398 *
## INC 0.1983 0.5413 0.366 0.7159
## CRIME -0.4842 0.2127 -2.276 0.0280 *
## OPEN 0.5697 0.4654 1.224 0.2278
## PLUMB 1.7626 0.7405 2.380 0.0219 *
## DISCBD 4.1607 2.4393 1.706 0.0954 .
## EW 2.7720 4.6952 0.590 0.5581
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.4 on 42 degrees of freedom
## Multiple R-squared: 0.468, Adjusted R-squared: 0.392
## F-statistic: 6.157 on 6 and 42 DF, p-value: 0.0001079
# Log-log OLS on the full data set. EW is a 0/1 indicator and stays in levels.
# OPEN has a minimum of 0, so 0.01 is added before taking the log.
log_ols_model <- lm(log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN +0.01) + log(PLUMB) + log(DISCBD) + EW, data = columbus_data)
summary(log_ols_model)
##
## Call:
## lm(formula = log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN +
## 0.01) + log(PLUMB) + log(DISCBD) + EW, data = columbus_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43202 -0.18759 -0.04296 0.11548 0.92501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.079652 0.532676 3.904 0.000337 ***
## log(INC) 0.600896 0.164719 3.648 0.000724 ***
## log(CRIME) -0.147720 0.044053 -3.353 0.001700 **
## log(OPEN + 0.01) 0.005243 0.020268 0.259 0.797142
## log(PLUMB) 0.245902 0.076432 3.217 0.002494 **
## log(DISCBD) 0.426722 0.125104 3.411 0.001442 **
## EW 0.032330 0.097511 0.332 0.741872
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3017 on 42 degrees of freedom
## Multiple R-squared: 0.5741, Adjusted R-squared: 0.5133
## F-statistic: 9.436 on 6 and 42 DF, p-value: 1.465e-06
# NOTE(review): the two models use different response scales (HOVAL vs.
# log(HOVAL)), so these AIC values are not directly comparable.
AIC(ols_model) # AIC = 408.89
## [1] 408.8867
AIC(log_ols_model) # AIC = 30.07
## [1] 30.06762
# In-sample RMSE of the levels model, in HOVAL units ($1,000).
ols_resid <- residuals(ols_model)
RMSE_ols_model <- sqrt(mean(ols_resid^2))
print(RMSE_ols_model)
## [1] 13.33123
# In-sample RMSE of the log-log model, in log(HOVAL) units.
log_ols_resid <- residuals(log_ols_model)
RMSE_log_ols_model <- sqrt(mean(log_ols_resid^2))
print(RMSE_log_ols_model)
## [1] 0.2793216
# Attach the log-OLS residuals (log scale) and the fitted values
# back-transformed to HOVAL units to the spatial object for mapping.
columbus$reg_residuals <- log_ols_model$residuals
columbus$fitted <- exp(log_ols_model$fitted.values)
tmap_mode("plot")

# Residual map: visible spatial clustering here would suggest that OLS misses
# spatial structure.
map_residuals <- tm_shape(columbus) +
  tm_fill("reg_residuals", palette = "PuRd", style = "quantile",
          title = "log OLS Residuals") +
  tm_borders(alpha = 0.4) +
  tm_layout(legend.text.size = 0.8, legend.title.size = 1.1, frame = FALSE)
# The map was previously built but never drawn; render it.
map_residuals

# Observed vs. fitted house values, side by side on the same palette.
observed <- tm_shape(columbus) +
  tm_fill("HOVAL", palette = "Oranges", style = "quantile",
          title = "HOVAL") +
  tm_borders(alpha = 0.4) +
  tm_layout(legend.text.size = 0.8, legend.title.size = 1.1, frame = FALSE)
fitted <- tm_shape(columbus) +
  tm_fill("fitted", palette = "Oranges", style = "quantile",
          title = "Fitted HOVAL") +
  tm_borders(alpha = 0.4) +
  tm_layout(legend.text.size = 0.8, legend.title.size = 1.1, frame = FALSE)
tmap_arrange(observed, fitted, nrow = 1)
# Spatial lag (SAR) model: the log-log specification plus a spatially lagged
# dependent variable, estimated on the full data set with the row-standardised
# weights map.linkW (passed positionally as listw).
sar_model <- lagsarlm(log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN +0.01) + log(PLUMB) + log(DISCBD) + EW, data=columbus_data, map.linkW, method="Matrix")
summary(sar_model)
##
## Call:lagsarlm(formula = log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN +
## 0.01) + log(PLUMB) + log(DISCBD) + EW, data = columbus_data,
## listw = map.linkW, method = "Matrix")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.415234 -0.196420 -0.035814 0.112857 0.912480
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9135339 0.6812155 2.8090 0.0049696
## log(INC) 0.5818087 0.1549324 3.7552 0.0001732
## log(CRIME) -0.1485391 0.0407521 -3.6449 0.0002675
## log(OPEN + 0.01) 0.0061916 0.0187851 0.3296 0.7417020
## log(PLUMB) 0.2367958 0.0717860 3.2986 0.0009716
## log(DISCBD) 0.4016667 0.1309609 3.0671 0.0021617
## EW 0.0306734 0.0901117 0.3404 0.7335605
##
## Rho: 0.068092, LR test value: 0.14613, p-value: 0.70226
## Asymptotic standard error: 0.17048
## z-value: 0.39941, p-value: 0.68959
## Wald statistic: 0.15953, p-value: 0.68959
##
## Log likelihood: -6.960748 for lag model
## ML residual variance (sigma squared): 0.077707, (sigma: 0.27876)
## Number of observations: 49
## Number of parameters estimated: 9
## AIC: NA (not available for weighted model), (AIC for lm: 30.068)
## LM test for residual autocorrelation
## test value: 3.076, p-value: 0.079455
# RMSE of the SAR model in HOVAL units ($1,000). The model is fitted on
# log(HOVAL), so the fitted values are back-transformed with exp() before
# they are compared with observed HOVAL. The earlier value of 39.27636
# subtracted log-scale fitted values from raw HOVAL and was not a valid RMSE.
RMSE_SAR <- sqrt(mean((columbus_data$HOVAL - exp(sar_model$fitted.values))^2))
RMSE_SAR
# RMSE on the log scale, directly comparable with RMSE_log_ols_model.
RMSE_SAR_residual <- sqrt(mean((sar_model$residuals)^2))
RMSE_SAR_residual
## [1] 0.2787592
# Spatial error (SEM) model: the log-log specification with spatially
# autocorrelated errors, estimated on the full data set with the
# row-standardised weights map.linkW (passed positionally as listw).
sem_model <- errorsarlm(log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN +0.01) + log(PLUMB) + log(DISCBD) + EW, data=columbus_data, map.linkW, method="Matrix")
summary(sem_model)
##
## Call:errorsarlm(formula = log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN +
## 0.01) + log(PLUMB) + log(DISCBD) + EW, data = columbus_data,
## listw = map.linkW, method = "Matrix")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.464112 -0.167533 -0.056421 0.096777 0.921712
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9574014 0.4827496 4.0547 5.020e-05
## log(INC) 0.6436861 0.1472091 4.3726 1.228e-05
## log(CRIME) -0.1505657 0.0411514 -3.6588 0.0002534
## log(OPEN + 0.01) 0.0039052 0.0188312 0.2074 0.8357152
## log(PLUMB) 0.2685289 0.0670014 4.0078 6.128e-05
## log(DISCBD) 0.4389545 0.1079582 4.0660 4.783e-05
## EW 0.0448952 0.0785217 0.5718 0.5674875
##
## Lambda: -0.19851, LR test value: 0.65105, p-value: 0.41974
## Asymptotic standard error: 0.21605
## z-value: -0.91883, p-value: 0.35818
## Wald statistic: 0.84426, p-value: 0.35818
##
## Log likelihood: -6.708288 for error model
## ML residual variance (sigma squared): 0.076342, (sigma: 0.2763)
## Number of observations: 49
## Number of parameters estimated: 9
## AIC: 31.417, (AIC for lm: 30.068)
# RMSE of the SEM model in HOVAL units ($1,000). The model is fitted on
# log(HOVAL), so the fitted values are back-transformed with exp() before
# they are compared with observed HOVAL. The earlier value of 39.27346
# subtracted log-scale fitted values from raw HOVAL and was not a valid RMSE.
RMSE_SEM <- sqrt(mean((columbus_data$HOVAL - exp(sem_model$fitted.values))^2))
RMSE_SEM
# RMSE on the log scale, directly comparable with RMSE_log_ols_model.
RMSE_SEM_residual <- sqrt(mean((sem_model$residuals)^2))
RMSE_SEM_residual
## [1] 0.2763001
library(xgboost)
# Modelling frame for the tree-based learners: HOVAL stays in levels and the
# continuous predictors are log-transformed (OPEN is shifted by 0.01 first);
# EW is left untouched.
columbus_data_alt <- columbus_data %>%
  dplyr::select(HOVAL, INC, CRIME, OPEN, PLUMB, DISCBD, EW) %>%
  dplyr::mutate(
    INC = log(INC),
    CRIME = log(CRIME),
    OPEN = log(OPEN + 0.01),
    PLUMB = log(PLUMB),
    DISCBD = log(DISCBD)
  )
summary(columbus_data_alt)
## HOVAL INC CRIME OPEN
## Min. :17.90 Min. :1.499 Min. :-1.724 Min. :-4.60517
## 1st Qu.:25.70 1st Qu.:2.299 1st Qu.: 2.998 1st Qu.:-1.30998
## Median :33.50 Median :2.594 Median : 3.526 Median : 0.01599
## Mean :38.44 Mean :2.591 Mean : 3.297 Mean :-0.54135
## 3rd Qu.:43.30 3rd Qu.:2.908 3rd Qu.: 3.883 3rd Qu.: 1.37281
## Max. :96.40 Max. :3.436 Max. : 4.233 Max. : 3.21920
## PLUMB DISCBD EW
## Min. :-2.01934 Min. :-0.9943 Min. :0.0000
## 1st Qu.:-1.10157 1st Qu.: 0.5306 1st Qu.:0.0000
## Median : 0.02361 Median : 0.9821 Median :1.0000
## Mean : 0.03361 Mean : 0.8864 Mean :0.5918
## 3rd Qu.: 0.92991 3rd Qu.: 1.3584 3rd Qu.:1.0000
## Max. : 2.93445 Max. : 1.7174 Max. :1.0000
set.seed(123) # fix the RNG state so the random split is reproducible
# 70/30 hold-out split of the log-transformed modelling frame.
cv_data <- createDataPartition(y = columbus_data_alt$INC, p = 0.7, list = FALSE)
cv_train <- columbus_data_alt[cv_data, ]
cv_test <- columbus_data_alt[-cv_data, ]
# Explanatory variables (X's) and dependent variable (Y) in the training set;
# column 1 of the frame is HOVAL.
train_x <- data.matrix(cv_train[, -1])
train_y <- cv_train[, 1]
# Explanatory variables (X's) and dependent variable (Y) in the testing set.
test_x <- data.matrix(cv_test[, -1])
test_y <- cv_test[, 1]
# Final training and testing sets in xgboost's own matrix format.
xgb_train <- xgb.DMatrix(data = train_x, label = train_y)
xgb_test <- xgb.DMatrix(data = test_x, label = test_y)
# Fit the XGBoost regression and report RMSE on both sets at every round.
watchlist <- list(train = xgb_train, test = xgb_test)
# More rounds take longer to run and to print.
model_xgb <- xgb.train(data = xgb_train, max.depth = 3, watchlist = watchlist, nrounds = 70)
## [1] train-rmse:32.558935 test-rmse:28.951619
## [2] train-rmse:25.355109 test-rmse:22.671456
## [3] train-rmse:20.152249 test-rmse:18.730544
## [4] train-rmse:16.339010 test-rmse:16.795286
## [5] train-rmse:13.500239 test-rmse:16.129408
## [6] train-rmse:11.332169 test-rmse:16.509411
## [7] train-rmse:9.637591 test-rmse:17.161065
## [8] train-rmse:8.354674 test-rmse:17.943420
## [9] train-rmse:7.301888 test-rmse:18.285487
## [10] train-rmse:6.493047 test-rmse:18.437669
## [11] train-rmse:5.783834 test-rmse:19.281797
## [12] train-rmse:5.210777 test-rmse:20.024676
## [13] train-rmse:4.766990 test-rmse:20.409095
## [14] train-rmse:4.216826 test-rmse:20.960376
## [15] train-rmse:3.892447 test-rmse:21.462003
## [16] train-rmse:3.594895 test-rmse:21.755089
## [17] train-rmse:3.117573 test-rmse:22.122776
## [18] train-rmse:2.912595 test-rmse:22.249679
## [19] train-rmse:2.744708 test-rmse:22.289308
## [20] train-rmse:2.528766 test-rmse:22.453613
## [21] train-rmse:2.428361 test-rmse:22.469520
## [22] train-rmse:2.245889 test-rmse:22.488167
## [23] train-rmse:1.995308 test-rmse:22.695846
## [24] train-rmse:1.892313 test-rmse:22.762292
## [25] train-rmse:1.706430 test-rmse:22.803818
## [26] train-rmse:1.530626 test-rmse:22.917683
## [27] train-rmse:1.462334 test-rmse:22.956625
## [28] train-rmse:1.307067 test-rmse:22.853381
## [29] train-rmse:1.195997 test-rmse:22.904000
## [30] train-rmse:1.114445 test-rmse:22.882855
## [31] train-rmse:1.033702 test-rmse:22.894206
## [32] train-rmse:0.930637 test-rmse:22.885834
## [33] train-rmse:0.855417 test-rmse:22.954493
## [34] train-rmse:0.777327 test-rmse:22.989968
## [35] train-rmse:0.694897 test-rmse:23.009517
## [36] train-rmse:0.640067 test-rmse:23.024204
## [37] train-rmse:0.585913 test-rmse:23.046855
## [38] train-rmse:0.544947 test-rmse:23.103338
## [39] train-rmse:0.496445 test-rmse:23.130836
## [40] train-rmse:0.445176 test-rmse:23.174879
## [41] train-rmse:0.423205 test-rmse:23.195423
## [42] train-rmse:0.394193 test-rmse:23.194732
## [43] train-rmse:0.361441 test-rmse:23.219765
## [44] train-rmse:0.338051 test-rmse:23.241007
## [45] train-rmse:0.303142 test-rmse:23.258915
## [46] train-rmse:0.285972 test-rmse:23.256439
## [47] train-rmse:0.265118 test-rmse:23.277356
## [48] train-rmse:0.239570 test-rmse:23.290082
## [49] train-rmse:0.223512 test-rmse:23.306106
## [50] train-rmse:0.202481 test-rmse:23.314676
## [51] train-rmse:0.192101 test-rmse:23.326943
## [52] train-rmse:0.180221 test-rmse:23.315882
## [53] train-rmse:0.168303 test-rmse:23.313811
## [54] train-rmse:0.152294 test-rmse:23.322716
## [55] train-rmse:0.139646 test-rmse:23.332230
## [56] train-rmse:0.128065 test-rmse:23.340492
## [57] train-rmse:0.118687 test-rmse:23.349151
## [58] train-rmse:0.112001 test-rmse:23.356265
## [59] train-rmse:0.103464 test-rmse:23.349631
## [60] train-rmse:0.098326 test-rmse:23.355807
## [61] train-rmse:0.088950 test-rmse:23.361121
## [62] train-rmse:0.083060 test-rmse:23.356842
## [63] train-rmse:0.079157 test-rmse:23.355080
## [64] train-rmse:0.072423 test-rmse:23.357653
## [65] train-rmse:0.069043 test-rmse:23.357478
## [66] train-rmse:0.063032 test-rmse:23.356953
## [67] train-rmse:0.059618 test-rmse:23.358380
## [68] train-rmse:0.055434 test-rmse:23.354991
## [69] train-rmse:0.052059 test-rmse:23.352430
## [70] train-rmse:0.047479 test-rmse:23.356072
# The watchlist log shows the training RMSE falling at every round, while the
# test RMSE bottoms out early (round 5, 16.13, in the logged run) and then
# climbs. Round 59 is therefore not the best round; it overfits. The number of
# rounds is taken from the evaluation log instead of being hard-coded.
# NOTE(review): picking the round on the test set makes the reported test RMSE
# optimistic; a separate validation split or xgb.cv() would avoid that.
best_nrounds <- which.min(model_xgb$evaluation_log$test_rmse)
# Final regression model; verbose = 0 suppresses the per-round error log.
reg_xgb <- xgboost(data = xgb_train, max.depth = 3, nrounds = best_nrounds, verbose = 0)
prediction_xgb_test <- predict(reg_xgb, xgb_test)
RMSE_XGB <- rmse(prediction_xgb_test, cv_test$HOVAL)
# The earlier value of 23.34963 came from the overfitted 59-round model.
RMSE_XGB
# Diagnostic check of the regression residuals on the test set.
xgb_reg_residuals <- cv_test$HOVAL - prediction_xgb_test
# Plot the residuals against observed HOVAL so the x-axis matches its
# "Dependent Variable" label; previously they were plotted against row index.
plot(cv_test$HOVAL, xgb_reg_residuals,
     xlab = "Dependent Variable", ylab = "Residuals",
     main = "XGBoost Regression Residuals")
abline(h = 0) # zero-residual reference line
# Draw the first three boosted trees (tree indices start at 0).
xgb.plot.tree(model = reg_xgb, trees = 0:2)
# Rank the predictors by their contribution to the fitted model and plot it.
importance_matrix <- xgb.importance(model = reg_xgb)
xgb.plot.importance(
  importance_matrix,
  xlab = "Explanatory Variables X's Importance"
)
# Support vector regression on the log-log specification, fitted on the
# training split.
svm_model <- svm(formula = log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN + 0.01) + log(PLUMB) + log(DISCBD) + EW,
                 data = train, type = "eps-regression", kernel = "radial")
# Residual vs. fitted plot (both on the log scale).
plot(svm_model$fitted, svm_model$residuals,
     main = "SVM Residual vs. Fitted Values",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0)
# RMSE is the root of the mean squared difference between the observed
# outcomes in the test data and the model's predictions; lower is better.
predicted_dv <- predict(svm_model, newdata = test)
# The model predicts log(HOVAL), so predictions are back-transformed with
# exp() before comparison with raw HOVAL. The earlier value of 37.21054
# compared log-scale predictions with raw HOVAL and was not a valid RMSE.
RMSE_SVM <- rmse(exp(predicted_dv), test$HOVAL)
RMSE_SVM
# Training-set predictions (back-transformed to HOVAL units) next to the
# observed values; the column names are spelled out rather than auto-generated.
dv_svm <- data.frame(
  exp.svm_model.fitted. = exp(svm_model$fitted),
  train.HOVAL = train$HOVAL
)
# Scatter of predicted vs. actual values with a smoothed trend.
ggplot(dv_svm, aes(x = exp.svm_model.fitted., y = train.HOVAL)) +
  geom_point() +
  stat_smooth() +
  labs(
    x = "Predicted Values",
    y = "Actual Values",
    title = "SVM Predicted vs. Actual Values"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Regression tree on the log-log specification, fitted on the training split.
decision_tree_model <- rpart(log(HOVAL) ~ log(INC) + log(CRIME) + log(OPEN + 0.01) + log(PLUMB) + log(DISCBD) + EW, data = train)
# summary(decision_tree_model)
# Base-graphics view of the tree, followed by the rpart.plot rendering.
plot(decision_tree_model, compress = TRUE)
text(decision_tree_model, use.n = TRUE)
rpart.plot(decision_tree_model)
decision_tree_prediction <- predict(decision_tree_model, test)
# The tree predicts log(HOVAL), so predictions are back-transformed with
# exp() before comparison with raw HOVAL. The earlier value of 37.14158
# compared log-scale predictions with raw HOVAL and was not a valid RMSE.
RMSE_decision_tree <- rmse(exp(decision_tree_prediction), test$HOVAL)
RMSE_decision_tree
# Random forest regression of HOVAL (levels) on the log-transformed predictors
# in cv_train.
# NOTE(review): no set.seed() precedes this call, so the forest and the figures
# printed below may differ between runs.
rf_model <- randomForest(HOVAL ~ INC + CRIME + OPEN + PLUMB + DISCBD + EW, data= cv_train, proximity=TRUE)
# (leftover from another data set) random_forest<-randomForest(MEDV~.,data=train_alt,importance=TRUE, proximity=TRUE)
print(rf_model) ### the printed summary below reports "% Var explained: 18.62" for this run.
##
## Call:
## randomForest(formula = HOVAL ~ INC + CRIME + OPEN + PLUMB + DISCBD + EW, data = cv_train, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 295.1348
## % Var explained: 18.62
# Predict HOVAL for the held-out test rows.
rf_prediction <- predict(rf_model, newdata = cv_test)
print(rf_prediction)
## 6 8 16 17 21 24 25 27
## 38.58272 33.17435 32.26577 42.98915 26.94611 36.46635 33.29235 37.05375
## 36 41 46 47
## 58.59132 49.51956 53.89565 48.00193
# This is a regression, so a confusion matrix (a table of predicted vs. actual
# classes) does not apply; accuracy is summarised with RMSE instead.
RMSE_rf <- rmse(cv_test$HOVAL, rf_prediction)
print(RMSE_rf)
## [1] 12.73012
# How to interpret varImpPlot()? The forest was fitted without
# importance = TRUE, so the only measure available is IncNodePurity (see the
# importance() output below): the higher the value, the more important the
# variable.
# n.var = 5 shows the five highest-ranked of the six predictors; the title now
# states that number instead of "Top 10".
varImpPlot(rf_model, n.var = 5, main = "Top 5 - Variable") # variable importance plot from the random forest model
importance(rf_model)
## IncNodePurity
## INC 2119.0391
## CRIME 3847.4504
## OPEN 1418.7295
## PLUMB 2138.0608
## DISCBD 2231.4207
## EW 218.2881
# It is worth mentioning that IncNodePurity is the total decrease in node impurity (residual sum of squares for regression) obtained from splitting on a variable, averaged over all trees.
# Briefly, varImpPlot() ranks the explanatory variables by how much each contributes to explaining the dependent variable (Y).
# Inspect the structure of the data frame: type and first values per column.
str(columbus_data)
## 'data.frame': 49 obs. of 20 variables:
## $ AREA : num 0.3094 0.2593 0.1925 0.0838 0.4889 ...
## $ PERIMETER : num 2.44 2.24 2.19 1.43 3 ...
## $ COLUMBUS_ : num 2 3 4 5 6 7 8 9 10 11 ...
## $ COLUMBUS_I: num 5 1 6 2 7 8 4 3 18 10 ...
## $ POLYID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ NEIG : int 5 1 6 2 7 8 4 3 18 10 ...
## $ HOVAL : num 80.5 44.6 26.4 33.2 23.2 ...
## $ INC : num 19.53 21.23 15.96 4.48 11.25 ...
## $ CRIME : num 15.7 18.8 30.6 32.4 50.7 ...
## $ OPEN : num 2.851 5.297 4.535 0.394 0.406 ...
## $ PLUMB : num 0.217 0.321 0.374 1.187 0.625 ...
## $ DISCBD : num 5.03 4.27 3.89 3.7 2.83 3.78 2.74 2.89 3.17 4.33 ...
## $ X : num 38.8 35.6 39.8 36.5 40 ...
## $ Y : num 44.1 42.4 41.2 40.5 38 ...
## $ NSA : num 1 1 1 1 1 1 1 1 1 1 ...
## $ NSB : num 1 1 1 1 1 1 1 1 1 1 ...
## $ EW : num 1 0 1 0 1 1 0 0 1 1 ...
## $ CP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ THOUS : num 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
## $ NEIGNO : num 1005 1001 1006 1002 1007 ...
# List the variable (column) names of the data frame.
names(columbus_data)
## [1] "AREA" "PERIMETER" "COLUMBUS_" "COLUMBUS_I" "POLYID"
## [6] "NEIG" "HOVAL" "INC" "CRIME" "OPEN"
## [11] "PLUMB" "DISCBD" "X" "Y" "NSA"
## [16] "NSB" "EW" "CP" "THOUS" "NEIGNO"
# Count the missing cells across the whole data frame.
columbus_missing_values <- sum(is.na(columbus_data))
print(columbus_missing_values)
## [1] 0
# Five-number summary plus mean for every column.
columbus_descriptive_statistics <- summary(columbus_data)
print(columbus_descriptive_statistics)
## AREA PERIMETER COLUMBUS_ COLUMBUS_I POLYID
## Min. :0.03438 Min. :0.9021 Min. : 2 Min. : 1 Min. : 1
## 1st Qu.:0.09315 1st Qu.:1.4023 1st Qu.:14 1st Qu.:13 1st Qu.:13
## Median :0.17477 Median :1.8410 Median :26 Median :25 Median :25
## Mean :0.18649 Mean :1.8887 Mean :26 Mean :25 Mean :25
## 3rd Qu.:0.24669 3rd Qu.:2.1992 3rd Qu.:38 3rd Qu.:37 3rd Qu.:37
## Max. :0.69926 Max. :5.0775 Max. :50 Max. :49 Max. :49
## NEIG HOVAL INC CRIME
## Min. : 1 Min. :17.90 Min. : 4.477 Min. : 0.1783
## 1st Qu.:13 1st Qu.:25.70 1st Qu.: 9.963 1st Qu.:20.0485
## Median :25 Median :33.50 Median :13.380 Median :34.0008
## Mean :25 Mean :38.44 Mean :14.375 Mean :35.1288
## 3rd Qu.:37 3rd Qu.:43.30 3rd Qu.:18.324 3rd Qu.:48.5855
## Max. :49 Max. :96.40 Max. :31.070 Max. :68.8920
## OPEN PLUMB DISCBD X
## Min. : 0.0000 Min. : 0.1327 Min. :0.370 Min. :24.25
## 1st Qu.: 0.2598 1st Qu.: 0.3323 1st Qu.:1.700 1st Qu.:36.15
## Median : 1.0061 Median : 1.0239 Median :2.670 Median :39.61
## Mean : 2.7709 Mean : 2.3639 Mean :2.852 Mean :39.46
## 3rd Qu.: 3.9364 3rd Qu.: 2.5343 3rd Qu.:3.890 3rd Qu.:43.44
## Max. :24.9981 Max. :18.8111 Max. :5.570 Max. :51.24
## Y NSA NSB EW
## Min. :24.96 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:28.26 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :31.91 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :32.37 Mean :0.4898 Mean :0.5102 Mean :0.5918
## 3rd Qu.:35.92 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :44.07 Max. :1.0000 Max. :1.0000 Max. :1.0000
## CP THOUS NEIGNO
## Min. :0.0000 Min. :1000 Min. :1001
## 1st Qu.:0.0000 1st Qu.:1000 1st Qu.:1013
## Median :0.0000 Median :1000 Median :1025
## Mean :0.4898 Mean :1000 Mean :1025
## 3rd Qu.:1.0000 3rd Qu.:1000 3rd Qu.:1037
## Max. :1.0000 Max. :1000 Max. :1049
# Extended descriptive statistics (moments and percentiles) from dlookr; the
# namespace is spelled out because several packages export a describe().
columbus_describe <- dlookr::describe(columbus_data)
print(columbus_describe)
## # A tibble: 20 × 26
## described_variables n na mean sd se_mean IQR skewness
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AREA 49 0 0.186 0.132 0.0189 0.154 1.77
## 2 PERIMETER 49 0 1.89 0.740 0.106 0.797 1.72
## 3 COLUMBUS_ 49 0 26 14.3 2.04 24 0
## 4 COLUMBUS_I 49 0 25 14.3 2.04 24 0
## 5 POLYID 49 0 25 14.3 2.04 24 0
## 6 NEIG 49 0 25 14.3 2.04 24 0
## 7 HOVAL 49 0 38.4 18.5 2.64 17.6 1.38
## 8 INC 49 0 14.4 5.70 0.815 8.36 0.956
## 9 CRIME 49 0 35.1 16.7 2.39 28.5 0.0353
## 10 OPEN 49 0 2.77 4.67 0.667 3.68 3.34
## 11 PLUMB 49 0 2.36 3.89 0.556 2.20 3.05
## 12 DISCBD 49 0 2.85 1.44 0.206 2.19 0.257
## 13 X 49 0 39.5 6.44 0.920 7.29 -0.315
## 14 Y 49 0 32.4 4.87 0.695 7.66 0.458
## 15 NSA 49 0 0.490 0.505 0.0722 1 0.0421
## 16 NSB 49 0 0.510 0.505 0.0722 1 -0.0421
## 17 EW 49 0 0.592 0.497 0.0709 1 -0.386
## 18 CP 49 0 0.490 0.505 0.0722 1 0.0421
## 19 THOUS 49 0 1000 0 0 0 NaN
## 20 NEIGNO 49 0 1025 14.3 2.04 24 0
## # ℹ 18 more variables: kurtosis <dbl>, p00 <dbl>, p01 <dbl>, p05 <dbl>,
## # p10 <dbl>, p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>,
## # p60 <dbl>, p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>,
## # p99 <dbl>, p100 <dbl>
# Variance-covariance matrix of all columns: variances on the diagonal,
# pairwise covariances elsewhere.
columbus_variance <- cov(columbus_data)
print(columbus_variance)
## AREA PERIMETER COLUMBUS_ COLUMBUS_I POLYID
## AREA 0.0174900746 0.09389505 -0.3350848 -0.4436195 -0.3350848
## PERIMETER 0.0938950523 0.54800642 -1.4247040 -1.7632600 -1.4247040
## COLUMBUS_ -0.3350848333 -1.42470400 204.1666667 91.3958333 204.1666667
## COLUMBUS_I -0.4436195208 -1.76326004 91.3958333 204.1666667 91.3958333
## POLYID -0.3350848333 -1.42470400 204.1666667 91.3958333 204.1666667
## NEIG -0.4436195208 -1.76326004 91.3958333 204.1666667 91.3958333
## HOVAL 0.6180899225 3.40522690 -34.5375893 -52.6077534 -34.5375893
## INC 0.2962250444 1.54993137 25.8697917 -3.8608970 25.8697917
## CRIME -0.8523669523 -5.47117464 -41.2934108 56.5739759 -41.2934108
## OPEN -0.0501539033 -0.14672878 2.1703829 9.3173659 2.1703829
## PLUMB -0.1186181999 -0.53604968 -5.8507800 16.1927984 -5.8507800
## DISCBD 0.0457420709 0.27357457 1.2112500 -6.5493750 1.2112500
## X 0.0816190993 -0.08625353 2.2706250 -56.2004159 2.2706250
## Y 0.0589190895 0.16625803 -67.6410409 -36.9297916 -67.6410409
## NSA -0.0057696616 -0.04686561 -5.8750000 -1.9166667 -5.8750000
## NSB -0.0003917551 -0.02007516 -5.9791667 -2.0833333 -5.9791667
## EW 0.0095517874 0.01112927 1.5208333 -3.6250000 1.5208333
## CP -0.0270086199 -0.15644800 0.6041667 2.7708333 0.6041667
## THOUS 0.0000000000 0.00000000 0.0000000 0.0000000 0.0000000
## NEIGNO -0.4436195208 -1.76326004 91.3958333 204.1666667 91.3958333
## NEIG HOVAL INC CRIME OPEN
## AREA -0.4436195 0.6180899 0.2962250 -0.8523670 -0.05015390
## PERIMETER -1.7632600 3.4052269 1.5499314 -5.4711746 -0.14672878
## COLUMBUS_ 91.3958333 -34.5375893 25.8697917 -41.2934108 2.17038292
## COLUMBUS_I 204.1666667 -52.6077534 -3.8608970 56.5739759 9.31736585
## POLYID 91.3958333 -34.5375893 25.8697917 -41.2934108 2.17038292
## NEIG 204.1666667 -52.6077534 -3.8608970 56.5739759 9.31736585
## HOVAL -52.6077534 340.9957215 52.6466978 -177.5026028 21.70197547
## INC -3.8608970 52.6466978 32.5285216 -66.3797476 4.07917969
## CRIME 56.5739759 -177.5026028 -66.3797476 279.9629057 -5.09762159
## OPEN 9.3173659 21.7019755 4.0791797 -5.0976216 21.79095101
## PLUMB 16.1927984 -1.4651665 -5.6817997 28.1638021 3.48669905
## DISCBD -6.5493750 12.9327167 4.9422982 -17.8915214 0.13628510
## X -56.2004159 7.3276839 5.1147471 3.3762817 -3.37729145
## Y -36.9297916 12.4507076 -8.1855839 10.6761639 -0.77718288
## NSA -1.9166667 0.4628046 -1.1876152 1.8304042 -0.13868209
## NSB -2.0833333 1.3551124 -0.8398014 1.1032161 -0.08560566
## EW -3.6250000 -0.1463648 0.4262870 -0.3884953 -0.25329879
## CP 2.7708333 -4.5887789 -1.6890735 6.3524888 0.15550228
## THOUS 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000
## NEIGNO 204.1666667 -52.6077534 -3.8608970 56.5739759 9.31736585
## PLUMB DISCBD X Y NSA
## AREA -0.1186182 0.04574207 0.08161910 0.05891909 -0.005769662
## PERIMETER -0.5360497 0.27357457 -0.08625353 0.16625803 -0.046865609
## COLUMBUS_ -5.8507800 1.21125000 2.27062504 -67.64104092 -5.875000000
## COLUMBUS_I 16.1927984 -6.54937500 -56.20041594 -36.92979165 -1.916666667
## POLYID -5.8507800 1.21125000 2.27062504 -67.64104092 -5.875000000
## NEIG 16.1927984 -6.54937500 -56.20041594 -36.92979165 -1.916666667
## HOVAL -1.4651665 12.93271669 7.32768389 12.45070762 0.462804599
## INC -5.6817997 4.94229823 5.11474709 -8.18558393 -1.187615211
## CRIME 28.1638021 -17.89152136 3.37628169 10.67616386 1.830404155
## OPEN 3.4866990 0.13628510 -3.37729145 -0.77718288 -0.138682093
## PLUMB 15.1328380 -3.21290103 -3.67530466 1.22104033 0.479038685
## DISCBD -3.2129010 2.08359158 1.07619102 -0.06160559 -0.098520408
## X -3.6753047 1.07619102 41.44770140 0.04836062 -0.391101253
## Y 1.2210403 -0.06160559 0.04836062 23.67235709 1.996173384
## NSA 0.4790387 -0.09852041 -0.39110125 1.99617338 0.255102041
## NSB 0.4332737 -0.08356293 -0.23973221 1.98653478 0.244897959
## EW -0.7188106 0.07418367 2.45199409 -0.37097791 -0.066751701
## CP 0.8946831 -0.61768707 -0.09964279 -0.27174314 0.005102041
## THOUS 0.0000000 0.00000000 0.00000000 0.00000000 0.000000000
## NEIGNO 16.1927984 -6.54937500 -56.20041594 -36.92979165 -1.916666667
## NSB EW CP THOUS NEIGNO
## AREA -0.0003917551 0.009551787 -0.027008620 0 -0.4436195
## PERIMETER -0.0200751620 0.011129271 -0.156448005 0 -1.7632600
## COLUMBUS_ -5.9791666667 1.520833333 0.604166667 0 91.3958333
## COLUMBUS_I -2.0833333333 -3.625000000 2.770833333 0 204.1666667
## POLYID -5.9791666667 1.520833333 0.604166667 0 91.3958333
## NEIG -2.0833333333 -3.625000000 2.770833333 0 204.1666667
## HOVAL 1.3551123805 -0.146364825 -4.588778860 0 -52.6077534
## INC -0.8398014349 0.426287023 -1.689073544 0 -3.8608970
## CRIME 1.1032160948 -0.388495334 6.352488843 0 56.5739759
## OPEN -0.0856056569 -0.253298787 0.155502282 0 9.3173659
## PLUMB 0.4332736901 -0.718810602 0.894683143 0 16.1927984
## DISCBD -0.0835629252 0.074183673 -0.617687075 0 -6.5493750
## X -0.2397322058 2.451994085 -0.099642794 0 -56.2004159
## Y 1.9865347827 -0.370977914 -0.271743137 0 -36.9297916
## NSA 0.2448979592 -0.066751701 0.005102041 0 -1.9166667
## NSB 0.2551020408 -0.058248299 -0.005102041 0 -2.0833333
## EW -0.0582482993 0.246598639 -0.004251701 0 -3.6250000
## CP -0.0051020408 -0.004251701 0.255102041 0 2.7708333
## THOUS 0.0000000000 0.000000000 0.000000000 0 0.0000000
## NEIGNO -2.0833333333 -3.625000000 2.770833333 0 204.1666667
# Dependent variable: HOVAL (housing value, in $1,000)
print("Variable Dependiente = HOVAL (valor de la vivienda en $1,000)")
## [1] "Variable Dependiente = HOVAL (valor de la vivienda en $1,000)"
# Raw HOVAL observations (exact-name extraction, no partial matching)
columbus_data[["HOVAL"]]
## [1] 80.467 44.567 26.350 33.200 23.225 28.750 75.000 37.125 52.600 96.400
## [11] 19.700 19.900 41.700 42.900 18.000 18.800 41.750 60.000 30.600 81.267
## [21] 19.975 30.450 47.733 53.200 17.900 20.300 34.100 22.850 32.500 22.500
## [31] 31.800 40.300 23.600 28.450 27.000 36.300 43.300 22.700 39.600 61.950
## [41] 42.100 44.333 25.700 33.500 27.733 76.100 42.500 26.800 35.800
# Quartiles, mean and range of HOVAL
summary(columbus_data[["HOVAL"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.90 25.70 33.50 38.44 43.30 96.40
# Exploratory plots ----
# Overview of the whole data set.
# NOTE(review): plot_histogram() is not defined in the visible part of the
# file; presumably DataExplorer::plot_histogram — confirm which package provides it.
plot_histogram(columbus_data)
# Selected variables: HOVAL histogram, INC-vs-HOVAL scatter plot, CRIME box plot.
# The argument expressions are kept verbatim because base graphics derive the
# default axis labels from them.
hist(columbus_data$HOVAL, main = "Histograma de HOVAL")
plot(columbus_data$INC, columbus_data$HOVAL, main = "Gráfico de Dispersión: INC vs HOVAL")
boxplot(columbus_data$CRIME, main = "Diagrama de Caja: CRIME")
# Map of the spatial distribution of HOVAL in Columbus, Ohio:
# polygons of `columbus` filled by HOVAL with the RdYlBu palette, plus borders.
tm_shape(columbus) + tm_fill("HOVAL", palette = "RdYlBu", title = "HOVAL") +
tm_borders() + tm_layout(main.title = "Mapa de Distribución de HOVAL en Columbus, Ohio")
# Correlation overview of the whole data set.
# NOTE(review): plot_correlation() is also not defined in the visible part of
# the file; presumably DataExplorer — confirm.
plot_correlation(columbus_data)
# Correlación con HOVAL
# Bar chart of the correlation between HOVAL and every other numeric variable.
#
# Args:
#   columbus_data_alt: a data frame (or an object coercible with
#     as.data.frame()) that contains a numeric column named "HOVAL".
#
# Returns (invisibly): the bar midpoints produced by barplot().
#
# Non-numeric columns are dropped because cor() rejects them. Constant columns
# are dropped because their correlation is undefined (cor() would return NA
# with a zero-standard-deviation warning and the bar would get an NA colour).
plot_correlation_HOVAL <- function(columbus_data_alt) {
  numeric_data <- as.data.frame(columbus_data_alt)
  numeric_data <- numeric_data[vapply(numeric_data, is.numeric, logical(1))]
  if (!"HOVAL" %in% names(numeric_data)) {
    stop("`columbus_data_alt` must contain a numeric column named HOVAL",
         call. = FALSE)
  }

  # A column with fewer than two distinct non-missing values has no variance.
  is_constant <- vapply(
    numeric_data,
    function(values) length(unique(values[!is.na(values)])) < 2L,
    logical(1)
  )
  is_constant["HOVAL"] <- FALSE # never drop the variable of interest
  numeric_data <- numeric_data[!is_constant]

  # Pairwise deletion keeps a variable usable when only some rows are missing.
  corr_matrix <- cor(numeric_data, use = "pairwise.complete.obs")

  # Correlations of HOVAL with the remaining variables; skip undefined ones.
  correlations_hoval <- corr_matrix["HOVAL", ]
  correlations_hoval <- correlations_hoval[!is.na(correlations_hoval)]

  # Blue bars for positive correlations, red for the rest; fixed y-range.
  barplot(correlations_hoval,
          main = "Correlación de HOVAL con todas las variables",
          xlab = "Variables",
          ylab = "Correlación",
          col = ifelse(correlations_hoval > 0, "blue", "red"),
          ylim = c(-1, 1))
}
# Draw the HOVAL correlation bar chart for the full data set.
# Note: THOUS has an all-zero row in the covariance output above, i.e. it is
# constant, so its correlation with HOVAL is undefined.
plot_correlation_HOVAL(columbus_data)
Las variables significativas son:
HOVAL (Valor de la vivienda): Representa el valor promedio de la vivienda en cada vecindario, expresado en miles de dólares ($1,000).
INC (Ingreso del hogar): Indica el ingreso promedio de los hogares en cada vecindario, también en miles de dólares ($1,000).
CRIME (Tasa de criminalidad): Mide la tasa de crímenes en cada vecindario, específicamente el número de robos residenciales y robos de vehículos por cada mil hogares en el vecindario.
OPEN (Espacio abierto): Indica la cantidad de espacio abierto o áreas verdes disponibles en cada vecindario.
PLUMB (Viviendas sin plomería): Representa el porcentaje de unidades de vivienda en cada vecindario que no cuentan con instalaciones de plomería.
DISCBD (Distancia al centro de la ciudad): Mide la distancia de cada vecindario al distrito central de negocios (CBD) de la ciudad.
EW (Dummy Este-Oeste): Es una variable ficticia que indica la ubicación este u oeste del vecindario. Si tiene un valor de 1, indica que el vecindario está al este; de lo contrario, está al oeste.
# Correlation plot across all variables in columbus_data_alt
corr_matrix <- cor(columbus_data_alt)
# `mar` reserves a top margin for the title: corrplot() uses zero margins by
# default, which clips the text passed through `title`.
corrplot(corr_matrix, method = "circle", type = "upper",
         tl.col = "black", tl.srt = 45, tl.cex = 0.8,
         title = "Gráfico de Correlación entre Variables",
         mar = c(0, 0, 2, 0))
# Correlation of HOVAL with the selected ("significant") variables only
correlations_hoval <- corr_matrix["HOVAL", ]
correlations_hoval
## HOVAL INC CRIME OPEN PLUMB DISCBD
## 1.00000000 0.46420480 -0.58064966 0.22966419 -0.27523324 0.40321712
## EW
## -0.01596125
# Turn the named correlation vector into a data frame with one row per
# variable: the value goes in `Correlation`, the name in `Variable`.
correlations_hoval_df <- data.frame(
  Correlation = unname(correlations_hoval),
  Variable = names(correlations_hoval),
  stringsAsFactors = FALSE
)
# Bar colour encodes the sign of the correlation
correlations_hoval_df$Color <- ifelse(correlations_hoval_df$Correlation > 0, "blue", "red")
# Bar chart with the rounded correlation printed next to each bar
ggplot(data = correlations_hoval_df, aes(x = Variable, y = Correlation, fill = Color)) +
  geom_col() +
  # Labels go above positive bars and below negative ones, so they never
  # overlap the bar itself.
  geom_text(aes(label = round(Correlation, 2),
                vjust = ifelse(Correlation > 0, -0.5, 1.5)),
            size = 3.5) +
  labs(title = "Correlación de HOVAL con las Variables más significativas",
       x = "Variables",
       y = "Correlación con HOVAL") +
  # Named values pin each level to its own colour. An unnamed vector is matched
  # by level order, which would paint a chart with only negative bars blue.
  scale_fill_manual(values = c(blue = "blue", red = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# OLS diagnostics ----
# Multicollinearity: variance inflation factors (all below 3 in the output).
vif(ols_model)
## INC CRIME OPEN PLUMB DISCBD EW
## 2.206434 2.932450 1.092822 1.921183 2.870002 1.258499
# Heteroskedasticity: p = 0.6671, so constant variance is not rejected.
bptest(ols_model) # Breusch-Pagan test
##
## studentized Breusch-Pagan test
##
## data: ols_model
## BP = 4.0708, df = 6, p-value = 0.6671
# Serial correlation of the residuals (p = 0.23).
# NOTE(review): this statistic depends on row order, which presumably has no
# natural meaning for cross-sectional neighbourhoods; the Moran test below is
# the spatial check — confirm this test is wanted here.
durbinWatsonTest(ols_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1270363 1.666834 0.23
## Alternative hypothesis: rho != 0
# Spatial autocorrelation of the OLS residuals under the map.linkW weights
# (p = 0.1235: not significant at the 5% level).
# NOTE(review): spdep also offers lm.morantest() for regression residuals —
# consider whether it is the more appropriate test.
moran.test(residuals(ols_model), listw = map.linkW)
##
## Moran I test under randomisation
##
## data: residuals(ols_model)
## weights: map.linkW
##
## Moran I statistic standard deviate = 1.1575, p-value = 0.1235
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.08334098 -0.02083333 0.00809922
# Normality of the residuals: p = 3.408e-05, so normality is rejected.
shapiro.test(residuals(ols_model)) # Shapiro-Wilk test
##
## Shapiro-Wilk normality test
##
## data: residuals(ols_model)
## W = 0.85987, p-value = 3.408e-05
# Visual normality check: residual quantiles against the normal reference line.
qqnorm(residuals(ols_model)) # Q-Q plot
qqline(residuals(ols_model))
Realizar los cambios necesarios para mejorar la estimación de los resultados.
¿Cuál de los modelos de regresión muestra los mejores resultados? Incluir una breve justificación de la selección de dicho modelo de regresión.
# Collect the RMSE of every fitted model in one table for comparison.
# NOTE(review): the printed values mix scales (about 0.28 for "Log-OLS" and the
# "Residuales" rows versus roughly 12-39 for the others), which suggests some
# RMSEs are on log(HOVAL) — confirm they are comparable before ranking models.
rmse_values <- data.frame(
  Model = c("OLS", "Log-OLS", "SAR","SAR Residuales", "SEM", "SEM Residuales", "XGBoost", "SVM", "Decision Tree", "Random Forest"),
  RMSE = c(
    RMSE_ols_model, RMSE_log_ols_model,
    RMSE_SAR, RMSE_SAR_residual,
    # The "SEM Residuales" row must use the SEM value, not the SAR one.
    # Assumes RMSE_SEM_residual is computed next to RMSE_SEM earlier in the
    # file — TODO confirm the name.
    RMSE_SEM, RMSE_SEM_residual,
    RMSE_XGB, RMSE_SVM, RMSE_decision_tree, RMSE_rf
  )
)
rmse_values
## Model RMSE
## 1 OLS 13.3312337
## 2 Log-OLS 0.2793216
## 3 SAR 39.2763645
## 4 SAR Residuales 0.2787592
## 5 SEM 39.2734646
## 6 SEM Residuales 0.2787592
## 7 XGBoost 23.3496309
## 8 SVM 37.2105385
## 9 Decision Tree 37.1415810
## 10 Random Forest 12.7301156
# Bar chart of RMSE per model, with the rounded value printed above each bar
ggplot(data = rmse_values, mapping = aes(Model, RMSE)) +
  geom_col(fill = "skyblue") +
  # Value labels sit just above the bars
  geom_text(
    mapping = aes(label = round(RMSE, digits = 3)),
    size = 3.5,
    vjust = -0.5
  ) +
  labs(
    x = "Modelo de Regresión",
    y = "RMSE",
    title = "RMSE por Modelo de Regresión"
  ) +
  theme_minimal() +
  # Tilt the model names so they do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))