
La base de datos USArrests contiene estadísticas de arrestos por cada 100,000 residentes por agresión, asesinato y violación en cada uno de los 50 estados de EE.UU. en 1973.
#install.packages("caret") #Algoritmos de aprendizaje automático
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
#install.packages("datasets") #Para usar la base de datos "USArrests"
library(datasets)
#install.packages("lattice") #Crear gráficos
library(lattice)
#install.packages("DataExplorer") #Análisis Descriptivo
library(DataExplorer)
#install.packages("kernlab")
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
#install.packages("dplyr") # Manipulación de datos
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("cluster")
library(cluster)
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("data.table")
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ kernlab::alpha() masks ggplot2::alpha()
## ✖ data.table::between() masks dplyr::between()
## ✖ purrr::cross() masks kernlab::cross()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Exploratory overview of the raw data ----
# Local working copy of the built-in USArrests data set.
df <- datasets::USArrests

# Full DataExplorer report; left disabled.
# create_report(data = df)

# DataExplorer overview plots: missing values, distributions and
# pairwise correlations of the columns.
plot_missing(data = df)
plot_histogram(data = df)
plot_correlation(data = df)

# Min / quartiles / mean / max for every column.
summary(object = df)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
# K-means segmentation of the states by crime rates ----

# Keep only the three crime-rate columns; every other column of df is
# dropped from here on.
variables_crimen <- c("Murder", "Assault", "Rape")
df <- df[, variables_crimen]

# Standardise each column (centre to mean 0, scale to sd 1) so that no
# single variable dominates the distances k-means works with.
datos_escalados <- scale(df)

grupos <- 3 # Initial number of clusters

# k-means starts from randomly chosen centres. Seed the RNG so the
# partition (and the cluster numbering) is reproducible between runs, and
# try several random starts so a poor local optimum is not kept by chance.
set.seed(123)
segmentos <- kmeans(datos_escalados, centers = grupos, nstart = 25)

# Attach the cluster assigned to each state as a factor column.
df$Cluster <- as.factor(segmentos$cluster)

# Per-state mean of the three raw (unscaled) rates.
# NOTE(review): because the rates are averaged unscaled, the score tracks
# the column with the largest magnitude most closely -- confirm this is
# intended rather than averaging the standardised values.
df$Promedio_Score <- rowMeans(df[, variables_crimen])

# Bin the score into its four quartiles, from lowest ("Muy Seguro") to
# highest ("Muy Inseguro"); include.lowest keeps the minimum in bin one.
df$Seguridad <- cut(
  df$Promedio_Score,
  breaks = quantile(df$Promedio_Score, probs = c(0, 0.25, 0.5, 0.75, 1)),
  labels = c("Muy Seguro", "Seguro", "Inseguro", "Muy Inseguro"),
  include.lowest = TRUE
)

# Plot the clusters found on the standardised data.
fviz_cluster(segmentos, data = datos_escalados)
# Gap statistic for choosing the number of clusters ----
# Seed the RNG first: clusGap draws random reference samples and kmeans
# uses random starting centres.
set.seed(123)
optimizacion <- clusGap(
  x = datos_escalados,
  FUNcluster = kmeans, # spelled out instead of the partially matched FUN=
  K.max = 10,
  nstart = 1 # forwarded to kmeans()
)
plot(optimizacion, xlab = "Numero de Clusters K")
# Per-cluster mean of every numeric variable ----
# Average only the numeric columns. Dropping just the last column still
# passed the factor column Cluster to mean(), which raised "argument is not
# numeric or logical" warnings and produced an all-NA Cluster column.
columnas_numericas <- vapply(df, is.numeric, logical(1))
promedio <- aggregate(
  df[, columnas_numericas, drop = FALSE],
  by = list(df$Cluster), # left unnamed so the group column stays "Group.1"
  FUN = mean
)
promedio
## Group.1 Murder Assault Rape Promedio_Score
## 1 1 12.331579 259.31579 29.21579 100.28772
## 2 2 3.078571 80.92857 11.80000 31.93571
## 3 3 6.588235 145.76471 20.07647 57.47647
# Number of states assigned to each cluster.
table(df[["Cluster"]])
##
## 1 2 3
## 19 14 17