library(janitor)
library(tidyverse)
library(flextable)
library(moments)
# Point the session at the project folder and load the raw car dataset.
# NOTE(review): setwd() makes the script machine-specific; it is kept as-is
# because code outside this view may rely on the working directory.
setwd("~/Downloads/proy_cars_almada")
df <- read.csv("car_dataset.csv")

# The frequency tables further down list "SUV" three times and "Volkswagen"
# twice, which is what happens when otherwise equal labels differ only in
# invisible padding (presumably stray spaces in the CSV -- confirm against the
# raw file). Trim every text column once, right after loading, so each
# category is counted under a single label. "[\\h\\v]" also covers
# non-breaking spaces, which the default trimws() pattern would miss.
df[] <- lapply(df, function(columna) {
  if (is.character(columna)) {
    trimws(columna, whitespace = "[\\h\\v]")
  } else {
    columna
  }
})

# Show the column names exactly as they were imported.
colnames(df)
## [1] "Car.Make" "Car.Model"
## [3] "Year" "Body.Type"
## [5] "Color.Options" "Fuel.Type"
## [7] "Engine.Size..L." "Horsepower"
## [9] "Torque..Nm." "Transmission.Type"
## [11] "Acceleration..0.60.mph." "Top.Speed..mph."
## [13] "Mileage..MPG." "Safety.Features"
## [15] "Entertainment.Features" "Interior.Features"
## [17] "Exterior.Features" "Price...."
## [19] "Customer.Ratings" "Sales.Figures..Units.Sold."
# Keep the first six rows (head()'s default) as a quick preview of the data.
previo <- head(df, n = 6L)
library(ggplot2)
# Replace the imported headers with Spanish labels. The assignment is
# positional, so the order below must match the column order of `df`.
names(df) <- c(
  "Marca",
  "Modelo",
  "Año",
  "Forma",
  "Colores Disponibles",
  "Tipo de Combustible",
  "Tamaño del Motor",
  "Caballos de Fuerza",
  "Torque",
  "Tipo de Transmisión",
  "Aceleración",
  "Velocidad Máxima",
  "Kilometraje",
  "Seguridad",
  "Entretenimiento",
  "Interior",
  "Exterior",
  "Precio",
  "Calificación del cliente",
  "Unidades Vendidas"
)
# Display the new headers to confirm the rename.
names(df)
## [1] "Marca" "Modelo"
## [3] "Año" "Forma"
## [5] "Colores Disponibles" "Tipo de Combustible"
## [7] "Tamaño del Motor" "Caballos de Fuerza"
## [9] "Torque" "Tipo de Transmisión"
## [11] "Aceleración" "Velocidad Máxima"
## [13] "Kilometraje" "Seguridad"
## [15] "Entretenimiento" "Interior"
## [17] "Exterior" "Precio"
## [19] "Calificación del cliente" "Unidades Vendidas"
# Count how many rows fall under each body type (column "Forma").
table(df[["Forma"]])
##
## Convertible Coupe Hatchback Minivan Sedan
## 7 11 18 3 27
## SUV SUV SUV Truck Wagon
## 1 1 82 12 2
# Hand-entered counts for eleven brands, stored as a named numeric vector
# (brand name -> count).
marcas_autos_top11r <- setNames(
  c(1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5),
  c(
    "Infiniti", "Acura", "Tesla", "Mercedes", "Volkswagen", "Volvo",
    "Jeep", "BMW", "Mazda", "Lexus", "Audi"
  )
)
# Tabulate how many of those brands share each count value.
table(marcas_autos_top11r)
## marcas_autos_top11r
## 1 2 3 4 5
## 1 2 3 2 3
# Number of rows and number of columns of the data frame, in that order.
c(nrow(df), ncol(df))
## [1] 164 20
# Frequency table (count and proportion) of the body types.
tabyl(df, Forma)
## Forma n percent
## Convertible 7 0.042682927
## Coupe 11 0.067073171
## Hatchback 18 0.109756098
## Minivan 3 0.018292683
## SUV 1 0.006097561
## SUV 1 0.006097561
## SUV 82 0.500000000
## Sedan 27 0.164634146
## Truck 12 0.073170732
## Wagon 2 0.012195122
# Body-type frequencies rendered as a flextable: proportions shown as
# percentages, 14-point font, column widths fitted to the content.
tabyl(df, Forma) %>%
  adorn_pct_formatting() %>%
  flextable() %>%
  fontsize(size = 14) %>%
  autofit()
Forma | n | percent |
|---|---|---|
Convertible | 7 | 4.3% |
Coupe | 11 | 6.7% |
Hatchback | 18 | 11.0% |
Minivan | 3 | 1.8% |
SUV | 1 | 0.6% |
SUV | 1 | 0.6% |
SUV | 82 | 50.0% |
Sedan | 27 | 16.5% |
Truck | 12 | 7.3% |
Wagon | 2 | 1.2% |
Esta es una tabla mejor:
# Same body-type table as a flextable, now with the boxed theme applied last.
tabyl(df, Forma) %>%
  adorn_pct_formatting() %>%
  flextable() %>%
  fontsize(size = 14) %>%
  autofit() %>%
  theme_box()
Forma | n | percent |
|---|---|---|
Convertible | 7 | 4.3% |
Coupe | 11 | 6.7% |
Hatchback | 18 | 11.0% |
Minivan | 3 | 1.8% |
SUV | 1 | 0.6% |
SUV | 1 | 0.6% |
SUV | 82 | 50.0% |
Sedan | 27 | 16.5% |
Truck | 12 | 7.3% |
Wagon | 2 | 1.2% |
Esta le agrega el total al final:
# Body-type table with a totals row appended before the percentages are
# formatted, then rendered as a boxed flextable.
tabyl(df, Forma) %>%
  adorn_totals(where = "row") %>%
  adorn_pct_formatting() %>%
  flextable() %>%
  fontsize(size = 14) %>%
  autofit() %>%
  theme_box()
Forma | n | percent |
|---|---|---|
Convertible | 7 | 4.3% |
Coupe | 11 | 6.7% |
Hatchback | 18 | 11.0% |
Minivan | 3 | 1.8% |
SUV | 1 | 0.6% |
SUV | 1 | 0.6% |
SUV | 82 | 50.0% |
Sedan | 27 | 16.5% |
Truck | 12 | 7.3% |
Wagon | 2 | 1.2% |
Total | 164 | 100.0% |
# Brand frequencies with a totals row, rendered as a boxed flextable.
tabyl(df, Marca) %>%
  adorn_totals(where = "row") %>%
  adorn_pct_formatting() %>%
  flextable() %>%
  fontsize(size = 14) %>%
  autofit() %>%
  theme_box()
Marca | n | percent |
|---|---|---|
Acura | 2 | 1.2% |
Audi | 5 | 3.0% |
BMW | 4 | 2.4% |
Chevrolet | 24 | 14.6% |
Ford | 22 | 13.4% |
GMC | 8 | 4.9% |
Genesis | 6 | 3.7% |
Honda | 11 | 6.7% |
Hyundai | 10 | 6.1% |
Infiniti | 1 | 0.6% |
Jeep | 4 | 2.4% |
Kia | 11 | 6.7% |
Lexus | 5 | 3.0% |
Mazda | 5 | 3.0% |
Mercedes | 3 | 1.8% |
Nissan | 7 | 4.3% |
Subaru | 9 | 5.5% |
Tesla | 2 | 1.2% |
Toyota | 19 | 11.6% |
Volkswagen | 1 | 0.6% |
Volkswagen | 2 | 1.2% |
Volvo | 3 | 1.8% |
Total | 164 | 100.0% |
# Model frequencies with a totals row, rendered as a boxed flextable.
tabyl(df, Modelo) %>%
  adorn_totals(where = "row") %>%
  adorn_pct_formatting() %>%
  flextable() %>%
  fontsize(size = 14) %>%
  autofit() %>%
  theme_box()
Modelo | n | percent |
|---|---|---|
4Runner | 1 | 0.6% |
5 Series | 2 | 1.2% |
A4 | 1 | 0.6% |
A5 | 1 | 0.6% |
Acadia | 6 | 3.7% |
Ascent | 1 | 0.6% |
Atlas | 1 | 0.6% |
Bolt EUV | 7 | 4.3% |
Bronco | 1 | 0.6% |
C-Class | 1 | 0.6% |
CR-V | 1 | 0.6% |
CX-5 | 2 | 1.2% |
Camaro | 7 | 4.3% |
Camry | 7 | 4.3% |
Carnival | 1 | 0.6% |
Civic | 2 | 1.2% |
Compass | 1 | 0.6% |
Corvette | 7 | 4.3% |
Crosstrek | 1 | 0.6% |
ES | 1 | 0.6% |
Edge | 1 | 0.6% |
Equinox | 1 | 0.6% |
Escape | 2 | 1.2% |
Expedition | 1 | 0.6% |
Explorer | 7 | 4.3% |
F-150 | 1 | 0.6% |
Fit | 6 | 3.7% |
Forester | 2 | 1.2% |
Frontier | 1 | 0.6% |
GLC-Class | 1 | 0.6% |
GV80 | 6 | 3.7% |
Grand Cherokee | 1 | 0.6% |
HR-V | 1 | 0.6% |
Highlander | 1 | 0.6% |
IS | 1 | 0.6% |
Jetta | 1 | 0.6% |
Legacy | 2 | 1.2% |
MDX | 1 | 0.6% |
MX-5 Miata | 1 | 0.6% |
Maverick | 6 | 3.7% |
Mazda6 | 2 | 1.2% |
Model 3 | 2 | 1.2% |
Mustang | 2 | 1.2% |
NX | 1 | 0.6% |
Odyssey | 1 | 0.6% |
Outback | 3 | 1.8% |
Pathfinder | 2 | 1.2% |
Prius | 6 | 3.7% |
Q5 | 2 | 1.2% |
Q7 | 1 | 0.6% |
QX50 | 1 | 0.6% |
RAV4 | 1 | 0.6% |
RDX | 1 | 0.6% |
RX | 2 | 1.2% |
Ranger | 1 | 0.6% |
Rogue | 1 | 0.6% |
Rogue Sport | 1 | 0.6% |
S-Class | 1 | 0.6% |
Santa Fe | 1 | 0.6% |
Sentra | 1 | 0.6% |
Sienna | 1 | 0.6% |
Sierra | 1 | 0.6% |
Sonata | 2 | 1.2% |
Sorento | 1 | 0.6% |
Soul | 6 | 3.7% |
Sportage | 2 | 1.2% |
Tacoma | 1 | 0.6% |
Tahoe | 1 | 0.6% |
Taos | 1 | 0.6% |
Telluride | 1 | 0.6% |
Trax | 1 | 0.6% |
Tucson | 1 | 0.6% |
Tundra | 1 | 0.6% |
Venue | 6 | 3.7% |
Versa | 1 | 0.6% |
Wrangler | 2 | 1.2% |
X5 | 2 | 1.2% |
XC40 | 1 | 0.6% |
XC60 | 2 | 1.2% |
Yukon | 1 | 0.6% |
Total | 164 | 100.0% |
# Bar chart of the number of rows per model, one filled bar per model.
tabyl(df, Modelo) %>%
  ggplot(mapping = aes(x = Modelo, y = n, fill = Modelo)) +
  geom_col()
Esta es la mejor tabla:
# Print the body-type frequency table again as the basis for the next chart.
tabyl(df, Forma)
## Forma n percent
## Convertible 7 0.042682927
## Coupe 11 0.067073171
## Hatchback 18 0.109756098
## Minivan 3 0.018292683
## SUV 1 0.006097561
## SUV 1 0.006097561
## SUV 82 0.500000000
## Sedan 27 0.164634146
## Truck 12 0.073170732
## Wagon 2 0.012195122
# Bar chart of the number of rows per body type, one filled bar per type.
tabyl(df, Forma) %>%
  ggplot(mapping = aes(x = Forma, y = n, fill = Forma)) +
  geom_col()
# Frequency table (count and proportion) of the brands.
tabyl(df, Marca)
## Marca n percent
## Acura 2 0.012195122
## Audi 5 0.030487805
## BMW 4 0.024390244
## Chevrolet 24 0.146341463
## Ford 22 0.134146341
## GMC 8 0.048780488
## Genesis 6 0.036585366
## Honda 11 0.067073171
## Hyundai 10 0.060975610
## Infiniti 1 0.006097561
## Jeep 4 0.024390244
## Kia 11 0.067073171
## Lexus 5 0.030487805
## Mazda 5 0.030487805
## Mercedes 3 0.018292683
## Nissan 7 0.042682927
## Subaru 9 0.054878049
## Tesla 2 0.012195122
## Toyota 19 0.115853659
## Volkswagen 1 0.006097561
## Volkswagen 2 0.012195122
## Volvo 3 0.018292683
# Bar chart of the number of rows per brand, one filled bar per brand.
tabyl(df, Marca) %>%
  ggplot(mapping = aes(x = Marca, y = n, fill = Marca)) +
  geom_col()
Esta es la mejor opción hasta el momento:
# Body-type bar chart with axis labels and a title.
tabyl(df, Forma) %>%
  ggplot(mapping = aes(x = Forma, y = n, fill = Forma)) +
  geom_col() +
  labs(
    title = "Formas de los autos",
    x = "Forma",
    y = "Frecuencias"
  )
Otra prueba, pero ahora con las frecuencias (números) visibles:
# Body-type bar chart with each bar's count printed in bold white text;
# vjust = 1.5 places the label just below the top edge of the bar.
tabyl(df, Forma) %>%
  ggplot(mapping = aes(x = Forma, y = n, fill = Forma)) +
  geom_col() +
  geom_text(
    mapping = aes(label = n),
    vjust = 1.5,
    colour = "white",
    fontface = "bold"
  ) +
  labs(
    title = "Formas de los autos",
    x = "Forma",
    y = "Frecuencias"
  )
En vez de números, que las etiquetas muestren porcentajes:
# Body-type bar chart whose labels show each bar's share as a percentage with
# two decimals (bar heights are still the raw counts).
tabyl(df, Forma) %>%
  ggplot(mapping = aes(x = Forma, y = n, fill = Forma)) +
  geom_col() +
  geom_text(
    mapping = aes(label = sprintf("%.2f%%", percent * 100)),
    vjust = 1.5,
    colour = "white",
    fontface = "bold"
  ) +
  labs(
    title = "Formas de los autos",
    x = "Forma",
    y = "Frecuencias"
  )
Más ejemplos:
Números al azar de la campana de Gauss: los más probables son los cercanos a la media (20 en este ejemplo).
Se puede usar el histograma cuando es difícil hacer una tabla de frecuencias, es decir, cuando es poco probable que los números se repitan.
# Draw 100,000 values from a normal distribution (mean 20, sd 1) and plot
# their histogram with light-blue bars outlined in blue.
n <- 1e5
numeros <- rnorm(n, mean = 20, sd = 1)
df1 <- data.frame(numeros = numeros)
ggplot(df1, mapping = aes(x = numeros)) +
  geom_histogram(fill = "lightblue", colour = "blue") +
  labs(
    title = "Campana de Gauss Experimental",
    x = "Números",
    y = "Frecuencia"
  )