Cargue inicial de librerías
library(ggplot2)
library(CGPfunctions)
library(plotly)
##
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library (MASS)
##
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
library(sqldf)
## Cargando paquete requerido: gsubfn
## Cargando paquete requerido: proto
## Cargando paquete requerido: RSQLite
library(agricolae)
Definir la ruta de trabajo
# Switch the working directory so the relative file name below resolves there.
setwd("G:/TRABAJO/DOCENCIA/KONRAND LORENZ/Nivelatorios")
library(readxl)

# Read the Excel workbook into a tibble and preview its first five rows.
datos_olx <- read_excel("mazda2_col_1.xlsx")
head(datos_olx, n = 5)
## # A tibble: 5 × 8
## `web-scraper-order` precio kilometraje transmision modelo_ano color ciudad
## <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 1662992063-359 58000000 33000 Mecanica 2019 Blanco Bogota
## 2 1662992030-348 56900000 100000 Automatica 2016 Azul Medell…
## 3 1662991456-146 38000000 92000 Mecanica 2013 Azul Bogota
## 4 1662991768-254 64900000 35000 Mecanica 2019 Azul Bogota
## 5 1662992126-381 54900000 61000 Automatica 2016 Azul Bogota
## # ℹ 1 more variable: Depto <chr>
# Show the column names, types and first values of the loaded table.
str(datos_olx)
## tibble [318 × 8] (S3: tbl_df/tbl/data.frame)
## $ web-scraper-order: chr [1:318] "1662992063-359" "1662992030-348" "1662991456-146" "1662991768-254" ...
## $ precio : num [1:318] 58000000 56900000 38000000 64900000 54900000 41900000 72500000 66000000 42000000 38500000 ...
## $ kilometraje : num [1:318] 33000 100000 92000 35000 61000 ...
## $ transmision : chr [1:318] "Mecanica" "Automatica" "Mecanica" "Mecanica" ...
## $ modelo_ano : num [1:318] 2019 2016 2013 2019 2016 ...
## $ color : chr [1:318] "Blanco" "Azul" "Azul" "Azul" ...
## $ ciudad : chr [1:318] "Bogota" "Medellin" "Bogota" "Bogota" ...
## $ Depto : chr [1:318] "Bogotá" "Antioquia" "Bogotá" "Bogotá" ...
# NOTE(review): attach() puts the columns of datos_olx on the search path so
# that later calls can use bare names such as precio and kilometraje. A global
# variable with the same name would take precedence over the attached column;
# prefer data = datos_olx or datos_olx$col in new code.
attach(datos_olx)
Exploración inicial
# Initial exploration: price and mileage box plots plus a histogram of the
# model year, drawn side by side in a 1 x 3 grid.
# par() returns the previous settings, which are restored afterwards so the
# three-panel layout does not leak into later base-graphics plots.
op <- par(mfrow = c(1, 3))
# with() resolves the columns from datos_olx itself, so the plots do not
# depend on an earlier attach() or on same-named objects in the workspace,
# while the default axis label of hist() stays "modelo_ano".
with(datos_olx, {
  boxplot(precio, main = "Precio Vehiculos")
  boxplot(kilometraje, main = "KM Vehiculos")
  hist(modelo_ano, main = "Modelos Vehiculos")
})
par(op)
Conteo de vehículos por modelo
# Number of listings per model year, oldest year first.
# count(*) is the portable SQL row count (a bare count() only works on SQLite).
q1 <- sqldf("select modelo_ano, count(*) as conteo
from datos_olx
group by modelo_ano
order by modelo_ano asc
")
q1
## modelo_ano conteo
## 1 1995 1
## 2 2007 2
## 3 2008 16
## 4 2009 13
## 5 2010 16
## 6 2011 33
## 7 2012 29
## 8 2013 30
## 9 2014 18
## 10 2015 20
## 11 2016 18
## 12 2017 23
## 13 2018 21
## 14 2019 23
## 15 2020 14
## 16 2021 17
## 17 2022 24
# Number of listings and average price per transmission type.
# count(*) is the portable SQL row count (a bare count() only works on SQLite).
q2 <- sqldf("select transmision, count(*) as conteo, avg(precio) as precio_prom
from datos_olx
group by transmision
")
q2
## transmision conteo precio_prom
## 1 Automatica 174 56043563
## 2 Mecanica 144 43486944
Validación lineal
# Scatter plot of price against mileage with a fitted straight line
# (lm smoother) drawn on top of the points, using the black-and-white theme.
g1 <- ggplot(datos_olx, aes(x = kilometraje, y = precio)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_bw()
g1
## `geom_smooth()` using formula = 'y ~ x'
Verifiquemos mediante un análisis de regresión lineal
# Simple linear regression of price on mileage.
# data = datos_olx makes the formula take its variables from the table itself
# instead of depending on an earlier attach() or on whatever objects named
# precio / kilometraje happen to exist in the workspace.
modelo1 <- lm(precio ~ kilometraje, data = datos_olx)
summary(modelo1)
##
## Call:
## lm(formula = precio ~ kilometraje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60440623 -6838351 -476261 6611820 108038776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.225e+07 1.660e+06 43.51 <2e-16 ***
## kilometraje -2.911e+02 1.883e+01 -15.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15470000 on 316 degrees of freedom
## Multiple R-squared: 0.4307, Adjusted R-squared: 0.4289
## F-statistic: 239 on 1 and 316 DF, p-value: < 2.2e-16