Cargue inicial de librerías

library(ggplot2)
library(CGPfunctions)
library(plotly)
## 
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library (MASS)
## 
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
library(sqldf)
## Cargando paquete requerido: gsubfn
## Cargando paquete requerido: proto
## Cargando paquete requerido: RSQLite
library(agricolae)

Definir la ruta de trabajo y cargar los datos

# Switch the session to the course folder so the workbook can be opened by
# its bare file name.
setwd("G:/TRABAJO/DOCENCIA/KONRAND LORENZ/Nivelatorios")
library(readxl)
# Read the Excel workbook into datos_olx.
datos_olx <- read_excel(path = "mazda2_col_1.xlsx")
# Preview the first five rows.
head(datos_olx, n = 5)
## # A tibble: 5 × 8
##   `web-scraper-order`   precio kilometraje transmision modelo_ano color  ciudad 
##   <chr>                  <dbl>       <dbl> <chr>            <dbl> <chr>  <chr>  
## 1 1662992063-359      58000000       33000 Mecanica          2019 Blanco Bogota 
## 2 1662992030-348      56900000      100000 Automatica        2016 Azul   Medell…
## 3 1662991456-146      38000000       92000 Mecanica          2013 Azul   Bogota 
## 4 1662991768-254      64900000       35000 Mecanica          2019 Azul   Bogota 
## 5 1662992126-381      54900000       61000 Automatica        2016 Azul   Bogota 
## # ℹ 1 more variable: Depto <chr>
# Show the structure of datos_olx: dimensions, column names, column types and
# the first few values of each column.
str(datos_olx)
## tibble [318 × 8] (S3: tbl_df/tbl/data.frame)
##  $ web-scraper-order: chr [1:318] "1662992063-359" "1662992030-348" "1662991456-146" "1662991768-254" ...
##  $ precio           : num [1:318] 58000000 56900000 38000000 64900000 54900000 41900000 72500000 66000000 42000000 38500000 ...
##  $ kilometraje      : num [1:318] 33000 100000 92000 35000 61000 ...
##  $ transmision      : chr [1:318] "Mecanica" "Automatica" "Mecanica" "Mecanica" ...
##  $ modelo_ano       : num [1:318] 2019 2016 2013 2019 2016 ...
##  $ color            : chr [1:318] "Blanco" "Azul" "Azul" "Azul" ...
##  $ ciudad           : chr [1:318] "Bogota" "Medellin" "Bogota" "Bogota" ...
##  $ Depto            : chr [1:318] "Bogotá" "Antioquia" "Bogotá" "Bogotá" ...
# Place the columns of datos_olx on the search path so they can be referenced
# by bare name (e.g. precio, kilometraje). Any statement that uses a column
# without the `datos_olx$` prefix or a `data =` argument depends on this call.
# NOTE(review): attach() is fragile — an object in the global environment with
# the same name as a column takes precedence over the attached column, and the
# attached copy does not follow later changes to datos_olx.
attach(datos_olx)

Exploración inicial

# Initial exploration: three base-graphics panels side by side — boxplots of
# precio and kilometraje, and a histogram of modelo_ano.
# par() returns the previous settings; they are restored at the end so the
# 1 x 3 layout does not leak into later base plots.
old_par <- par(mfrow = c(1, 3))
# Columns are taken explicitly from datos_olx instead of relying on bare names
# being resolvable on the search path.
boxplot(datos_olx$precio, main = "Precio Vehiculos")
boxplot(datos_olx$kilometraje, main = "KM Vehiculos")
# xlab is fixed by hand because hist() otherwise labels the axis with the
# deparsed argument expression ("datos_olx$modelo_ano").
hist(datos_olx$modelo_ano, main = "Modelos Vehiculos", xlab = "modelo_ano")
par(old_par)

Conteo de vehículos por año de modelo y precio promedio por tipo de transmisión

# Count the rows of datos_olx for each modelo_ano, ordered by modelo_ano
# ascending.
q1 <- sqldf(
  x = "select modelo_ano, count() as conteo
          from datos_olx
          group by modelo_ano
          order by modelo_ano asc
         "
)
q1
##    modelo_ano conteo
## 1        1995      1
## 2        2007      2
## 3        2008     16
## 4        2009     13
## 5        2010     16
## 6        2011     33
## 7        2012     29
## 8        2013     30
## 9        2014     18
## 10       2015     20
## 11       2016     18
## 12       2017     23
## 13       2018     21
## 14       2019     23
## 15       2020     14
## 16       2021     17
## 17       2022     24
# For each transmision value: number of rows and mean precio.
q2 <- sqldf(
  x = "select transmision, count() as               conteo, avg(precio) as precio_prom
          from datos_olx
          group by transmision
         "
)
q2
##   transmision conteo precio_prom
## 1  Automatica    174    56043563
## 2    Mecanica    144    43486944

Validación de la relación lineal entre kilometraje y precio

# Scatter plot of precio against kilometraje with a fitted straight line
# (geom_smooth using lm) drawn on top, in the black-and-white theme.
g1 <- ggplot(datos_olx, aes(x = kilometraje, y = precio)) +
  geom_point() +
  theme_bw() +
  geom_smooth(method = lm)
g1
## `geom_smooth()` using formula = 'y ~ x'

Verifiquemos la relación mediante un análisis de regresión lineal simple (precio en función del kilometraje)

# Simple linear regression of precio on kilometraje. The data frame is passed
# through `data =` so both variables are resolved inside datos_olx rather than
# through whatever objects named precio / kilometraje happen to be visible on
# the search path.
modelo1 <- lm(precio ~ kilometraje, data = datos_olx)
# Print the residual summary, coefficient table, R-squared and overall F-test.
summary(modelo1)
## 
## Call:
## lm(formula = precio ~ kilometraje)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -60440623  -6838351   -476261   6611820 108038776 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.225e+07  1.660e+06   43.51   <2e-16 ***
## kilometraje -2.911e+02  1.883e+01  -15.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15470000 on 316 degrees of freedom
## Multiple R-squared:  0.4307, Adjusted R-squared:  0.4289 
## F-statistic:   239 on 1 and 316 DF,  p-value: < 2.2e-16