Análisis exploratorio de datos: caso base de datos OLX

ANÁLISIS DE LA BASE DE DATOS OLX

En este documento se presenta el análisis exploratorio de la base de datos de OLX, la cual contiene información de ventas de vivienda en la ciudad de Cali, Colombia. La exploración consiste en calcular los estimadores estadísticos y graficar los datos que nos permitan describir el sector inmobiliario de la ciudad.

# Load the required libraries (they must already be installed)
library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(reshape2)
library(stats)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

library(readxl)
library(reticulate)

# Path of the Python executable found on the system PATH
Sys.which('python')

##                                                                python 
## "C:\\Users\\itco4021\\AppData\\Local\\Programs\\PYTHON~1\\python.exe"

# Ask reticulate to use this specific interpreter (machine-specific path).
# NOTE(review): the py_config() output recorded in this document reports a
# different interpreter and says the version was forced by RETICULATE_PYTHON,
# so this call may have had no effect -- confirm.
use_python("C:\\Users\\itco4021\\AppData\\Local\\Programs\\PYTHON~1\\python.exe")


## Show the Python configuration reticulate is actually using
py_config()

## python:         C:/Users/itco4021/AppData/Local/Programs/PythonCodingPack/python.exe
## libpython:      C:/Users/itco4021/AppData/Local/Programs/PythonCodingPack/python38.dll
## pythonhome:     C:/Users/itco4021/AppData/Local/Programs/PythonCodingPack
## version:        3.8.5 (tags/v3.8.5:580fbb0, Jul 20 2020, 15:57:54) [MSC v.1924 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Users/itco4021/AppData/Local/Programs/PythonCodingPack/Lib/site-packages/numpy
## numpy_version:  1.19.1
## 
## NOTE: Python version was forced by RETICULATE_PYTHON

EXPLORACIÓN DE DATOS CON R

# Load the OLX housing listings into a data frame.

# file.choose()  # interactive alternative for locating the workbook

# Single source of truth for the workbook location; this is the path the
# original read_excel() call actually used. Adjust it to the local copy.
# (An earlier location was "D:\\Ambiente\\escritorio\\olx_viviendas_cali.xlsx".)
ruta_excel <- "d:/mis documentos/olx_viviendas_cali.xlsx"

# The listings are stored in the "datos" sheet of the workbook.
dfOLX <- read_excel(ruta_excel, sheet = "datos")

# Per-column summary of the loaded data
summary(dfOLX)

##       ID                URL               ciudad              Zona          
##  Length:5919        Length:5919        Length:5919        Length:5919       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Barrio          Cordenada_latitud cordenada_longitud     Tipo          
##  Length:5919        Min.   :3.400     Min.   :-76.60     Length:5919       
##  Class :character   1st Qu.:3.425     1st Qu.:-76.54     Class :character  
##  Mode  :character   Median :3.449     Median :-76.53     Mode  :character  
##                     Mean   :3.447     Mean   :-76.53                       
##                     3rd Qu.:3.467     3rd Qu.:-76.51                       
##                     Max.   :3.576     Max.   :-76.21                       
##                                                                            
##       piso           Estrato      Area_contruida    parqueaderos   
##  Min.   : 1.000   Min.   :1.000   Min.   :  30.0   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.:3.000   1st Qu.:  84.0   1st Qu.: 1.000  
##  Median : 3.000   Median :4.000   Median : 137.0   Median : 2.000  
##  Mean   : 3.764   Mean   :4.341   Mean   : 187.6   Mean   : 1.833  
##  3rd Qu.: 5.000   3rd Qu.:5.000   3rd Qu.: 240.0   3rd Qu.: 2.000  
##  Max.   :15.000   Max.   :6.000   Max.   :2000.0   Max.   :10.000  
##  NA's   :2204     NA's   :21                       NA's   :1524    
##      Baños         Habitaciones        Precio         
##  Min.   : 0.000   Min.   : 0.000   Min.   :3.000e+07  
##  1st Qu.: 2.000   1st Qu.: 3.000   1st Qu.:1.950e+08  
##  Median : 3.000   Median : 3.000   Median :3.300e+08  
##  Mean   : 3.092   Mean   : 3.871   Mean   :4.425e+08  
##  3rd Qu.: 4.000   3rd Qu.: 4.000   3rd Qu.:5.300e+08  
##  Max.   :10.000   Max.   :20.000   Max.   :4.500e+09  
##

# Preview the first rows of the data frame
head(dfOLX)

## # A tibble: 6 x 15
##   ID     URL   ciudad Zona  Barrio Cordenada_latit~ cordenada_longi~ Tipo   piso
##   <chr>  <chr> <chr>  <chr> <chr>             <dbl>            <dbl> <chr> <dbl>
## 1 15644~ http~ Cali   Zona~ la bu~             3.48            -76.2 Casa      2
## 2 15644~ http~ Cali   Zona~ villa~             3.40            -76.4 Casa     NA
## 3 15644~ http~ Cali   Zona~ crist~             3.40            -76.5 Apar~     2
## 4 15644~ http~ Cali   Zona~ pobla~             3.41            -76.5 Casa     NA
## 5 15644~ http~ Cali   Zona~ pobla~             3.41            -76.5 Casa      1
## 6 15644~ http~ Cali   Zona~ valle~             3.43            -76.5 Apar~    NA
## # ... with 6 more variables: Estrato <dbl>, Area_contruida <dbl>,
## #   parqueaderos <dbl>, Baños <dbl>, Habitaciones <dbl>, Precio <dbl>

# Histogram of built area with default settings
hist(dfOLX$Area_contruida)

# Same histogram with a title, axis labels and a fill colour
hist(dfOLX$Area_contruida,
     main = "OLX: histograma del area de viviendas",
     xlab = "area", ylab = "frecuencia",
     col = "steelblue")

# Histogram of price; the x axis is labelled with the plotted variable
# (it was previously labelled "area" by copy-paste).
hist(dfOLX$Precio,
     main = "OLX: histograma del Precio",
     xlab = "precio", ylab = "frecuencia",
     col = "steelblue")

# Histogram of stratum; x-axis label corrected in the same way
hist(dfOLX$Estrato,
     main = "OLX: histograma de Estratos",
     xlab = "estrato", ylab = "frecuencia",
     col = "steelblue")

# Scatter plot of price against built area
plot(dfOLX$Area_contruida, dfOLX$Precio)

# Price distribution per zone; using `data =` keeps the axis labels as the
# bare column names instead of "dfOLX$...".
boxplot(Precio ~ Zona, data = dfOLX, col = "gray",
        main = "Precios\nsegún la Zona")

# Scatter plot of price against built area, coloured by stratum.
# (The earlier bare `p <- ggplot(dfOLX)` was overwritten immediately and has
# been removed.)
p <- ggplot(dfOLX, aes(x = Area_contruida, y = Precio, colour = Estrato)) +
  geom_point()
p

# Same relationship with a smoothed trend, one panel per stratum
ggplot(dfOLX, aes(x = Area_contruida, y = Precio)) +
  geom_point() +
  geom_smooth() +
  facet_grid(~ Estrato)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Price against built area with a smoothed trend, one panel per zone
ggplot(data = dfOLX, mapping = aes(x = Area_contruida, y = Precio)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ Zona)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

EXPLORACIÓN DE DATOS CON PYTHON

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

from sklearn.impute import SimpleImputer

#######

# Read the listings from a CSV file (path is relative to the working directory)
df = pd.read_csv('olx_viviendas_cali.csv') 
# Print column names, non-null counts and dtypes
df.info()

## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 5919 entries, 0 to 5918
## Data columns (total 15 columns):
##  #   Column              Non-Null Count  Dtype  
## ---  ------              --------------  -----  
##  0   ID                  5919 non-null   object 
##  1   URL                 5919 non-null   object 
##  2   ciudad              5919 non-null   object 
##  3   Zona                5572 non-null   object 
##  4   Barrio              5919 non-null   object 
##  5   Cordenada_latitud   5919 non-null   float64
##  6   cordenada_longitud  5919 non-null   float64
##  7   Tipo                5919 non-null   object 
##  8   piso                3715 non-null   float64
##  9   Estrato             5898 non-null   float64
##  10  Area_contruida      5919 non-null   float64
##  11  parqueaderos        4395 non-null   float64
##  12  Baños               5919 non-null   int64  
##  13  Habitaciones        5919 non-null   int64  
##  14  Precio              5919 non-null   int64  
## dtypes: float64(6), int64(3), object(6)
## memory usage: 693.8+ KB


### DISPLAY THE DATAFRAME ###
df

### SUMMARY STATISTICS ###

##                     ID  ...      Precio
## 0      1564437042-4552  ...   180000000
## 1      1564437042-4882  ...   140000000
## 2     1564442091-12739  ...   395000000
## 3      1564437042-2992  ...   115000000
## 4      1564437042-4607  ...   130000000
## ...                ...  ...         ...
## 5914    1564437042-634  ...  1400000000
## 5915  1564442091-11889  ...   670000000
## 5916  1564442091-12625  ...  1190000000
## 5917   1564437042-4126  ...  1100000000
## 5918   1564442091-8858  ...   170000000
## 
## [5919 rows x 15 columns]

# Descriptive statistics (count, mean, std, quartiles) of the numeric columns
df.describe()

##        Cordenada_latitud  cordenada_longitud  ...  Habitaciones        Precio
## count        5919.000000         5919.000000  ...   5919.000000  5.919000e+03
## mean            3.446652          -76.526232  ...      3.871093  4.425289e+08
## std             0.026577            0.022520  ...      1.967489  4.001503e+08
## min             3.400000          -76.598892  ...      0.000000  3.000000e+07
## 25%             3.425000          -76.543900  ...      3.000000  1.950000e+08
## 50%             3.449000          -76.529343  ...      3.000000  3.300000e+08
## 75%             3.466862          -76.511402  ...      4.000000  5.300000e+08
## max             3.576000          -76.208855  ...     20.000000  4.500000e+09
## 
## [8 rows x 9 columns]

# Summary of the categorical (object dtype) columns
df.describe(include = ['O'])

##                      ID  ...         Tipo
## count              5919  ...         5919
## unique             5919  ...            2
## top     1564437042-4120  ...  Apartamento
## freq                  1  ...         3391
## 
## [4 rows x 6 columns]

# Number of unique values per column, ignoring nulls
df.nunique()

## ID                    5919
## URL                   5919
## ciudad                   1
## Zona                     5
## Barrio                 540
## Cordenada_latitud     3126
## cordenada_longitud    2674
## Tipo                     2
## piso                    15
## Estrato                  6
## Area_contruida         597
## parqueaderos            10
## Baños                   11
## Habitaciones            20
## Precio                 550
## dtype: int64

# Number of unique values per column, counting null as a value
df.nunique(dropna=False)

## ID                    5919
## URL                   5919
## ciudad                   1
## Zona                     6
## Barrio                 540
## Cordenada_latitud     3126
## cordenada_longitud    2674
## Tipo                     2
## piso                    16
## Estrato                  7
## Area_contruida         597
## parqueaderos            11
## Baños                   11
## Habitaciones            20
## Precio                 550
## dtype: int64

# Number of non-null values per column
df.count()

## ID                    5919
## URL                   5919
## ciudad                5919
## Zona                  5572
## Barrio                5919
## Cordenada_latitud     5919
## cordenada_longitud    5919
## Tipo                  5919
## piso                  3715
## Estrato               5898
## Area_contruida        5919
## parqueaderos          4395
## Baños                 5919
## Habitaciones          5919
## Precio                5919
## dtype: int64

# Number of records per category of the 'Zona' column, including nulls

df['Zona'].value_counts(dropna=False)

## Zona Norte      1940
## Zona Sur        1651
## Zona Oeste      1287
## Zona Oriente     553
## NaN              347
## Zona Centro      141
## Name: Zona, dtype: int64

# Names of the columns in the dataset
df.columns

## Index(['ID', 'URL', 'ciudad', 'Zona', 'Barrio', 'Cordenada_latitud',
##        'cordenada_longitud', 'Tipo', 'piso', 'Estrato', 'Area_contruida',
##        'parqueaderos', 'Baños', 'Habitaciones', 'Precio'],
##       dtype='object')

## HISTOGRAMS

# Histogram of built area with a KDE overlay on a density scale.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
plt.figure(figsize=(6,4))
sns.histplot(df['Area_contruida'], kde=True, stat='density')
plt.title('Area_contruida Histograma_CALI_OLX')
plt.show()

## Count of listings per property type.
# Each plot gets its own figure so that consecutive plots are not drawn on
# top of each other on the same axes.
plt.figure()
sns.histplot(data=df, x="Tipo")
plt.show()

# Histogram of built area (counts)
plt.figure()
sns.histplot(data=df, x="Area_contruida")
plt.show()

# Density curve only; `fill=True` replaces the deprecated `shade=True`
plt.figure(figsize=(6,3))
sns.kdeplot(df['Area_contruida'], fill=True)
plt.title('OLX Densidad area')
plt.xlabel('Area')
plt.show()

### Density plots split by category

# One density panel per property type.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
g = sns.FacetGrid(df, col="Tipo", height=3)
g.map(sns.kdeplot, 'Area_contruida', fill=True)

g.despine(left=True, bottom=True)

# Grid of densities: one column per zone, one row per stratum, coloured by type
g = sns.FacetGrid(df, col='Zona', row='Estrato', hue='Tipo', height=3)
g.map(sns.kdeplot, 'Area_contruida', fill=True).add_legend()

g.despine(left=True, bottom=True)

plt.show()

# Box plot of built area; the series is passed by keyword because seaborn no
# longer accepts it as a positional x argument.
plt.figure(figsize=(6,3))
sns.boxplot(x=df['Area_contruida'])
plt.title('OLX Boxplot del Area')
plt.show()

# Built area by property type
plt.figure(figsize=(6,4))
sns.boxplot(x='Tipo',y='Area_contruida',data=df)
plt.title('OLX Boxplot de los tipos y el area')
plt.show()

# Built area by property type, split by stratum
plt.figure(figsize=(10,4))
sns.boxplot(x='Tipo',y='Area_contruida',hue='Estrato',data=df)
plt.title('OLX Boxplot de Area, estrato y tipo')
plt.show()


# Built area by property type, split by zone (title now names the zone)
plt.figure(figsize=(10,4))
sns.boxplot(x='Tipo',y='Area_contruida',hue='Zona',data=df)
plt.title('OLX Boxplot de Area, zona y tipo')
plt.show()

# Price by property type, split by zone (title now names price and zone)
plt.figure(figsize=(10,4))
sns.boxplot(x='Tipo',y='Precio',hue='Zona',data=df)
plt.title('OLX Boxplot de Precio, zona y tipo')
plt.show()

# Number of listings per property type; the column is passed as `x=` because
# current seaborn releases reject it as a positional argument.
plt.figure(figsize=(6, 4))
sns.countplot(x='Tipo',data=df)
plt.show()

# Bar plot of built area per stratum (seaborn's default estimator)
plt.figure(figsize=(6, 4))
sns.barplot(y='Area_contruida',x='Estrato',data=df)
plt.show()

# Same bar plot, split by property type
plt.figure(figsize=(6, 4))
sns.barplot(y='Area_contruida',x='Estrato',hue='Tipo',data=df)
plt.show()

# Correlation heatmap of the numeric columns, excluding the coordinates.
# Non-numeric columns are dropped explicitly: recent pandas versions raise
# when .corr() meets object columns instead of silently skipping them.
plt.figure()
numeric_df = df.drop(['Cordenada_latitud','cordenada_longitud'],axis=1).select_dtypes(include='number')
# `linewidths` is the documented heatmap keyword for the cell separator width
sns.heatmap(numeric_df.corr(),annot=True, linewidths=0.5,fmt='.1f')
plt.show()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Análisis exploratorio de datos: caso base de datos OLX

ELKIN LEONARDO CANTOR HUERFANO

11/2/2022

ANÁLISIS DE LA BASE DE DATOS OLX

EXPLORACIÓN DE DATOS CON R

EXPLORACIÓN DE DATOS CON PYTHON