#En este trabajo autónomo, vamos a recibir un conjunto de datos provenientes de una encuesta de salarios a gerentes que, por su libertad para contestar, requiere ser revisada y corregida antes de analizarla.

#Se instalan y cargan las librerías necesarias para la visualización y limpieza del dataset

# Packages this script depends on, listed in the order they are attached.
paquetes <- c("shiny", "DT", "dplyr", "tidyr")

# Install only what is missing, and name a CRAN mirror explicitly: calling
# install.packages() without `repos` in a non-interactive session fails with
# "trying to use CRAN without setting a mirror".
faltantes <- paquetes[
  !vapply(paquetes, requireNamespace, logical(1), quietly = TRUE)
]
if (length(faltantes) > 0) {
  install.packages(faltantes, repos = "https://cloud.r-project.org")
}

# Attach order matters: DT is attached after shiny, so DT's dataTableOutput()
# and renderDataTable() mask shiny's versions. dplyr likewise masks
# stats::filter/lag and base::intersect/setdiff/setequal/union.
library(shiny)
library(DT)
library(dplyr)
library(tidyr)

2.2. Recolección de datos.

Se importa la base a ser analizada: archivo "4. Examen-A-listings.csv"

# Load the raw listings CSV into a data frame, keeping text columns as plain
# character vectors instead of factors.
listings <- read.csv(
  file = "C:/Users/Usuario/Downloads/4. Examen-A-listings.csv",
  stringsAsFactors = FALSE
)

2.3. Calidad de los datos.

Exploración inicial

# View(listings)
colnames(listings) # Variable names
##  [1] "id"                                          
##  [2] "listing_url"                                 
##  [3] "scrape_id"                                   
##  [4] "last_scraped"                                
##  [5] "source"                                      
##  [6] "name"                                        
##  [7] "description"                                 
##  [8] "neighborhood_overview"                       
##  [9] "picture_url"                                 
## [10] "host_id"                                     
## [11] "host_url"                                    
## [12] "host_name"                                   
## [13] "host_since"                                  
## [14] "host_location"                               
## [15] "host_about"                                  
## [16] "host_response_time"                          
## [17] "host_response_rate"                          
## [18] "host_acceptance_rate"                        
## [19] "host_is_superhost"                           
## [20] "host_thumbnail_url"                          
## [21] "host_picture_url"                            
## [22] "host_neighbourhood"                          
## [23] "host_listings_count"                         
## [24] "host_total_listings_count"                   
## [25] "host_verifications"                          
## [26] "host_has_profile_pic"                        
## [27] "host_identity_verified"                      
## [28] "neighbourhood"                               
## [29] "neighbourhood_cleansed"                      
## [30] "neighbourhood_group_cleansed"                
## [31] "latitude"                                    
## [32] "longitude"                                   
## [33] "property_type"                               
## [34] "room_type"                                   
## [35] "accommodates"                                
## [36] "bathrooms"                                   
## [37] "bathrooms_text"                              
## [38] "bedrooms"                                    
## [39] "beds"                                        
## [40] "amenities"                                   
## [41] "price"                                       
## [42] "minimum_nights"                              
## [43] "maximum_nights"                              
## [44] "minimum_minimum_nights"                      
## [45] "maximum_minimum_nights"                      
## [46] "minimum_maximum_nights"                      
## [47] "maximum_maximum_nights"                      
## [48] "minimum_nights_avg_ntm"                      
## [49] "maximum_nights_avg_ntm"                      
## [50] "calendar_updated"                            
## [51] "has_availability"                            
## [52] "availability_30"                             
## [53] "availability_60"                             
## [54] "availability_90"                             
## [55] "availability_365"                            
## [56] "calendar_last_scraped"                       
## [57] "number_of_reviews"                           
## [58] "number_of_reviews_ltm"                       
## [59] "number_of_reviews_l30d"                      
## [60] "first_review"                                
## [61] "last_review"                                 
## [62] "review_scores_rating"                        
## [63] "review_scores_accuracy"                      
## [64] "review_scores_cleanliness"                   
## [65] "review_scores_checkin"                       
## [66] "review_scores_communication"                 
## [67] "review_scores_location"                      
## [68] "review_scores_value"                         
## [69] "license"                                     
## [70] "instant_bookable"                            
## [71] "calculated_host_listings_count"              
## [72] "calculated_host_listings_count_entire_homes" 
## [73] "calculated_host_listings_count_private_rooms"
## [74] "calculated_host_listings_count_shared_rooms" 
## [75] "reviews_per_month"
# summary(listings) # Data summary
nrow(listings) # Number of observations
## [1] 74329
ncol(listings) # Number of variables
## [1] 75
# str(listings)
# class(listings)
# sapply(listings, class)
# datatable(listings) # Interactive view of the data set
# Count columns by their primary class. vapply() with class(col)[1] always
# yields exactly one string per column, so table() keeps working even if a
# column carries more than one class.
table(vapply(listings, function(col) class(col)[1], character(1))) # Variable types
## 
## character   integer   logical   numeric 
##        34        23         4        14

Se detectan 74329 observaciones y 75 variables.

Los datos se muestran en Inglés.

De las 75 variables, 34 son de tipo “character”, 23 son de tipo “integer”, 4 son de tipo “logical” y 14 son de tipo “numeric”

2.3. Calidad de los datos: se verifica la validez, precisión, completitud, consistencia y uniformidad de los datos con una tabla resumen.

# Per-column data-quality summary of `listings`: class, total length, number
# of distinct values, whitespace-only values, empty strings and NA count.
resumen <- data.frame(
  Variable = names(listings),
  # First class only, so every column contributes exactly one string.
  tipo = vapply(listings, function(x) class(x)[1], character(1)),
  Total_valores = vapply(listings, length, integer(1)),
  Valores_únicos = vapply(
    listings,
    function(x) length(unique(x)),
    integer(1)
  ),
  # na.rm = TRUE: missing entries turn the comparison into NA, which would
  # otherwise make the whole count NA for any column containing missing data.
  Valores_blancos = vapply(
    listings,
    function(x) sum(nchar(trimws(x)) == 0, na.rm = TRUE),
    integer(1)
  ),
  Valores_vacios = vapply(
    listings,
    function(x) sum(x == "", na.rm = TRUE),
    integer(1)
  ),
  Valores_NA = vapply(listings, function(x) sum(is.na(x)), integer(1))
)
# resumen
# str(resumen)
# View(resumen)
datatable(resumen) # Interactive view of the summary table

2.4. El proceso de limpieza.

Eliminar filas duplicadas

# Keep a single copy of each fully identical row.
listings <- distinct(listings)

Eliminar columnas innecesarias

# Drop columns that hold no data at all (every value is NA).
# vapply() guarantees a logical mask; drop = FALSE keeps the result a data
# frame even if only a single column survives the filter.
columnas_vacias <- vapply(listings, function(x) all(is.na(x)), logical(1))
listings <- listings[, !columnas_vacias, drop = FALSE]

Cambiar formato de variables de tipo Fecha

# Parse each date column with as.Date() and immediately re-express it as
# DAY/MONTH/YEAR text. Note the columns end up as character, not Date.

# last_scraped
listings[["last_scraped"]] <- format(
  as.Date(listings[["last_scraped"]]),
  "%d/%m/%Y"
)

# host_since
listings[["host_since"]] <- format(
  as.Date(listings[["host_since"]]),
  "%d/%m/%Y"
)

Rellenar valores NA

# Replace missing numeric values with the mean of the non-missing values in
# the same column.
listings <- listings %>%
  mutate(across(
    where(is.numeric),
    ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))
  ))

# Fill missing dates with a fixed placeholder date.
# The date columns were already converted to "DD/MM/YYYY" text earlier in the
# script, so the placeholder must be text in that same pattern: assigning a
# Date object into a character vector would insert its internal day count
# rather than a formatted date.
fecha_relleno <- format(as.Date("2024-12-31"), "%d/%m/%Y")
listings$host_since[is.na(listings$host_since)] <- fecha_relleno
listings$last_scraped[is.na(listings$last_scraped)] <- fecha_relleno

Rellenar valores vacíos de texto

# In every character column, substitute the placeholder "No Name" for
# entries that are either missing (NA) or the empty string.
listings <- mutate(
  listings,
  across(where(is.character), function(texto) {
    sin_dato <- texto == "" | is.na(texto)
    texto[sin_dato] <- "No Name"
    texto
  })
)

Verificar Resumen

# Rebuild the per-column data-quality summary after cleaning, to confirm the
# effect of the previous steps: class, total length, number of distinct
# values, whitespace-only values, empty strings and NA count.
resumen <- data.frame(
  Variable = names(listings),
  # First class only, so every column contributes exactly one string.
  tipo = vapply(listings, function(x) class(x)[1], character(1)),
  Total_valores = vapply(listings, length, integer(1)),
  Valores_únicos = vapply(
    listings,
    function(x) length(unique(x)),
    integer(1)
  ),
  # na.rm = TRUE: any value still missing would otherwise turn the whole
  # count for that column into NA.
  Valores_blancos = vapply(
    listings,
    function(x) sum(nchar(trimws(x)) == 0, na.rm = TRUE),
    integer(1)
  ),
  Valores_vacios = vapply(
    listings,
    function(x) sum(x == "", na.rm = TRUE),
    integer(1)
  ),
  Valores_NA = vapply(listings, function(x) sum(is.na(x)), integer(1))
)
datatable(resumen)
# View(resumen)

Exportar data limpia

Se exporta en formato CSV el documento Anexo_A luego de la limpieza.

# Write the cleaned data frame to disk without the row-name column.
write.csv(
  x = listings,
  file = "listings_limpio.csv",
  row.names = FALSE
)