# En este trabajo autónomo, vamos a recibir un conjunto de datos provenientes de una encuesta de salarios a gerentes que, por su libertad para contestar, requiere ser revisada y corregida antes de analizarla.
# Se instalan las librerías necesarias para la visualización y limpieza del dataset
install.packages("shiny")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(shiny)
install.packages("DT")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(DT)
##
## Adjuntando el paquete: 'DT'
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
install.packages("dplyr")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
install.packages("tidyr")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(tidyr)
Se importa la base a ser analizada (Anexo_A): archivo "4. Examen-A-listings.csv"
# Load the raw listings export. Text columns are kept as character vectors
# (stringsAsFactors = FALSE) so they can be cleaned as plain strings later.
ruta_listings <- "C:/Users/Usuario/Downloads/4. Examen-A-listings.csv"
if (!file.exists(ruta_listings)) {
  # Fail early with the offending path instead of a generic connection error.
  stop("Input file not found: ", ruta_listings, call. = FALSE)
}
listings <- read.csv(ruta_listings, stringsAsFactors = FALSE)
Exploración inicial
# View(listings)
colnames(listings) # Variable (column) names
## [1] "id"
## [2] "listing_url"
## [3] "scrape_id"
## [4] "last_scraped"
## [5] "source"
## [6] "name"
## [7] "description"
## [8] "neighborhood_overview"
## [9] "picture_url"
## [10] "host_id"
## [11] "host_url"
## [12] "host_name"
## [13] "host_since"
## [14] "host_location"
## [15] "host_about"
## [16] "host_response_time"
## [17] "host_response_rate"
## [18] "host_acceptance_rate"
## [19] "host_is_superhost"
## [20] "host_thumbnail_url"
## [21] "host_picture_url"
## [22] "host_neighbourhood"
## [23] "host_listings_count"
## [24] "host_total_listings_count"
## [25] "host_verifications"
## [26] "host_has_profile_pic"
## [27] "host_identity_verified"
## [28] "neighbourhood"
## [29] "neighbourhood_cleansed"
## [30] "neighbourhood_group_cleansed"
## [31] "latitude"
## [32] "longitude"
## [33] "property_type"
## [34] "room_type"
## [35] "accommodates"
## [36] "bathrooms"
## [37] "bathrooms_text"
## [38] "bedrooms"
## [39] "beds"
## [40] "amenities"
## [41] "price"
## [42] "minimum_nights"
## [43] "maximum_nights"
## [44] "minimum_minimum_nights"
## [45] "maximum_minimum_nights"
## [46] "minimum_maximum_nights"
## [47] "maximum_maximum_nights"
## [48] "minimum_nights_avg_ntm"
## [49] "maximum_nights_avg_ntm"
## [50] "calendar_updated"
## [51] "has_availability"
## [52] "availability_30"
## [53] "availability_60"
## [54] "availability_90"
## [55] "availability_365"
## [56] "calendar_last_scraped"
## [57] "number_of_reviews"
## [58] "number_of_reviews_ltm"
## [59] "number_of_reviews_l30d"
## [60] "first_review"
## [61] "last_review"
## [62] "review_scores_rating"
## [63] "review_scores_accuracy"
## [64] "review_scores_cleanliness"
## [65] "review_scores_checkin"
## [66] "review_scores_communication"
## [67] "review_scores_location"
## [68] "review_scores_value"
## [69] "license"
## [70] "instant_bookable"
## [71] "calculated_host_listings_count"
## [72] "calculated_host_listings_count_entire_homes"
## [73] "calculated_host_listings_count_private_rooms"
## [74] "calculated_host_listings_count_shared_rooms"
## [75] "reviews_per_month"
# summary(listings) # Data summary
dim(listings)[1L] # Number of observations (rows)
## [1] 74329
dim(listings)[2L] # Number of variables (columns)
## [1] 75
# str(listings)
# class(listings)
# sapply(listings, class)
# datatable(listings) # Interactive view of the raw data (Anexo_A)
# Count how many columns there are of each type. Only the first class of each
# column is used, so a multi-class column still contributes exactly one label
# and the result is always a plain character vector that table() can tally.
table(vapply(listings, function(x) class(x)[1L], character(1)))
##
## character integer logical numeric
## 34 23 4 14
Se detectan 74329 observaciones y 75 variables.
Los datos se muestran en inglés.
De las 75 variables, 34 son de tipo “character”, 23 son de tipo “integer”, 4 son de tipo “logical” y 14 son de tipo “numeric”.
2.3. Calidad de los datos
Se verifica la validez, precisión, completitud, consistencia y uniformidad de los datos con una tabla resumen.
# Build a per-column data-quality summary of `listings`: class, number of
# values, number of distinct values, blank strings, empty strings and NAs.
# Every counter is computed so that NA entries cannot turn the count itself
# into NA (missing values are reported separately in Valores_NA).
resumen <- data.frame(
  Variable = names(listings),
  # First class only, so each column yields exactly one label.
  tipo = vapply(listings, function(x) class(x)[1L], character(1)),
  Total_valores = vapply(listings, length, integer(1)),
  Valores_únicos = vapply(
    listings,
    function(x) length(unique(x)),
    integer(1)
  ),
  # Values that are empty once surrounding whitespace is trimmed.
  # nzchar() reports NA as non-empty, so NA entries are not counted here.
  Valores_blancos = vapply(
    listings,
    function(x) sum(!nzchar(trimws(as.character(x)))),
    integer(1)
  ),
  # Values that are exactly the empty string; NA comparisons are dropped.
  Valores_vacios = vapply(
    listings,
    function(x) sum(as.character(x) == "", na.rm = TRUE),
    integer(1)
  ),
  Valores_NA = vapply(listings, function(x) sum(is.na(x)), integer(1))
)
# resumen
# str(resumen)
# View(resumen)
datatable(resumen) # Interactive view of the summary table
Eliminar filas duplicadas
# Keep a single copy of each fully identical row.
listings <- distinct(listings)
Eliminar columnas innecesarias (aquellas cuyos valores son todos NA)
# Drop every column that contains nothing but NA.
# vapply() guarantees a logical mask even for a zero-column input, and
# drop = FALSE keeps the result a data frame when only one column survives.
columnas_con_datos <- vapply(
  listings,
  function(x) !all(is.na(x)),
  logical(1)
)
listings <- listings[, columnas_con_datos, drop = FALSE]
Cambiar formato de variables de tipo Fecha
# Reformat the two date columns (last_scraped, host_since): parse the raw
# text as Date, then render it back as a day/month/year string. After this
# step both columns hold character values, not Date objects.
columnas_fecha <- c("last_scraped", "host_since")
listings[columnas_fecha] <- lapply(
  listings[columnas_fecha],
  function(fecha) format(as.Date(fecha), "%d/%m/%Y")
)
Rellenar valores NA
# Fill missing numeric values with the mean of their column.
# NOTE(review): this applies to every numeric column, including
# identifier-like ones (e.g. id, host_id), and may turn integer columns into
# doubles when a non-integer mean is inserted. Confirm this is intended.
listings <- listings %>%
  mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))

# Fill missing dates with a fixed placeholder date.
# The date columns were already rendered as "%d/%m/%Y" text, so the
# placeholder must be a string in that same format. Assigning a Date object
# into a character vector would store its underlying day count ("20088")
# instead of a readable date.
fecha_relleno <- format(as.Date("2024-12-31"), "%d/%m/%Y")
listings$host_since[is.na(listings$host_since)] <- fecha_relleno
listings$last_scraped[is.na(listings$last_scraped)] <- fecha_relleno
Rellenar valores vacíos de texto
# Replace empty or missing text with the "No Name" placeholder in every
# character column.
listings <- mutate(
  listings,
  across(
    where(is.character),
    function(texto) {
      texto[is.na(texto) | texto == ""] <- "No Name"
      texto
    }
  )
)
Verificar Resumen
# Rebuild the per-column data-quality summary after cleaning, to verify the
# result: class, number of values, distinct values, blank strings, empty
# strings and NAs. Every counter is computed so that remaining NA entries
# cannot turn the count itself into NA.
resumen <- data.frame(
  Variable = names(listings),
  # First class only, so each column yields exactly one label.
  tipo = vapply(listings, function(x) class(x)[1L], character(1)),
  Total_valores = vapply(listings, length, integer(1)),
  Valores_únicos = vapply(
    listings,
    function(x) length(unique(x)),
    integer(1)
  ),
  # Values that are empty once surrounding whitespace is trimmed.
  # nzchar() reports NA as non-empty, so NA entries are not counted here.
  Valores_blancos = vapply(
    listings,
    function(x) sum(!nzchar(trimws(as.character(x)))),
    integer(1)
  ),
  # Values that are exactly the empty string; NA comparisons are dropped.
  Valores_vacios = vapply(
    listings,
    function(x) sum(as.character(x) == "", na.rm = TRUE),
    integer(1)
  ),
  Valores_NA = vapply(listings, function(x) sum(is.na(x)), integer(1))
)
datatable(resumen) # Interactive view of the post-cleaning summary
# View(resumen)
Se exporta a CSV el conjunto de datos Anexo_A luego de la limpieza.
# Export the cleaned data frame to CSV, omitting the row-name column.
write.csv(x = listings, file = "listings_limpio.csv", row.names = FALSE)