# Location of the raw King County house sales CSV in the moderndive repository.
url <- "https://raw.githubusercontent.com/moderndive/moderndive/master/data-raw/kc_house_data.csv"

# Download and parse the CSV into a data frame, then preview the first rows.
house_prices <- read.csv(file = url)
head(x = house_prices)
##           id            date   price bedrooms bathrooms sqft_living sqft_lot
## 1 7129300520 20141013T000000  221900        3      1.00        1180     5650
## 2 6414100192 20141209T000000  538000        3      2.25        2570     7242
## 3 5631500400 20150225T000000  180000        2      1.00         770    10000
## 4 2487200875 20141209T000000  604000        4      3.00        1960     5000
## 5 1954400510 20150218T000000  510000        3      2.00        1680     8080
## 6 7237550310 20140512T000000 1225000        4      4.50        5420   101930
##   floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1      1          0    0         3     7       1180             0     1955
## 2      2          0    0         3     7       2170           400     1951
## 3      1          0    0         3     6        770             0     1933
## 4      1          0    0         5     7       1050           910     1965
## 5      1          0    0         3     8       1680             0     1987
## 6      1          0    0         3    11       3890          1530     2001
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1            0   98178 47.5112 -122.257          1340       5650
## 2         1991   98125 47.7210 -122.319          1690       7639
## 3            0   98028 47.7379 -122.233          2720       8062
## 4            0   98136 47.5208 -122.393          1360       5000
## 5            0   98074 47.6168 -122.045          1800       7503
## 6            0   98053 47.6561 -122.005          4760     101930

# Notas: read.xlsx, readr; cargar dplyr.

# Preguntas

#1.¿Cuál es la media del precio de las viviendas (price)?

# Arithmetic mean of the sale price column.
mean(x = house_prices[["price"]])
## [1] 540088.1

#2.¿Qué medida se utiliza para describir el valor central cuando hay valores atípicos extremos?

#.RTA: La mediana.

#3.Calcula la desviación estándar de la variable sqft_living (área habitable). ¿Qué indica este valor en este contexto?

# Sample standard deviation of the living-area column.
sd(x = house_prices[["sqft_living"]])
## [1] 918.4409

#4.¿Cuál de las siguientes variables tiene mayor dispersión: price, sqft_lot, bedrooms? Justifica tu respuesta con base en medidas de dispersión.

#5.¿Qué variable presenta mayor número de ceros: sqft_basement o yr_renovated?: Ninguna variable tiene datos perdidos.

# Number of missing values per column, computed for the whole data frame in
# one vectorised call instead of one subsetting statement per column.
# A count of 0 for a column means it has no NA entries.
colSums(is.na(house_prices))

# Question 5 asks about zeros, not missing values: count how many entries
# equal 0 in each of the two columns so they can be compared directly.
# na.rm = TRUE keeps the counts defined even if a column contained NA.
colSums(house_prices[c("sqft_basement", "yr_renovated")] == 0, na.rm = TRUE)

#6.¿Qué tipo de gráfico usarías para representar la distribución de price? Realizar el gráfico. RTA: Se utiliza un histograma.

library(ggplot2)

# Kernel density estimate of price, drawn as a semi-transparent filled curve.
ggplot(data = house_prices, mapping = aes(x = price)) +
  geom_density(alpha = 0.6, fill = "green") +
  labs(
    title = "Distribución de Precios de Viviendas",
    x = "Precio",
    y = "Densidad"
  ) +
  theme_minimal()

# Histogram of price rescaled to density, with a kernel density curve overlaid
# so that both layers share the same y axis.
ggplot(house_prices, aes(x = price)) +
  # after_stat(density) replaces the `..density..` dot-dot notation, which was
  # deprecated in ggplot2 3.4.0; it maps y to the density computed by the stat.
  geom_histogram(aes(y = after_stat(density)),
                 bins = 30,
                 fill = "purple",
                 color = "blue") +
  geom_density(color = "yellow", linewidth = 1) +
  labs(title = "Distribución de Precios",
       subtitle = "Histograma con curva de densidad",
       x = "Precio",
       y = "Densidad")

#7.Dibuja un histograma para la variable sqft_living. Describe brevemente la forma de la distribución.

# Base-graphics histogram of living area. The `$` expression is kept as
# written because hist() builds its default title and x label from it.
hist(x = house_prices$sqft_living)

#El mayor número de viviendas tiene entre 1000 y 3000 pies cuadrados.