# Read the kc_house_data.csv file from the moderndive GitHub repository into a
# data frame and preview its first six rows.
url <- "https://raw.githubusercontent.com/moderndive/moderndive/master/data-raw/kc_house_data.csv"
house_prices <- read.csv(file = url, header = TRUE)
head(house_prices, n = 6L)
## id date price bedrooms bathrooms sqft_living sqft_lot
## 1 7129300520 20141013T000000 221900 3 1.00 1180 5650
## 2 6414100192 20141209T000000 538000 3 2.25 2570 7242
## 3 5631500400 20150225T000000 180000 2 1.00 770 10000
## 4 2487200875 20141209T000000 604000 4 3.00 1960 5000
## 5 1954400510 20150218T000000 510000 3 2.00 1680 8080
## 6 7237550310 20140512T000000 1225000 4 4.50 5420 101930
## floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1 1 0 0 3 7 1180 0 1955
## 2 2 0 0 3 7 2170 400 1951
## 3 1 0 0 3 6 770 0 1933
## 4 1 0 0 5 7 1050 910 1965
## 5 1 0 0 3 8 1680 0 1987
## 6 1 0 0 3 11 3890 1530 2001
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 0 98178 47.5112 -122.257 1340 5650
## 2 1991 98125 47.7210 -122.319 1690 7639
## 3 0 98028 47.7379 -122.233 2720 8062
## 4 0 98136 47.5208 -122.393 1360 5000
## 5 0 98074 47.6168 -122.045 1800 7503
## 6 0 98053 47.6561 -122.005 4760 101930
# NOTE(review): this line originally held the bare text
# "read.xlsx readr cargar dplyr" ("cargar" is Spanish for "load"). It looks
# like a leftover note about data-loading tools; as code it is a parse error
# that stops the whole script, so it is kept here only as a comment.
# 1. What is the mean house price (price)?
mean(house_prices$price)
## [1] 540088.1
# 2. ¿Qué medida se utiliza para describir el valor central cuando hay valores atípicos extremos?
# RTA: La mediana, porque es poco sensible a los valores atípicos extremos.
# 3. Compute the standard deviation of sqft_living (living area). What does
# this value indicate in this context?
with(house_prices, sd(sqft_living))
## [1] 918.4409
# 4. Which of these variables has the greatest dispersion: price, sqft_lot or
# bedrooms? Justify the answer using measures of dispersion.
# The three variables take values of very different magnitude, so their raw
# standard deviations are not directly comparable. The coefficient of variation
# (sd / mean) expresses dispersion relative to each variable's own mean.
dispersion_vars <- house_prices[c("price", "sqft_lot", "bedrooms")]
dispersion_summary <- data.frame(
  sd = vapply(dispersion_vars, sd, numeric(1)),
  cv = vapply(dispersion_vars, function(x) sd(x) / mean(x), numeric(1))
)
dispersion_summary
# The variable with the largest coefficient of variation is the most dispersed
# in relative terms.
rownames(dispersion_summary)[which.max(dispersion_summary$cv)]
# 5. Which variable has more zeros: sqft_basement or yr_renovated?
# Count the entries equal to 0 in each of the two columns; the column with the
# larger count answers the question.
zero_counts <- colSums(
  house_prices[c("sqft_basement", "yr_renovated")] == 0,
  na.rm = TRUE
)
zero_counts
names(zero_counts)[which.max(zero_counts)]

# Missing values are a separate matter from zeros. Count the NAs of every
# column in a single vectorised call instead of one subsetting expression per
# column. The original per-column checks all returned empty vectors
# (numeric(0), integer(0), character(0)), i.e. no column had missing values,
# so every count here is expected to be 0.
colSums(is.na(house_prices))
# 6. Which type of plot would you use to represent the distribution of price?
# Draw the plot. Answer: a histogram is used.
library(ggplot2)
# Density curve of price, filled in semi-transparent green.
ggplot(data = house_prices, mapping = aes(x = price)) +
  geom_density(alpha = 0.6, fill = "green") +
  theme_minimal() +
  labs(
    title = "Distribución de Precios de Viviendas",
    x = "Precio",
    y = "Densidad"
  )
# Histogram of price drawn on the density scale, with a density curve on top.
# after_stat(density) replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0 and made this plot emit a deprecation warning.
ggplot(house_prices, aes(x = price)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "purple",
    color = "blue"
  ) +
  geom_density(color = "yellow", linewidth = 1) +
  labs(
    title = "Distribución de Precios",
    subtitle = "Histograma con curva de densidad",
    x = "Precio",
    y = "Densidad"
  )
# 7. Draw a histogram of the sqft_living variable. Briefly describe the shape
# of the distribution.
hist(house_prices$sqft_living)
# Most homes have between 1000 and 3000 square feet.
# (The last word of this remark used to sit on its own uncommented line, where
# R evaluated it as an undefined symbol and raised an error.)