library(tidyverse)
library(tidyr)
library(leaflet)
library(plotly)
# Load the raw California housing data.
# The data directory is kept as a plain path and combined with file.path()
# instead of calling setwd(), so the session's working directory is not
# changed as a side effect. The same file is read as before.
data_dir <- "/Users/kidusteffera/Desktop/DATA110/week 11 "
California <- read.csv(file.path(data_dir, "housing.csv"))

# Project 2
# California House Pricing
# Combine latitude and longitude into a single GeoLocation-style column
# Add a text column `coords` of the form "(latitude, longitude)".
# remove = FALSE keeps the original latitude/longitude columns, which the
# map further down needs as numbers.
Cali_House <- California |>
  unite("coords", latitude, longitude, sep = ", ", remove = FALSE) |>
  mutate(
    coords = paste0("(", coords, ")"),
    # Make sure the coordinates are numeric.
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  )

# Preview the first rows (recorded console output kept as comments).
head(Cali_House)
#>   coords longitude latitude housing_median_age total_rooms
#> 1 (37.88, -122.23) -122.23 37.88 41 880
#> 2 (37.86, -122.22) -122.22 37.86 21 7099
#> 3 (37.85, -122.24) -122.24 37.85 52 1467
#> 4 (37.85, -122.25) -122.25 37.85 52 1274
#> 5 (37.85, -122.25) -122.25 37.85 52 1627
#> 6 (37.85, -122.25) -122.25 37.85 52 919
#>   total_bedrooms population households median_income median_house_value
#> 1 129 322 126 8.3252 452600
#> 2 1106 2401 1138 8.3014 358500
#> 3 190 496 177 7.2574 352100
#> 4 235 558 219 5.6431 341300
#> 5 280 565 259 3.8462 342200
#> 6 213 413 193 4.0368 269700
#>   ocean_proximity
#> 1 NEAR BAY
#> 2 NEAR BAY
#> 3 NEAR BAY
#> 4 NEAR BAY
#> 5 NEAR BAY
#> 6 NEAR BAY
# Filter the dataset by ocean proximity
# Keep only the "NEAR BAY" rows, then drop rows that are missing a
# coordinate or the median house value (all three are needed for the map).
Cali_House_Clean <- Cali_House |>
  filter(ocean_proximity == "NEAR BAY") |>
  filter(!is.na(latitude), !is.na(longitude), !is.na(median_house_value))

# Number of rows left after filtering.
nrow(Cali_House_Clean)
#> [1] 2290

# Preview the filtered data (recorded console output kept as comments).
head(Cali_House_Clean)
#>   coords longitude latitude housing_median_age total_rooms
#> 1 (37.88, -122.23) -122.23 37.88 41 880
#> 2 (37.86, -122.22) -122.22 37.86 21 7099
#> 3 (37.85, -122.24) -122.24 37.85 52 1467
#> 4 (37.85, -122.25) -122.25 37.85 52 1274
#> 5 (37.85, -122.25) -122.25 37.85 52 1627
#> 6 (37.85, -122.25) -122.25 37.85 52 919
#>   total_bedrooms population households median_income median_house_value
#> 1 129 322 126 8.3252 452600
#> 2 1106 2401 1138 8.3014 358500
#> 3 190 496 177 7.2574 352100
#> 4 235 558 219 5.6431 341300
#> 5 280 565 259 3.8462 342200
#> 6 213 413 193 4.0368 269700
#>   ocean_proximity
#> 1 NEAR BAY
#> 2 NEAR BAY
#> 3 NEAR BAY
#> 4 NEAR BAY
#> 5 NEAR BAY
#> 6 NEAR BAY
# List the column names to decide which columns to drop next
# (recorded console output kept as comments).
names(Cali_House_Clean)
#>  [1] "coords" "longitude" "latitude"
#>  [4] "housing_median_age" "total_rooms" "total_bedrooms"
#>  [7] "population" "households" "median_income"
#> [10] "median_house_value" "ocean_proximity"
# Drop the room, bedroom and household counts, which are not used in the
# chart or the map below.
Cali_House_Clean2 <- Cali_House_Clean |>
  select(-total_rooms, -total_bedrooms, -households)

# Preview the reduced data (recorded console output kept as comments).
head(Cali_House_Clean2)
#>   coords longitude latitude housing_median_age population
#> 1 (37.88, -122.23) -122.23 37.88 41 322
#> 2 (37.86, -122.22) -122.22 37.86 21 2401
#> 3 (37.85, -122.24) -122.24 37.85 52 496
#> 4 (37.85, -122.25) -122.25 37.85 52 558
#> 5 (37.85, -122.25) -122.25 37.85 52 565
#> 6 (37.85, -122.25) -122.25 37.85 52 413
#>   median_income median_house_value ocean_proximity
#> 1 8.3252 452600 NEAR BAY
#> 2 8.3014 358500 NEAR BAY
#> 3 7.2574 352100 NEAR BAY
#> 4 5.6431 341300 NEAR BAY
#> 5 3.8462 342200 NEAR BAY
#> 6 4.0368 269700 NEAR BAY
# Restrict to rows whose housing median age is at least 35 years.
bay_established <- Cali_House_Clean2 |>
  filter(housing_median_age >= 35)

# Number of rows in the final analysis subset.
nrow(bay_established)
#> [1] 1450

# Preview the final subset (recorded console output kept as comments).
head(bay_established)
#>   coords longitude latitude housing_median_age population
#> 1 (37.88, -122.23) -122.23 37.88 41 322
#> 2 (37.85, -122.24) -122.24 37.85 52 496
#> 3 (37.85, -122.25) -122.25 37.85 52 558
#> 4 (37.85, -122.25) -122.25 37.85 52 565
#> 5 (37.85, -122.25) -122.25 37.85 52 413
#> 6 (37.84, -122.25) -122.25 37.84 52 1094
#>   median_income median_house_value ocean_proximity
#> 1 8.3252 452600 NEAR BAY
#> 2 7.2574 352100 NEAR BAY
#> 3 5.6431 341300 NEAR BAY
#> 4 3.8462 342200 NEAR BAY
#> 5 4.0368 269700 NEAR BAY
#> 6 3.6591 299200 NEAR BAY
# One-row summary of the analysis subset: row count plus the medians of
# house value, income and housing age
# (recorded console output kept as comments).
bay_established |>
  summarise(
    n = n(),
    median_value = median(median_house_value),
    median_income = median(median_income),
    median_age = median(housing_median_age)
  )
#>   n median_value median_income median_age
#> 1 1450 238650 3.5833 48
# Interactive scatter plot of median house value against median income,
# with points coloured by housing median age.
#
# Hover text: dollar amounts are formatted with trim = TRUE and
# scientific = FALSE. Without trim, format() pads every value in the column
# to a common width, which shows up as "$ 85,000" in the tooltip;
# scientific = FALSE guards against output such as "1e+05".
p <- plot_ly(
  data = bay_established,
  x = ~median_income,
  y = ~median_house_value,
  type = "scatter",
  mode = "markers",
  marker = list(
    size = 8,
    color = ~housing_median_age,
    colorscale = "YlOrRd",
    showscale = TRUE,
    colorbar = list(title = "Housing<br>Median Age"),
    line = list(width = 0.5, color = "white"),
    opacity = 0.7
  ),
  text = ~paste0(
    "Median Income: $",
    # The x-axis is labelled in tens of thousands of USD, hence * 10000.
    format(
      round(median_income * 10000),
      big.mark = ",", trim = TRUE, scientific = FALSE
    ),
    "<br>",
    "Median House Value: $",
    format(
      median_house_value,
      big.mark = ",", trim = TRUE, scientific = FALSE
    ),
    "<br>",
    "Housing Median Age: ", housing_median_age, " years<br>",
    "Population: ", population
  ),
  # Show only the custom text above, not plotly's default x/y read-out.
  hoverinfo = "text"
) |>
  layout(
    title = list(
      text = paste0(
        "Median House Value vs. Median Income<br>",
        "<sub>NEAR BAY block groups, housing age ≥ 35 years</sub>"
      )
    ),
    xaxis = list(title = "Median Income (tens of thousands of USD)"),
    yaxis = list(title = "Median House Value (USD)")
  )
# Display the interactive scatter plot built above.
p

# Colour palette shared by both maps: maps median house value onto the
# "YlOrRd" scale. It must be named `pal`, because the map code below
# refers to it by that name.
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = bay_established$median_house_value
)

# Map 1: basic map on the default tiles. Circle radius and colour both
# encode the median house value (radius = value / 100000).
leaflet(bay_established) |>
  addTiles() |>
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = ~median_house_value / 100000,
    color = ~pal(median_house_value),
    fillOpacity = 0.7,
    stroke = FALSE
  ) |>
  addLegend(
    "bottomright",
    pal = pal,
    values = ~median_house_value,
    title = "Median House Value ($)",
    opacity = 1
  )

# Map 2: same encoding on the CartoDB Positron basemap, with an outline
# on each circle, a popup on click and a label on hover.
# Dollar amounts use trim = TRUE so format() does not pad values to a
# common width ("$ 85,000"), and scientific = FALSE to avoid "1e+05".
leaflet(bay_established) |>
  addProviderTiles(providers$CartoDB.Positron) |>
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = ~median_house_value / 100000,
    color = ~pal(median_house_value),
    fillOpacity = 0.75,
    stroke = TRUE,
    weight = 1,
    popup = ~paste0(
      "<b>Median House Value:</b> $",
      format(
        median_house_value,
        big.mark = ",", trim = TRUE, scientific = FALSE
      ),
      "<br/>",
      "<b>Median Income:</b> $",
      format(
        round(median_income * 10000),
        big.mark = ",", trim = TRUE, scientific = FALSE
      ),
      "<br/>",
      "<b>Housing Median Age:</b> ", housing_median_age, " years<br/>",
      "<b>Population:</b> ", population
    ),
    label = ~paste0(
      "$",
      format(
        median_house_value,
        big.mark = ",", trim = TRUE, scientific = FALSE
      )
    )
  ) |>
  addLegend(
    "topright",
    pal = pal,
    values = ~median_house_value,
    title = "Median House Value ($)",
    opacity = 1
  ) |>
  setView(lng = -122.27, lat = 37.80, zoom = 9)

# 5. Write a paragraph
# This analysis focused on the NEAR BAY category of the dataset, which covers
# the block groups in the area around the Bay. Only block groups with an
# ocean_proximity of "NEAR BAY" and a housing median age of at least 35 years
# were included. The resulting subset contains 1,450 block groups; note that
# this is above, not below, the 900-observation limit for the problem,
# although the subset still represents a variety of block groups around the
# Bay.
# The Plotly chart shows the relationship between median income and median
# house value for the NEAR BAY block groups of the San Francisco Bay Area. In
# general, the higher the median income of a block group, the higher the
# median value of its houses. However, the relationship tends to flatten out
# for block groups with high median house values (around $500,000). The color
# of each point represents the median age of the housing in that block group,
# and block groups with high median house values also tend to have some of
# the oldest housing in the Bay Area. Hovering over a point reveals the block
# group's median income, median house value, housing median age, and
# population.
# The leaflet map of California complements the Plotly chart. Each circle on
# the map represents one block group in the filtered data; the size and color
# of each circle represent the median house value of that block group.
# Clicking on a circle reveals the block group's median income, median house
# value, housing median age, and population. The map again shows that the
# highest-valued houses are along the western shore of the Bay Area (such as
# in San Francisco and the Peninsula), while median house values are lower
# around the remainder of the Bay Area.