Project 2

Author

K Bedassa

California House Pricing

library(tidyverse)
library(tidyr)
library(leaflet)
library(plotly)
# Read the housing data by its full path rather than calling setwd(), so the
# session's working directory is not changed as a side effect of loading data.
# The directory name "week 11 " ends with a space, exactly as in the original
# path; file.path() joins it to the file name with "/".
California <- read.csv(
  file.path("/Users/kidusteffera/Desktop/DATA110/week 11 ", "housing.csv")
)

Combine latitude and longitude into a single GeoLocation-style column

# Add a text column `coords` of the form "(latitude, longitude)" in front of
# the longitude column, keeping the original latitude/longitude columns and
# coercing them to numeric.
Cali_House <- California |>
  mutate(
    coords = paste0("(", latitude, ", ", longitude, ")"),
    .before = longitude
  ) |>
  mutate(
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  )
# Preview the first rows to check the new column.
head(Cali_House)
            coords longitude latitude housing_median_age total_rooms
1 (37.88, -122.23)   -122.23    37.88                 41         880
2 (37.86, -122.22)   -122.22    37.86                 21        7099
3 (37.85, -122.24)   -122.24    37.85                 52        1467
4 (37.85, -122.25)   -122.25    37.85                 52        1274
5 (37.85, -122.25)   -122.25    37.85                 52        1627
6 (37.85, -122.25)   -122.25    37.85                 52         919
  total_bedrooms population households median_income median_house_value
1            129        322        126        8.3252             452600
2           1106       2401       1138        8.3014             358500
3            190        496        177        7.2574             352100
4            235        558        219        5.6431             341300
5            280        565        259        3.8462             342200
6            213        413        193        4.0368             269700
  ocean_proximity
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
5        NEAR BAY
6        NEAR BAY

Filter the dataset by ocean proximity

# Keep only rows whose ocean_proximity is "NEAR BAY" and that have no missing
# latitude, longitude, or median_house_value.
Cali_House_Clean <- Cali_House |>
  filter(
    ocean_proximity == "NEAR BAY",
    !(is.na(latitude) | is.na(longitude) | is.na(median_house_value))
  )
# Number of rows remaining after the filter.
nrow(Cali_House_Clean)
[1] 2290
# Preview the first rows of the filtered NEAR BAY data.
head(Cali_House_Clean)
            coords longitude latitude housing_median_age total_rooms
1 (37.88, -122.23)   -122.23    37.88                 41         880
2 (37.86, -122.22)   -122.22    37.86                 21        7099
3 (37.85, -122.24)   -122.24    37.85                 52        1467
4 (37.85, -122.25)   -122.25    37.85                 52        1274
5 (37.85, -122.25)   -122.25    37.85                 52        1627
6 (37.85, -122.25)   -122.25    37.85                 52         919
  total_bedrooms population households median_income median_house_value
1            129        322        126        8.3252             452600
2           1106       2401       1138        8.3014             358500
3            190        496        177        7.2574             352100
4            235        558        219        5.6431             341300
5            280        565        259        3.8462             342200
6            213        413        193        4.0368             269700
  ocean_proximity
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
5        NEAR BAY
6        NEAR BAY
# List the column names to decide which columns to drop next.
names(Cali_House_Clean)
 [1] "coords"             "longitude"          "latitude"          
 [4] "housing_median_age" "total_rooms"        "total_bedrooms"    
 [7] "population"         "households"         "median_income"     
[10] "median_house_value" "ocean_proximity"   
# Drop the room, bedroom, and household count columns; every other column is
# kept in its existing order.
Cali_House_Clean2 <- Cali_House_Clean |>
  select(!c(total_rooms, total_bedrooms, households))
# Preview the reduced set of columns.
head(Cali_House_Clean2)
            coords longitude latitude housing_median_age population
1 (37.88, -122.23)   -122.23    37.88                 41        322
2 (37.86, -122.22)   -122.22    37.86                 21       2401
3 (37.85, -122.24)   -122.24    37.85                 52        496
4 (37.85, -122.25)   -122.25    37.85                 52        558
5 (37.85, -122.25)   -122.25    37.85                 52        565
6 (37.85, -122.25)   -122.25    37.85                 52        413
  median_income median_house_value ocean_proximity
1        8.3252             452600        NEAR BAY
2        8.3014             358500        NEAR BAY
3        7.2574             352100        NEAR BAY
4        5.6431             341300        NEAR BAY
5        3.8462             342200        NEAR BAY
6        4.0368             269700        NEAR BAY
# Restrict to block groups whose housing median age is 35 years or more.
bay_established <- filter(Cali_House_Clean2, housing_median_age >= 35)

# Number of block groups that pass the age filter.
nrow(bay_established)
[1] 1450
# Preview the first rows of the age-filtered subset.
head(bay_established)
            coords longitude latitude housing_median_age population
1 (37.88, -122.23)   -122.23    37.88                 41        322
2 (37.85, -122.24)   -122.24    37.85                 52        496
3 (37.85, -122.25)   -122.25    37.85                 52        558
4 (37.85, -122.25)   -122.25    37.85                 52        565
5 (37.85, -122.25)   -122.25    37.85                 52        413
6 (37.84, -122.25)   -122.25    37.84                 52       1094
  median_income median_house_value ocean_proximity
1        8.3252             452600        NEAR BAY
2        7.2574             352100        NEAR BAY
3        5.6431             341300        NEAR BAY
4        3.8462             342200        NEAR BAY
5        4.0368             269700        NEAR BAY
6        3.6591             299200        NEAR BAY
# One-row summary of the subset: row count plus the medians of house value,
# income, and housing age.
summarise(
  bay_established,
  n = n(),
  median_value = median(median_house_value),
  median_income = median(median_income),
  median_age = median(housing_median_age)
)
     n median_value median_income median_age
1 1450       238650        3.5833         48
# Interactive scatter plot of median house value against median income for
# the rows of `bay_established`. Marker color encodes housing median age and
# the hover text lists income, house value, housing age, and population.
p <- plot_ly(
  data = bay_established,
  x = ~median_income,
  y = ~median_house_value,
  type = "scatter",
  mode = "markers",
  marker = list(
    size = 8,
    color = ~housing_median_age,
    colorscale = "YlOrRd",
    showscale = TRUE,
    colorbar = list(title = "Housing<br>Median Age"),
    line = list(width = 0.5, color = "white"),
    opacity = 0.7
  ),
  # format() pads every element of a vector to a common width by default,
  # which would render shorter values as "$ 94,600". trim = TRUE removes that
  # padding and scientific = FALSE keeps plain digits. median_income is
  # multiplied by 10000 because the x axis is in tens of thousands of USD.
  text = ~paste0(
    "Median Income: $",
    format(
      round(median_income * 10000),
      big.mark = ",", trim = TRUE, scientific = FALSE
    ),
    "<br>",
    "Median House Value: $",
    format(
      median_house_value,
      big.mark = ",", trim = TRUE, scientific = FALSE
    ),
    "<br>",
    "Housing Median Age: ", housing_median_age, " years<br>",
    "Population: ", population
  ),
  hoverinfo = "text"
) |>
  layout(
    title = list(text = "Median House Value vs. Median Income<br><sub>NEAR BAY block groups, housing age ≥ 35 years</sub>"),
    xaxis = list(title = "Median Income (tens of thousands of USD)"),
    yaxis = list(title = "Median House Value (USD)")
  )

# Print the widget so it renders in the output.
p
# Continuous yellow-orange-red color scale over the median house values in
# `bay_established`; used for both the circle colors and the legend.
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = bay_established[["median_house_value"]]
)

# Basic map on the default tiles: one borderless circle per row, with radius
# and color both driven by median house value.
leaflet(data = bay_established) |>
  addTiles() |>
  addCircleMarkers(
    lat = ~latitude,
    lng = ~longitude,
    radius = ~median_house_value / 100000,
    stroke = FALSE,
    color = ~pal(median_house_value),
    fillOpacity = 0.7
  ) |>
  addLegend(
    position = "bottomright",
    pal = pal,
    values = ~median_house_value,
    title = "Median House Value ($)",
    opacity = 1
  )
# Map on CartoDB Positron tiles with one circle per row of `bay_established`.
# Radius and color follow median house value (using the `pal` color scale
# defined earlier); clicking a circle opens a popup with the block group's
# details and hovering shows the house value as a label.
leaflet(bay_established) |>
  addProviderTiles(providers$CartoDB.Positron) |>
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = ~median_house_value / 100000,
    color = ~pal(median_house_value),
    fillOpacity = 0.75,
    stroke = TRUE,
    weight = 1,
    # format() pads every element of a vector to a common width by default,
    # which would render shorter values as "$ 94,600". trim = TRUE removes
    # that padding and scientific = FALSE keeps plain digits.
    popup = ~paste0(
      "<b>Median House Value:</b> $",
      format(
        median_house_value,
        big.mark = ",", trim = TRUE, scientific = FALSE
      ),
      "<br/>",
      "<b>Median Income:</b> $",
      format(
        round(median_income * 10000),
        big.mark = ",", trim = TRUE, scientific = FALSE
      ),
      "<br/>",
      "<b>Housing Median Age:</b> ", housing_median_age, " years<br/>",
      "<b>Population:</b> ", population
    ),
    label = ~paste0(
      "$",
      format(
        median_house_value,
        big.mark = ",", trim = TRUE, scientific = FALSE
      )
    )
  ) |>
  addLegend("topright", pal = pal, values = ~median_house_value,
            title = "Median House Value ($)", opacity = 1) |>
  # Initial view centered on the given coordinates at zoom level 9.
  setView(lng = -122.27, lat = 37.80, zoom = 9)

5. Write a paragraph

The analysis in this exercise focused on the NEAR BAY portion of the dataset, that is, the block groups located around the San Francisco Bay. Only block groups with an ocean_proximity of “NEAR BAY” and a housing median age of at least 35 years were included. The resulting subset contains 1,450 block groups (of the 2,290 NEAR BAY block groups), which exceeds the 900-observation limit for the problem rather than falling below it; it nevertheless represents a variety of block groups around the Bay.

The Plotly chart shows the relationship between median income and median house value for the NEAR BAY block groups of the San Francisco Bay Area. In general, the higher the median income of a block group, the higher the median value of its houses. However, the relationship tends to flatten out for block groups with high median house values (around $500,000). The color of each point represents the median age of the housing in that block group, and the coloring suggests that block groups with high median house values also tend to have some of the oldest housing in the Bay Area. Hovering over a point reveals the block group's median income, median house value, median housing age, and population.

The leaflet map of California complements the Plotly chart. Each circle on the map represents one block group in the data; the size and color of the circle represent the median value of the houses in that block group. Clicking on a circle reveals the block group's median income, median house value, median housing age, and population. The map again shows that the highest-valued houses are along the western shore of the Bay Area (such as in San Francisco and the Peninsula), while median house values are lower around the remainder of the Bay Area.