library(stringr)
library(dplyr)
library(tidyr)
library(zoo)
library(ggplot2)
library(knitr)
library(rvest)
library(tibble)
We start by importing the data from a csv file in Github. We can see that the table is in a “wide” format and needs to be cleaned up.
raw_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_607/master/City_MedianValuePerSqft_AllHomes.csv', stringsAsFactors=F)
raw_df %>% head() %>% kable()
| RegionID | RegionName | State | Metro | CountyName | SizeRank | X1997 | X1998 | X1999 | X2000 | X2001 | X2002 | X2003 | X2004 | X2005 | X2006 | X2007 | X2008 | X2009 | X2010 | X2011 | X2012 | X2013 | X2014 | X2015 | X2016 | X2017 | X2018 | X2019 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6181 | New York | NY | New York-Newark-Jersey City | Queens County | 1 | 132 | 138 | 149 | 167 | 187 | 215 | 226 | 275 | 329 | 386 | 399 | 394 | 364 | 349 | 340 | 336 | 343 | 370 | 402 | 433 | 466 | 499 | 529 |
| 12447 | Los Angeles | CA | Los Angeles-Long Beach-Anaheim | Los Angeles County | 2 | 111 | 117 | 129 | 142 | 158 | 178 | 215 | 268 | 345 | 408 | 418 | 378 | 302 | 274 | 263 | 246 | 269 | 320 | 345 | 375 | 410 | 446 | 471 |
| 17426 | Chicago | IL | Chicago-Naperville-Elgin | Cook County | 3 | 85 | 82 | 99 | 105 | 119 | 129 | 142 | 159 | 181 | 205 | 214 | 208 | 177 | 144 | 132 | 120 | 121 | 132 | 139 | 150 | 158 | 167 | 173 |
| 39051 | Houston | TX | Houston-The Woodlands-Sugar Land | Harris County | 4 | 51 | 53 | 57 | 60 | 61 | 62 | 65 | 66 | 73 | 74 | 76 | 77 | 73 | 71 | 70 | 69 | 69 | 73 | 79 | 86 | 93 | 100 | 106 |
| 6915 | San Antonio | TX | San Antonio-New Braunfels | Bexar County | 5 | 53 | 54 | 55 | 58 | 57 | 56 | 59 | 61 | 67 | 70 | 75 | 80 | 75 | 74 | 73 | 72 | 73 | 76 | 80 | 87 | 92 | 100 | 111 |
| 13271 | Philadelphia | PA | Philadelphia-Camden-Wilmington | Philadelphia County | 6 | 37 | 39 | 39 | 41 | 43 | 46 | 51 | 58 | 71 | 87 | 94 | 96 | 91 | 91 | 86 | 82 | 79 | 82 | 84 | 88 | 99 | 112 | 124 |
We use gather() to go from wide to narrow, then gsub() to clean up the new column. The new table is no longer in a wide format.
city_df <- raw_df %>%
gather(key = year, value = value_per_sqft, X1997:X2019) %>%
mutate(year=as.integer(gsub("X", "", year)))
city_df %>% head() %>% kable()
| RegionID | RegionName | State | Metro | CountyName | SizeRank | year | value_per_sqft |
|---|---|---|---|---|---|---|---|
| 6181 | New York | NY | New York-Newark-Jersey City | Queens County | 1 | 1997 | 132 |
| 12447 | Los Angeles | CA | Los Angeles-Long Beach-Anaheim | Los Angeles County | 2 | 1997 | 111 |
| 17426 | Chicago | IL | Chicago-Naperville-Elgin | Cook County | 3 | 1997 | 85 |
| 39051 | Houston | TX | Houston-The Woodlands-Sugar Land | Harris County | 4 | 1997 | 51 |
| 6915 | San Antonio | TX | San Antonio-New Braunfels | Bexar County | 5 | 1997 | 53 |
| 13271 | Philadelphia | PA | Philadelphia-Camden-Wilmington | Philadelphia County | 6 | 1997 | 37 |
How has the national median value per square foot changed throughout the years?
city_grp_year_df <- city_df %>%
group_by(year) %>%
summarise(national_value_per_sqft=median(value_per_sqft, na.rm=T))
city_grp_year_df %>% kable()
| year | national_value_per_sqft |
|---|---|
| 1997 | 62 |
| 1998 | 64 |
| 1999 | 67 |
| 2000 | 71 |
| 2001 | 75 |
| 2002 | 79 |
| 2003 | 85 |
| 2004 | 91 |
| 2005 | 100 |
| 2006 | 110 |
| 2007 | 115 |
| 2008 | 113 |
| 2009 | 105 |
| 2010 | 99 |
| 2011 | 95 |
| 2012 | 91 |
| 2013 | 92 |
| 2014 | 96 |
| 2015 | 99 |
| 2016 | 103 |
| 2017 | 109 |
| 2018 | 116 |
| 2019 | 124 |
city_grp_year_df %>%
ggplot(aes(x=year, y=national_value_per_sqft)) +
geom_point() + geom_line() +
labs(title = "National Median Home Value per Square Foot", x="Year", y="Dollars")
From the graph we can see values have doubled since 1997. We can also see the huge effect of the mortgage crisis in 2008. It took until 2018 (i.e., 10 years) to get back to 2008 levels.
How does New York City compare to the national median?
We can filter the data by New York City’s RegionID and then join it to the previous National data frame.
nyc_df <- city_df %>% filter(RegionID == 6181) %>% select(year, nyc_value_per_sqft=value_per_sqft)
national_nyc_df <- city_grp_year_df %>% left_join(nyc_df, by="year")
national_nyc_df %>% kable()
| year | national_value_per_sqft | nyc_value_per_sqft |
|---|---|---|
| 1997 | 62 | 132 |
| 1998 | 64 | 138 |
| 1999 | 67 | 149 |
| 2000 | 71 | 167 |
| 2001 | 75 | 187 |
| 2002 | 79 | 215 |
| 2003 | 85 | 226 |
| 2004 | 91 | 275 |
| 2005 | 100 | 329 |
| 2006 | 110 | 386 |
| 2007 | 115 | 399 |
| 2008 | 113 | 394 |
| 2009 | 105 | 364 |
| 2010 | 99 | 349 |
| 2011 | 95 | 340 |
| 2012 | 91 | 336 |
| 2013 | 92 | 343 |
| 2014 | 96 | 370 |
| 2015 | 99 | 402 |
| 2016 | 103 | 433 |
| 2017 | 109 | 466 |
| 2018 | 116 | 499 |
| 2019 | 124 | 529 |
national_nyc_df %>% ggplot(aes(x=year)) +
geom_point(aes(y=national_value_per_sqft)) +
geom_line(aes(y=national_value_per_sqft, color="National")) +
geom_point(aes(y=nyc_value_per_sqft)) +
geom_line(aes(y=nyc_value_per_sqft, color="NYC")) +
labs(title="Home Value per Square Foot", x="Year", y="Dollars", colour="") +
scale_colour_manual(values=c("black", "red"))
In 1997, New York City was about twice the national value. Now it is over four times the national value! From peak to valley of the mortgage crisis, NYC lost only about 16% of its value compared to 21% at the national level.
Which city is the most expensive each year?
city_df %>% group_by(year) %>%
arrange(year, desc(value_per_sqft)) %>%
slice(1) %>%
select(year, everything()) %>% kable()
| year | RegionID | RegionName | State | Metro | CountyName | SizeRank | value_per_sqft |
|---|---|---|---|---|---|---|---|
| 1997 | 4721 | Fisher Island | FL | Miami-Fort Lauderdale-West Palm Beach | Miami-Dade County | 11976 | 437 |
| 1998 | 4721 | Fisher Island | FL | Miami-Fort Lauderdale-West Palm Beach | Miami-Dade County | 11976 | 425 |
| 1999 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 478 |
| 2000 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 628 |
| 2001 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 788 |
| 2002 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 867 |
| 2003 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 936 |
| 2004 | 124901 | Jupiter Island | FL | Port St. Lucie | Martin County | 11487 | 1051 |
| 2005 | 124901 | Jupiter Island | FL | Port St. Lucie | Martin County | 11487 | 1344 |
| 2006 | 124901 | Jupiter Island | FL | Port St. Lucie | Martin County | 11487 | 1400 |
| 2007 | 124901 | Jupiter Island | FL | Port St. Lucie | Martin County | 11487 | 1287 |
| 2008 | 124901 | Jupiter Island | FL | Port St. Lucie | Martin County | 11487 | 1098 |
| 2009 | 49856 | Stinson Beach | CA | San Francisco-Oakland-Hayward | Marin County | 11883 | 1016 |
| 2010 | 49856 | Stinson Beach | CA | San Francisco-Oakland-Hayward | Marin County | 11883 | 921 |
| 2011 | 49856 | Stinson Beach | CA | San Francisco-Oakland-Hayward | Marin County | 11883 | 887 |
| 2012 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 874 |
| 2013 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 1078 |
| 2014 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 1213 |
| 2015 | 26374 | Palo Alto | CA | San Jose-Sunnyvale-Santa Clara | Santa Clara County | 692 | 1329 |
| 2016 | 26374 | Palo Alto | CA | San Jose-Sunnyvale-Santa Clara | Santa Clara County | 692 | 1495 |
| 2017 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 1620 |
| 2018 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 1806 |
| 2019 | 30280 | Atherton | CA | San Francisco-Oakland-Hayward | San Mateo County | 5480 | 1793 |
There are some unusual cities in that table. It might make more sense to put in a filter for population size.
city_df %>% filter(SizeRank<=200) %>%
group_by(year) %>%
arrange(year, desc(value_per_sqft)) %>%
slice(1) %>%
select(year, everything()) %>% kable()
| year | RegionID | RegionName | State | Metro | CountyName | SizeRank | value_per_sqft |
|---|---|---|---|---|---|---|---|
| 1997 | 18615 | Honolulu | HI | Urban Honolulu | Honolulu County | 48 | 248 |
| 1998 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 240 |
| 1999 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 276 |
| 2000 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 352 |
| 2001 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 419 |
| 2002 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 408 |
| 2003 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 445 |
| 2004 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 483 |
| 2005 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 590 |
| 2006 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 626 |
| 2007 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 625 |
| 2008 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 635 |
| 2009 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 581 |
| 2010 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 549 |
| 2011 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 539 |
| 2012 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 527 |
| 2013 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 603 |
| 2014 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 715 |
| 2015 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 810 |
| 2016 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 914 |
| 2017 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 930 |
| 2018 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 1013 |
| 2019 | 20330 | San Francisco | CA | San Francisco-Oakland-Hayward | San Francisco County | 14 | 1071 |
After filtering for the top 200 biggest cities, we can safely pity someone looking for housing in the San Francisco area right now. How does San Fran compare to NYC and national values?
sf_df <- city_df %>% filter(RegionID == 20330) %>% select(year, sf_value_per_sqft=value_per_sqft)
national_nyc_sf_df <- national_nyc_df %>% left_join(sf_df, by="year")
national_nyc_sf_df %>% kable()
| year | national_value_per_sqft | nyc_value_per_sqft | sf_value_per_sqft |
|---|---|---|---|
| 1997 | 62 | 132 | 208 |
| 1998 | 64 | 138 | 240 |
| 1999 | 67 | 149 | 276 |
| 2000 | 71 | 167 | 352 |
| 2001 | 75 | 187 | 419 |
| 2002 | 79 | 215 | 408 |
| 2003 | 85 | 226 | 445 |
| 2004 | 91 | 275 | 483 |
| 2005 | 100 | 329 | 590 |
| 2006 | 110 | 386 | 626 |
| 2007 | 115 | 399 | 625 |
| 2008 | 113 | 394 | 635 |
| 2009 | 105 | 364 | 581 |
| 2010 | 99 | 349 | 549 |
| 2011 | 95 | 340 | 539 |
| 2012 | 91 | 336 | 527 |
| 2013 | 92 | 343 | 603 |
| 2014 | 96 | 370 | 715 |
| 2015 | 99 | 402 | 810 |
| 2016 | 103 | 433 | 914 |
| 2017 | 109 | 466 | 930 |
| 2018 | 116 | 499 | 1013 |
| 2019 | 124 | 529 | 1071 |
national_nyc_sf_df %>% ggplot(aes(x=year)) +
geom_point(aes(y=national_value_per_sqft)) +
geom_line(aes(y=national_value_per_sqft, color="National")) +
geom_point(aes(y=nyc_value_per_sqft)) +
geom_line(aes(y=nyc_value_per_sqft, color="NYC")) +
geom_point(aes(y=sf_value_per_sqft)) +
geom_line(aes(y=sf_value_per_sqft, color="SF")) +
labs(title="Home Value per Square Foot", x="Year", y="Dollars", colour="") +
scale_colour_manual(values=c("black", "red", "blue"))
From the graph we can see it’s really in the past seven or eight years that San Francisco’s value has exploded.
We imported a dataset of Zillow Home Values and transformed the table from a wide to narrow format, thereby making analysis easier. We then looked at how the national home value has changed in the last 22 years. Finally, we compared the national values to those of New York City and San Francisco.
Going forward, it might be interesting to try and find this data at a level more granular than city/metro area. Some cities are so big that they encompass a wide variation of home values just within themselves. For example, while Manhattan and Newark may both techincally fall into the category of New York Metropolitan area, surely there is a large discrepancy in home values between them.