Loading the libraries and data set

library(tidyverse)
library(ggplot2)
library(lubridate)
library(fiftystater)
library(geofacet)

# Read the state-level time series into `state`.
# The path is relative to the current working directory.
state <- read.csv(file = "State_time_series.csv")

# Data dictionary load, left disabled:
# data_dict <- read.csv('DataDictionary.csv')

Introduction

Zillow collects, processes, and publishes housing and economic data from a variety of proprietary sources. To get an idea of the data we have, we shall ask several questions.

How do the home values change by tier?

# Names of every column in `state` whose name contains "Tier".
tier <- grep("Tier", names(state), value = TRUE)

# Long-format tier values, built once and reused below.
# Rows with an NA in any tier column (or an unparseable date) are dropped
# *before* reshaping, so every remaining row contributes to all three tiers.
tier_values_long <- state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  na.omit() %>%
  gather(type, value, 1:3)

# Pooled values across all tiers; drawn as the gray reference density.
total_distribution <- tier_values_long %>%
  select(value)

# NOTE: the original run emitted an "unknown timezone" warning from
# as.POSIXlt.POSIXct at this point.

# Print axis values in fixed notation rather than scientific notation.
options(scipen = 999)

ggplot(tier_values_long, aes(x = value, fill = type)) +
  geom_density(alpha = 0.4, color = "transparent") +
  facet_wrap(~type, nrow = 3) +
  theme(
    panel.background = element_rect(fill = "#ffffff"),
    axis.text.x = element_text(vjust = -1, angle = 90),
    strip.background = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = -0.05, face = "italic")
  ) +
  labs(
    title = "How do the house values vary?",
    x = "In USD",
    caption = "The distribution in gray shows the overall distribution of the home values"
  ) +
  # `total_distribution` has no `type` column, so this layer is repeated in
  # every facet; the fixed `fill` overrides the inherited fill mapping.
  geom_density(
    data = total_distribution,
    aes(x = value),
    fill = "gray",
    color = "transparent",
    alpha = 0.5
  )

How do the home values change with time across tiers?

# Median home value per (date, tier), computed once and shared by every
# derived table and by the plot below. The "ZHVI_" prefix is stripped from
# the tier names so that labels read e.g. "TopTier".
tier_median_by_date <- state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  gather(type, value, 1:3) %>%
  na.omit() %>%
  group_by(date, type) %>%
  summarise(median_value = median(value, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(type = gsub("ZHVI_", "", type))

# Highest median value reached by each tier.
state_max <- tier_median_by_date %>%
  group_by(type) %>%
  summarise(max_value = max(median_value)) %>%
  ungroup()

# Row(s) at which each tier reaches its own maximum.
# The maximum is looked up *within* each tier: matching against the pooled
# set of maxima (`%in% state_max$max_value`) would also flag a tier whose
# median merely happens to equal another tier's maximum.
state_when_max <- tier_median_by_date %>%
  group_by(type) %>%
  filter(median_value == max(median_value)) %>%
  ungroup()

# Medians restricted to 2007-12-01 .. 2009-06-30 (inclusive).
# Only referenced by the disabled geom_rect() at the end of this block.
state_recession <- tier_median_by_date %>%
  filter(date >= ymd("2007-12-01") & date <= ymd("2009-06-30"))

# Line chart of the median value per tier, annotated with the date at which
# each tier peaked.
tier_median_by_date %>%
  ggplot(aes(x = date, y = median_value, color = type)) +
  geom_line() +
  theme(
    panel.background = element_rect(fill = "#ffffff"),
    axis.text.x = element_text(vjust = -1, angle = 90),
    strip.background = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = -0.05, face = "italic")
  ) +
  labs(
    x = "Date",
    y = "Median Value USD",
    title = "Variation of Median Value of Houses by Tier Type",
    caption = "The vertical lines represent the dates when the median values were maximum"
  ) +
  # Dashed drop line from each tier's peak down to y = 0.
  geom_segment(
    data = state_when_max,
    aes(x = date, y = 0, xend = date, color = type, yend = median_value),
    linetype = 2
  ) +
  # Peak date, printed above the peak without inheriting the colour mapping.
  geom_text(
    data = state_when_max,
    aes(x = date, y = median_value, label = date),
    size = 5,
    inherit.aes = FALSE,
    vjust = -1
  ) +
  # Tier name next to the peak; inherits the colour mapping.
  geom_text(
    data = state_when_max,
    aes(x = date, y = median_value, label = type),
    size = 4,
    inherit.aes = TRUE,
    vjust = 1,
    hjust = -2
  )

  #geom_rect(xmin=ymd('2007-12-01'),xmax=ymd('2009-06-30'),ymin=0,ymax=Inf,fill='gray',alpha=0.01,color='#ffffff')

How does the median price-to-rent ratio change over time and across regions?

# Attach the geofacet grid information to every row of `state`.
# `RegionName` holds state names without spaces (e.g. "NewYork"), whereas the
# grid uses the spaced spelling. The join therefore goes through a temporary
# space-free key, and `name` keeps the grid's original spelling. This avoids
# stripping the spaces and then restoring them one state at a time.
state_grid_modified <- geofacet::us_state_grid1 %>%
  mutate(name_key = gsub(" ", "", name)) %>%
  inner_join(state, by = c("name_key" = "RegionName")) %>%
  select(-name_key)

# One row per state: lower-cased state name (the key used by `fifty_states`)
# and the median of PriceToRentRatio_AllHomes over all of that state's rows.
state_map <- state_grid_modified %>%
  mutate(state = tolower(name)) %>%
  group_by(state) %>%
  summarise(median_ratio = median(PriceToRentRatio_AllHomes, na.rm = TRUE))

# Choropleth of the per-state median ratio; the fill runs from green (low)
# to red (high).
p <- state_map %>%
  rename(`Median Ratio of Price to Rent` = median_ratio) %>%
  ggplot(aes(map_id = state)) +
  geom_map(aes(fill = `Median Ratio of Price to Rent`), map = fifty_states) +
  # geom_map() does not set axis limits itself; expand them to the map extent.
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = -0.05, face = "italic"),
    # Font size must be numeric, not the string '8'.
    plot.subtitle = element_text(face = "italic", size = 8)
  ) +
  labs(
    fill = "",
    title = "Median Price to Rent Ratio",
    caption = "States in Red show that it is better to rent than to buy there",
    subtitle = "Median of Price to Rent Ratio is Taken"
  ) +
  scale_fill_continuous(low = "green", high = "red")

# Render the static map as an interactive plotly widget.
plotly::ggplotly(p)

The above map shows that it is better to rent than to buy in the western parts of the United States. In the eastern part of the country, Massachusetts is a state where it is better to rent than to buy.

How does the number of days a home stays listed on Zillow vary with the median home value?

# Default region for rows whose RegionName is not listed below.
state$us_region <- ''

# Membership lists, one per region. Names follow the space-free spelling
# used in `RegionName` (e.g. "NewHampshire").
new_england <- c('Connecticut', 'Maine', 'Massachusetts', 'NewHampshire', 'RhodeIsland','Vermont')
mid_atlantic <- c('NewJersey', 'NewYork', 'Pennsylvania')
east_north_central <- c('Illinois', 'Indiana', 'Michigan', 'Ohio','Wisconsin')
west_north_central <- c('Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'NorthDakota', 'SouthDakota')
south_atlantic <- c('Delaware', 'Florida', 'Georgia', 'Maryland', 'NorthCarolina', 'SouthCarolina', 'Virginia', 'DistrictofColumbia','WestVirginia')
east_south_central <- c('Alabama', 'Kentucky', 'Mississippi', 'Tennessee')
west_south_central <- c('Arkansas', 'Louisiana', 'Oklahoma', 'Texas')
mountain <- c('Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'NewMexico', 'Utah', 'Wyoming')
pacific <- c('Alaska', 'California', 'Hawaii', 'Oregon', 'Washington')

# Label every row with its region in a single pass. The membership lists are
# disjoint, so the order of the conditions does not affect the result; rows
# that match none keep their existing `us_region` (the empty string).
# The label for `mid_atlantic` is spelled "Mid Atlantic" (it was misspelled
# "Mid Altantic"); it is displayed as a facet title.
state_with_region <- state %>%
  mutate(us_region = case_when(
    RegionName %in% new_england ~ "New England",
    RegionName %in% mid_atlantic ~ "Mid Atlantic",
    RegionName %in% east_north_central ~ "East North Central",
    RegionName %in% west_north_central ~ "West North Central",
    RegionName %in% south_atlantic ~ "South Atlantic",
    RegionName %in% east_south_central ~ "East South Central",
    RegionName %in% west_south_central ~ "West South Central",
    RegionName %in% mountain ~ "Mountain",
    RegionName %in% pacific ~ "Pacific",
    TRUE ~ us_region
  ))


### ZHVI all homes VS Days on Zillow

# Scatter of DaysOnZillow_AllHomes against ZHVI_AllHomes, one facet per
# region, with a smoothed trend line on top of heavily transparent points.
# Rows without a region label or with a missing value in either measure are
# excluded.
state_with_region %>%
  filter(us_region != "") %>%
  select(us_region, ZHVI_AllHomes, DaysOnZillow_AllHomes) %>%
  na.omit() %>%
  ggplot(aes(x = ZHVI_AllHomes, y = DaysOnZillow_AllHomes, color = us_region)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth() +
  facet_wrap(~us_region, nrow = 3) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 15),
    legend.position = "none",
    panel.background = element_blank(),
    axis.text.x = element_text(vjust = -1, angle = 90),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    plot.title = element_text(hjust = 0.5, size = 15)
  ) +
  labs(
    title = "Days on Zillow VS Zillow Home Value Index",
    x = "ZHVI All Homes (USD)",
    y = "Days on Zillow (All Homes)"
  )

How does the monthly rent for all homes vary over time?

# Per-date median of ZriPerSqft_AllHomes together with its change from the
# previous date, reshaped to long format so both series can be faceted.
data_median_rent_per_sq_feet <- state %>%
  select(ZriPerSqft_AllHomes, Date) %>%
  mutate(Date = ymd(Date)) %>%
  na.omit() %>%
  group_by(Date) %>%
  summarise(median_value = median(ZriPerSqft_AllHomes, na.rm = TRUE)) %>%
  ungroup() %>%
  # Newest first, so lead() returns the value of the preceding date.
  arrange(desc(Date)) %>%
  mutate(lead_median_value = lead(median_value)) %>%
  # Drops the earliest date, which has no preceding value to compare against.
  na.omit() %>%
  mutate(diff = median_value - lead_median_value) %>%
  select(Date, median_value, diff) %>%
  gather(type, value, 2:3) %>%
  mutate(type = ifelse(type == "median_value",
                       "Median Value Of House Rent Per square Feet",
                       "Rate Of Increase"))

# The single row with the largest date-over-date increase.
data_max_rent_per_sq_feet <- data_median_rent_per_sq_feet %>%
  filter(grepl("Rate", type)) %>%
  arrange(desc(value)) %>%
  head(1)

# Two stacked panels (level and change) with independent scales.
data_median_rent_per_sq_feet %>%
  ggplot(aes(x = Date, y = value, color = type)) +
  geom_line() +
  theme(
    panel.background = element_blank(),
    strip.background = element_blank(),
    legend.position = "none",
    plot.caption = element_text(hjust = -0.05, face = "italic"),
    strip.text = element_text(size = 12)
  ) +
  labs(
    x = "Date",
    y = "",
    caption = stringr::str_c(
      "The vertical line represents the date on which the rate of change of rent per square feet was the highest :",
      data_max_rent_per_sq_feet$Date
    )
  ) +
  facet_wrap(~type, nrow = 2, scales = "free") +
  # Passed as a fixed parameter rather than through aes(): the line is drawn
  # once in each panel instead of once per row of the plot data, and `$` is
  # kept out of aes().
  geom_vline(xintercept = data_max_rent_per_sq_feet$Date, linetype = 3)