The ggplot2 and tidyverse libraries are used for data visualization and munging.
The lubridate library is used here for easy operations with datetime objects.
The fiftystater and geofacet packages are used for plotting the given values across the 50 states of the USA.
library(tidyverse)
library(ggplot2)
library(lubridate)
library(fiftystater)
library(geofacet)
# Load the state-level time series; the path is relative to the working
# directory.
state <- read.csv("State_time_series.csv")
# data_dict <- read.csv('DataDictionary.csv')
Zillow collects, processes and publishes housing and economic data from a variety of proprietary sources. To get an idea of the data we have, we shall ask several questions.
# Names of every column of `state` whose name contains "Tier".
tier <- grep("Tier", names(state), value = TRUE)

# Pool the tier columns into one long `value` column. Rows with a missing
# value in any selected column (or a date that fails to parse) are dropped
# before reshaping. `1:3` assumes exactly three columns end in "Tier".
total_distribution <- state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  na.omit() %>%
  gather(type, value, 1:3) %>%
  select(value)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2017c.1.0/
## zoneinfo/Asia/Kolkata'
# Strongly favour fixed over scientific notation when numbers are printed.
options(scipen = 999)

# Density of home values, one facet per tier. The pooled distribution from
# `total_distribution` has no `type` column, so it is drawn in gray behind
# every facet as a common reference.
state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  na.omit() %>%
  gather(type, value, 1:3) %>%
  ggplot(aes(x = value, fill = type)) +
  geom_density(alpha = 0.4, color = "transparent") +
  geom_density(
    data = total_distribution,
    aes(x = value),
    fill = "gray",
    color = "transparent",
    alpha = 0.5
  ) +
  facet_wrap(~type, nrow = 3) +
  labs(
    title = "How do the house values vary?",
    x = "In USD",
    caption = "The distribution in gray shows the overall distribution of the home values"
  ) +
  theme(
    panel.background = element_rect(fill = "#ffffff"),
    axis.text.x = element_text(vjust = -1, angle = 90),
    strip.background = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = -0.05, face = "italic")
  )
# Highest per-date median home value reached by each tier.
# Step 1 takes the median across all rows for every (date, tier) pair;
# step 2 keeps the largest of those medians per tier. The "ZHVI_" prefix is
# stripped from the tier label at the end.
state_max <- state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  gather(type, value, 1:3) %>%
  na.omit() %>%
  group_by(date, type) %>%
  summarise(median_value = median(value, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(type) %>%
  summarise(max_value = max(median_value)) %>%
  ungroup() %>%
  mutate(type = gsub("ZHVI_", "", type))
# Date(s) on which each tier's per-date median home value peaked.
#
# The peak is located *within* each tier. Comparing against the pooled set of
# all tiers' maxima would also keep a date whose median in one tier merely
# happens to equal another tier's peak. Ties inside a tier are all kept.
# Columns: date, type (with the "ZHVI_" prefix stripped), median_value.
state_when_max <- state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  gather(type, value, 1:3) %>%
  na.omit() %>%
  group_by(date, type) %>%
  summarise(median_value = median(value, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(type) %>%
  # max() returns one of the group's own values, so exact equality is safe.
  filter(median_value == max(median_value)) %>%
  ungroup() %>%
  mutate(type = gsub("ZHVI_", "", type))
# Per-date median home value of each tier, restricted to the window
# 2007-12-01 .. 2009-06-30 (inclusive). Going by the variable name this is
# presumably the 2007-2009 US recession -- confirm before relying on it.
state_recession <- state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  gather(type, value, 1:3) %>%
  na.omit() %>%
  group_by(date, type) %>%
  summarise(median_value = median(value, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(
    date >= as.Date("2007-12-01"),
    date <= as.Date("2009-06-30")
  ) %>%
  mutate(type = gsub("ZHVI_", "", type))
# Line chart of the per-date median home value for each tier. The peak rows
# in `state_when_max` are overlaid as a dashed drop line to y = 0, a date
# label above the peak and a tier label beside it.
state %>%
  select(ends_with("Tier")) %>%
  mutate(date = ymd(state$Date)) %>%
  gather(type, value, 1:3) %>%
  na.omit() %>%
  group_by(date, type) %>%
  summarise(median_value = median(value, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(type = gsub("ZHVI_", "", type)) %>%
  ggplot(aes(x = date, y = median_value, color = type)) +
  geom_line() +
  geom_segment(
    data = state_when_max,
    aes(x = date, y = 0, xend = date, color = type, yend = median_value),
    linetype = 2
  ) +
  # Date label: does not inherit the plot aesthetics, so it is not coloured
  # by tier.
  geom_text(
    data = state_when_max,
    aes(x = date, y = median_value, label = date),
    size = 5,
    inherit.aes = FALSE,
    vjust = -1
  ) +
  # Tier label: inherits the plot aesthetics, so it takes the tier colour.
  geom_text(
    data = state_when_max,
    aes(x = date, y = median_value, label = type),
    size = 4,
    inherit.aes = TRUE,
    vjust = 1,
    hjust = -2
  ) +
  labs(
    x = "Date",
    y = "Median Value USD",
    title = "Variation of Median Value of Houses by Tier Type",
    caption = "The vertical lines represent the dates when the median values were maximum"
  ) +
  theme(
    panel.background = element_rect(fill = "#ffffff"),
    axis.text.x = element_text(vjust = -1, angle = 90),
    strip.background = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = -0.05, face = "italic")
  )
#geom_rect(xmin=ymd('2007-12-01'),xmax=ymd('2009-06-30'),ymin=0,ymax=Inf,fill='gray',alpha=0.01,color='#ffffff')
# Expose the state name under `name` so it can be joined to the geofacet grid.
state_map <- rename(state, name = RegionName)

# Spaces are stripped from the grid's state names before the join; presumably
# RegionName is written without spaces (e.g. "NewYork") -- the region vectors
# later in the file use that spelling.
state_grid_modified <- geofacet::us_state_grid1 %>%
  mutate(name = gsub(" ", "", name)) %>%
  inner_join(state_map, by = "name")

# Restore the spaced spelling of the multi-word names after the join; every
# other name is left unchanged.
state_grid_modified <- state_grid_modified %>%
  mutate(
    name = dplyr::recode(
      name,
      WestVirginia = "West Virginia",
      SouthDakota = "South Dakota",
      RhodeIsland = "Rhode Island",
      NorthDakota = "North Dakota",
      NorthCarolina = "North Carolina",
      NewMexico = "New Mexico",
      NewJersey = "New Jersey",
      NewHampshire = "New Hampshire",
      SouthCarolina = "South Carolina",
      NewYork = "New York",
      DistrictofColumbia = "District of Columbia"
    )
  )

# One row per lower-cased state name with the median price-to-rent ratio over
# all of that state's rows.
state_map <- state_grid_modified %>%
  mutate(state = tolower(name)) %>%
  group_by(state) %>%
  summarise(median_ratio = median(PriceToRentRatio_AllHomes, na.rm = TRUE))
# Choropleth of the median price-to-rent ratio per state, filled on a
# green (low) to red (high) scale and rendered interactively with plotly.
# `state_map$state` holds lower-cased state names, used as the `map_id` that
# geom_map() matches against the `fifty_states` polygons.
p <- state_map %>%
  rename(`Median Ratio of Price to Rent` = median_ratio) %>%
  ggplot(aes(map_id = state)) +
  geom_map(aes(fill = `Median Ratio of Price to Rent`), map = fifty_states) +
  # geom_map() does not set axis limits itself; take them from the polygons.
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  scale_fill_continuous(low = "green", high = "red") +
  labs(
    fill = "",
    title = "Median Price to Rent Ratio",
    caption = "States in Red show that it is better to rent than to buy there",
    subtitle = "Median of Price to Rent Ratio is Taken"
  ) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = -0.05, face = "italic"),
    # Font size is a number of points, not a string.
    plot.subtitle = element_text(face = "italic", size = 8)
  )

plotly::ggplotly(p)
The above map shows that it is better to rent than to buy in the western parts of the United States. In the eastern part of the United States, Massachusetts is a state where it is better to rent than to buy.
# Tag every row of `state` with a US region label.
# `state` itself gains an empty `us_region` column; `state_with_region` is a
# copy in which that column holds the region label, or "" for any RegionName
# not listed below.
state$us_region <- ""

# Region membership, spelled the way RegionName is matched (no spaces).
new_england <- c(
  "Connecticut", "Maine", "Massachusetts", "NewHampshire", "RhodeIsland",
  "Vermont"
)
mid_atlantic <- c("NewJersey", "NewYork", "Pennsylvania")
east_north_central <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
west_north_central <- c(
  "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "NorthDakota",
  "SouthDakota"
)
south_atlantic <- c(
  "Delaware", "Florida", "Georgia", "Maryland", "NorthCarolina",
  "SouthCarolina", "Virginia", "DistrictofColumbia", "WestVirginia"
)
east_south_central <- c("Alabama", "Kentucky", "Mississippi", "Tennessee")
west_south_central <- c("Arkansas", "Louisiana", "Oklahoma", "Texas")
mountain <- c(
  "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "NewMexico", "Utah",
  "Wyoming"
)
pacific <- c("Alaska", "California", "Hawaii", "Oregon", "Washington")

# The region vectors are disjoint, so a single case_when() is enough. This
# replaces a chain of mutate() calls that assigned "New England" twice and
# misspelled the "Mid Atlantic" label, which appears as a facet title.
state_with_region <- state %>%
  mutate(
    us_region = case_when(
      RegionName %in% new_england ~ "New England",
      RegionName %in% mid_atlantic ~ "Mid Atlantic",
      RegionName %in% east_north_central ~ "East North Central",
      RegionName %in% west_north_central ~ "West North Central",
      RegionName %in% south_atlantic ~ "South Atlantic",
      RegionName %in% east_south_central ~ "East South Central",
      RegionName %in% west_south_central ~ "West South Central",
      RegionName %in% mountain ~ "Mountain",
      RegionName %in% pacific ~ "Pacific",
      TRUE ~ us_region
    )
  )
### ZHVI all homes VS Days on Zillow
# Scatter (jittered, very transparent) of days on Zillow against ZHVI with a
# smoothed trend, one facet per US region. Rows without a region label or
# with a missing value in either measure are left out.
state_with_region %>%
  filter(us_region != "") %>%
  select(us_region, ZHVI_AllHomes, DaysOnZillow_AllHomes) %>%
  na.omit() %>%
  ggplot(
    aes(x = ZHVI_AllHomes, y = DaysOnZillow_AllHomes, color = us_region)
  ) +
  geom_jitter(alpha = 0.05) +
  geom_smooth() +
  facet_wrap(~us_region, nrow = 3) +
  labs(
    title = "Days on Zillow VS Zillow Home Value Index",
    x = "ZHVI All Homes (USD)",
    y = "Days on Zillow (All Homes)"
  ) +
  theme(
    strip.background = element_blank(),
    legend.position = "none",
    panel.background = element_blank(),
    axis.text.x = element_text(vjust = -1, angle = 90),
    plot.title = element_text(hjust = 0.5, size = 15),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    strip.text = element_text(size = 15)
  )
Property prices in the east south central region (Alabama, Kentucky, Mississippi, Tennessee) of the United States go down very drastically, followed by states in the west south central region (Arkansas, Louisiana, Oklahoma, Texas) of the United States.
Property prices in the mid atlantic region (New Jersey, New York, Pennsylvania) and the east north central region (Illinois, Indiana, Michigan, Ohio, Wisconsin) first go down as the number of days increases and then go up after a certain point.
# Per-date median of ZriPerSqft_AllHomes together with its change from the
# previous available date, in long format (one row per date and series).
# Dates are sorted newest first, so lead() yields the previous date's median;
# the oldest date has no predecessor and is dropped.
data_median_rent_per_sq_feet <- state %>%
  select(ZriPerSqft_AllHomes, Date) %>%
  mutate(Date = ymd(Date)) %>%
  na.omit() %>%
  group_by(Date) %>%
  summarise(median_value = median(ZriPerSqft_AllHomes, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(Date)) %>%
  mutate(diff = median_value - lead(median_value)) %>%
  na.omit() %>%
  select(Date, median_value, diff) %>%
  gather(type, value, 2:3) %>%
  mutate(
    type = ifelse(
      type == "median_value",
      "Median Value Of House Rent Per square Feet",
      "Rate Of Increase"
    )
  )

# The single row with the largest change between consecutive dates.
data_max_rent_per_sq_feet <- data_median_rent_per_sq_feet %>%
  filter(grepl("Rate", type)) %>%
  arrange(desc(value)) %>%
  head(1)
# Two stacked panels with independent scales: the median rent per square
# foot and its date-to-date change. A dotted vertical line marks the date of
# the largest change, which is also spelled out in the caption.
data_median_rent_per_sq_feet %>%
  ggplot(aes(x = Date, y = value, color = type)) +
  geom_line() +
  geom_vline(
    aes(xintercept = data_max_rent_per_sq_feet$Date),
    linetype = 3
  ) +
  facet_wrap(~type, nrow = 2, scales = "free") +
  labs(
    x = "Date",
    y = "",
    caption = stringr::str_c(
      "The vertical line represents the date on which the rate of change of rent per square feet was the highest :",
      data_max_rent_per_sq_feet$Date
    )
  ) +
  theme(
    panel.background = element_blank(),
    strip.background = element_blank(),
    legend.position = "none",
    plot.caption = element_text(hjust = -0.05, face = "italic"),
    strip.text = element_text(size = 12)
  )
2014-10-31
.