# Instructions
# . The research question is can we predict which three zip codes provide the best 
# investment opportunity for the Syracuse Real Estate Investment Trust (SREIT)?  
# . Using the base data available from Zillow 
# (files.zillowstatic.com/research/public/Zip/Zip_Zhvi_SingleFamilyResidence.csv)
# o Review the data - clean as appropriate
# o Provide an initial data analysis
# . Develop a thematic map(s) of the United States:
#  o    By state
# o Median housing for Dec (state average)
# o Present years from 2003-2011
# . Develop time series plots for the following Arkansas metro areas:
#  o    Hot Springs, Little Rock, Fayetteville, Searcy
# o Present all values from 1997 to 2013
# o Average at the metro area level
# . Using data from Zillow and other sources (think Bureau of Labor Statistics and Census data):
#  o    Develop model for forecasting average median housing value for 2014 
# o Use the average of 2014-01, 2014-02, 2014-03 as your test set
# o Consolidate monthly data into an annual average
# . Answer the following questions:
#  o    What three zip codes provide the best investment opportunity for the SREIT?
# o Why?

# Well, I tried many many options, so I installed a ton of packages. 

# Load packages from the class lecture
library(PBSmapping)
library(maptools)
library(RCurl)
library(RJSONIO)
library(foreign)
# Additional packages
# devtools::install_github("twitter/AnomalyDetection") # AnomalyDetection
library(AnomalyDetection)
library(ggplot2)
library(Rcpp)
library(timeDate)
library(data.table)
library(tseries)
library(lubridate)
library(forecast)
# devtools::install_github("ellisp/forecastxgb-r-package/pkg") # forecastxgb
library(forecastxgb)
library(caret)
library(qlcMatrix)
library(xgboost)
library(dplyr)
library(plotly)
library(ggthemes)
library(USAboundaries)
library(sp)
library(maptools)
library(ggplot2)
library(rgeos)
library(rgdal)
library(reshape)
library(gpclib)
library(dplyr)
library(data.table)
library(USAboundaries)
library(raster)
library(RCurl)    
library(tidyr)      
library(stringr)
library(maps)
library(ggmap)
# Time Series packages
require(astsa, quietly=TRUE, warn.conflicts=FALSE)
require(knitr)
library(astsa)
library(knitr)

# Work from the lab folder so that the relative CSV paths below resolve.
setwd(dir = "C:/DC/Advanced Information Analytics/Labs")

# Print large numbers in full rather than in scientific notation.
options(scipen = 999)

# 1. Review the data - clean as appropriate.
# Load the raw Zillow export and inspect its structure.
real.estate <- read.csv(file = "Zillo BB data.csv", header = TRUE)
str(object = real.estate) # 11279 obs. of  228 variables.
summary(object = real.estate)
glimpse(real.estate)

# Keep only the rows that have no missing value in any column.
real.estate.cleaned <- na.omit(object = real.estate)
str(object = real.estate.cleaned) # 9283 obs. of  228 variables.

# Persist the cleaned data frame as a .csv file for the later Excel steps.
write.csv(x = real.estate.cleaned, file = "real.estate.cleaned.csv")
# 9283 obs. of  228 variables: 1996 of the 11279 rows were dropped (about 18%).
str(object = real.estate.cleaned)
# Some text columns still contain empty strings (e.g. "Metro" has 219 empty
# cells); na.omit() does not treat those as missing. This is not crucial for
# the research question.
summary(object = real.estate.cleaned)
glimpse(real.estate.cleaned)
# Question 2.   Develop a thematic map(s) of the United States:
#  o    By state
# o Median housing for Dec (state average)

# Aggregate data by state: one row per state holding the mean of each column.
# Only numeric columns are averaged. Calling mean() on text/factor columns
# returns NA with a warning for every group, which only adds noise to the
# result. The grouping column keeps aggregate()'s default name ("Group.1").
numeric.columns <- vapply(real.estate.cleaned, is.numeric, logical(1))
real.estate.cleaned.by.state <- aggregate(
  real.estate.cleaned[numeric.columns],
  by = list(real.estate.cleaned$State),
  FUN = mean
)
# Saved so the December / yearly state averages can be derived from it.
write.csv(real.estate.cleaned.by.state, file = "real.estate.cleaned.by.state.csv")
# This gives us the data summarized by state, with the mean of the median housing values.
# Correctness of this code was double checked in Excel.
# But because of removed data I now have only ... 40 states. 10 states were lost. 

# The December figures were prepared by hand in Excel: everything except the
# December columns was deleted and the average was calculated for each state.
real.estate.cleaned.by.state.Dec <- read.csv(
  file = "real.estate.cleaned.by.state2.csv",
  header = TRUE
)
# Ten states are absent because of na.omit():
# Alaska, Idaho, Kansas, Maine, Montana, New Mexico, North Dakota,
# South Dakota, Vermont, West Virginia.
# They were given the value 0 in the file.
head(x = real.estate.cleaned.by.state.Dec)
##         State Mean.of.median.housing
## 1     Alabama               116777.7
## 2    Arkansas               104362.0
## 3     Arizona               188516.7
## 4  California               416353.8
## 5    Colorado               217096.9
## 6 Connecticut               285076.4
# State                  Mean.of.median.housing  
# 1     Alabama               116777.7 
# 2    Arkansas               104362.0 

# Map with "fiftystater" package
# devtools::install_github("wmurphyrd/fiftystater")
library(fiftystater)
data("fifty_states") # this line is optional due to lazy data loading
december <- real.estate.cleaned.by.state.Dec # shorten the title
names(december) # "State" "Mean.of.median.housing"
## [1] "State"                  "Mean.of.median.housing"
# Two sources were used to build this map:
# First, the "fiftystater" package
# https://github.com/wmurphyrd/fiftystater
# Second, a question asked on stackoverflow about a problem met on the way
# https://stackoverflow.com/questions/45107302/error-in-seq-lennrowdata-1-argument-must-be-coercible-to-non-negative-in/45110592#45110592

# State names are lower-cased before being used as map_id for geom_map().
december$statelower <- tolower(december$State)

# Choropleth of the December state averages.
# Columns are referenced by bare name inside aes() (not december$...), so the
# aesthetics are looked up in the data frame handed to ggplot().
# The legend deliberately has no title (guide_legend(title = NULL)).
Map <- ggplot(december, aes(map_id = statelower)) +
  geom_map(aes(fill = Mean.of.median.housing),
           color = "#ffffff", map = fifty_states) +
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL) +
  labs(x = "", y = "",
       title = "US states by the mean of median housing price in December",
       subtitle = "Source: zillowstatic.com") +
  guides(fill = guide_legend(title = NULL)) +
  theme(panel.background = element_blank(),
        axis.line.x = element_line(size = .5, colour = "black"),
        axis.title = element_text(size = 14),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "vertical",
        legend.key.size = unit(0.7, "cm"),
        text = element_text(family = "OfficinaSanITC-Book"),
        plot.title = element_text(family = "OfficinaSanITC-Book"))
Map

# So, now we can see that the highest median housing values are in California and Hawaii.
# Ten states have a dark blue color because their data was missing and I put zero there. 

# Question 3. 
# . Develop a thematic map(s) of the United States:
#  o    By state
# o Present years from 2003-2011

# Now I need to summarize data by year
# Again, data was cleaned in Excel.
# I also filled in all ten absent states with 0. 

# Yearly state averages (2003-2011), prepared in Excel; print the state labels.
housing.by.year <- read.csv(
  file = "real.estate.cleaned.by.state03-11.csv",
  header = TRUE
)
housing.by.year[["State"]]
##  [1] Alabama        Arkansas       Arizona        California    
##  [5] Colorado       Connecticut    Delaware       Florida       
##  [9] Georgia        Hawaii         Iowa           Illinois      
## [13] Indiana        Kentucky       Louisiana      Massachusetts 
## [17] Maryland       Michigan       Minnesota      Missouri      
## [21] Mississippi    North Carolina Nebraska       New Hampshire 
## [25] New Jersey     Nevada         New York       Ohio          
## [29] Oklahoma       Oregon         Pennsylvania   Rhode Island  
## [33] South Carolina Tennessee      Texas          Utah          
## [37] Virginia       Washington     Wisconsin      Wyoming       
## [41] Alaska         Idaho          Kansas         Maine         
## [45] Montana        New Mexico     NorthDakota    South Dakota  
## [49] Vermont        West Virginia 
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
# Now I need to map all years from 2003 to 2011.

# Map 2003
# Lower-case state names are used as map_id for geom_map().
housing.by.year$statelower <- tolower(housing.by.year$State) # tolower
# The State column contains "NorthDakota" (no space), which would not match
# the "north dakota" id used by fifty_states, so normalise it here.
housing.by.year$statelower <- sub("^northdakota$", "north dakota",
                                  housing.by.year$statelower)

# Draw a US choropleth for one yearly column of the state-level data.
#
# data: data frame with a `statelower` column and one `Year.<year>` column
#       per year (as in housing.by.year).
# year: the year to map, e.g. 2003; selects column "Year.2003" and is used
#       in the plot title.
# Returns the ggplot object. The legend deliberately has no title.
state_year_map <- function(data, year) {
  year_col <- paste0("Year.", year)
  stopifnot(year_col %in% names(data), "statelower" %in% names(data))
  # A small plotting frame lets aes() refer to plain column names.
  map_values <- data.frame(
    statelower = data$statelower,
    value = data[[year_col]],
    stringsAsFactors = FALSE
  )
  ggplot(map_values, aes(map_id = statelower)) +
    geom_map(aes(fill = value), color = "#ffffff", map = fifty_states) +
    expand_limits(x = fifty_states$long, y = fifty_states$lat) +
    coord_map() +
    scale_x_continuous(breaks = NULL) +
    scale_y_continuous(breaks = NULL) +
    labs(x = "", y = "",
         title = paste("US states median housing price in", year),
         subtitle = "Source: zillowstatic.com") +
    guides(fill = guide_legend(title = NULL)) +
    theme(panel.background = element_blank(),
          axis.line.x = element_line(size = .5, colour = "black"),
          axis.title = element_text(size = 14),
          legend.position = "bottom",
          legend.direction = "horizontal",
          legend.box = "vertical",
          legend.key.size = unit(0.7, "cm"),
          text = element_text(family = "OfficinaSanITC-Book"),
          plot.title = element_text(family = "OfficinaSanITC-Book"))
}

# Map 2003
m2003 <- state_year_map(housing.by.year, 2003)
m2003

# Map 2004
m2004 <- state_year_map(housing.by.year, 2004)
m2004

# Map 2005
m2005 <- state_year_map(housing.by.year, 2005)
m2005

# Map 2006
m2006 <- state_year_map(housing.by.year, 2006)
m2006

# Map 2007
m2007 <- state_year_map(housing.by.year, 2007)
m2007

# Map 2008
m2008 <- state_year_map(housing.by.year, 2008)
m2008

# Map 2009
m2009 <- state_year_map(housing.by.year, 2009)
m2009

# Map 2010
m2010 <- state_year_map(housing.by.year, 2010)
m2010

# Map 2011
m2011 <- state_year_map(housing.by.year, 2011)
m2011

# So, what can we see? The distribution varies by year, and each map uses a different scale.
# One of the reasons may be the financial crisis in the US in 2008.

# Question 4. 
# . Develop time series plots for the following Arkansas metro areas:
#  o    Hot Springs, Little Rock, Fayetteville, Searcy
# o Present all values from 1997 to 2013
# o Average at the metro area level

# I cleaned data in Excel. I have data for four cities and separately mean of each city.
# Monthly values for the four Arkansas metro areas (prepared in Excel).
arkansas <- read.csv("Arkansas metro1.csv", header = TRUE)
str(arkansas)
## 'data.frame':    204 obs. of  5 variables:
##  $ Month       : Factor w/ 204 levels "1-Apr","1-Aug",..: 174 170 186 158 190 182 178 162 202 198 ...
##  $ Hot.Springs : int  70400 70150 70500 71450 72550 73350 74100 74700 75250 75550 ...
##  $ Little.Rock : num  94708 95383 96108 96483 96700 ...
##  $ Fayetteville: num  100133 100300 100467 100367 100533 ...
##  $ Searcy      : int  78600 78800 78800 78400 78100 78200 78600 79000 79100 79100 ...
glimpse(arkansas)
## Observations: 204
## Variables: 5
## $ Month        <fctr> Jan-97, Feb-97, Mar-97, Apr-97, May-97, Jun-97, ...
## $ Hot.Springs  <int> 70400, 70150, 70500, 71450, 72550, 73350, 74100, ...
## $ Little.Rock  <dbl> 94708.33, 95383.33, 96108.33, 96483.33, 96700.00,...
## $ Fayetteville <dbl> 100133.3, 100300.0, 100466.7, 100366.7, 100533.3,...
## $ Searcy       <int> 78600, 78800, 78800, 78400, 78100, 78200, 78600, ...
colnames(arkansas) # "Month" "Hot.Springs"  "Little.Rock"  "Fayetteville" "Searcy"   
## [1] "Month"        "Hot.Springs"  "Little.Rock"  "Fayetteville"
## [5] "Searcy"

# Month is a label whose levels are sorted alphabetically (see the str()
# output above), so a discrete x axis would not run in time order. Re-level
# it in row order so the line plots below are drawn chronologically.
# Assumes the CSV rows are already in chronological order, as the glimpse()
# output suggests - TODO confirm against the file.
arkansas$Month <- factor(arkansas$Month, levels = unique(arkansas$Month))
# Trend for Arkansas cities
# Hot Springs
# Line plot of one metro-area column of the Arkansas data against Month.
#
# data:        data frame with a `Month` column and one column per metro area.
# metro_col:   name of the column to plot, e.g. "Hot.Springs".
# metro_label: metro name as it should appear in the title.
# line_colour: colour of the line.
# Returns the ggplot object. The x axis text and ticks are hidden because
# there is one label per month.
metro_trend_plot <- function(data, metro_col, metro_label, line_colour) {
  stopifnot(metro_col %in% names(data), "Month" %in% names(data))
  # A small plotting frame lets aes() refer to plain column names.
  trend <- data.frame(Month = data$Month, Price = data[[metro_col]])
  ggplot(trend, aes(Month, Price, group = 1)) +
    geom_line(color = line_colour) +
    labs(title = paste0("Median housing price in ", metro_label,
                        ", AR (1997-2013)"),
         subtitle = "Source: zillowstatic.com",
         x = "Year",
         y = "Median housing price") +
    # theme_fivethirtyeight() is a complete theme, so every tweak must come
    # after it to take effect.
    theme_fivethirtyeight() +
    theme(axis.line.x = element_line(size = .5, colour = "black"),
          axis.title = element_text(size = 14),
          legend.position = "right",
          legend.direction = "vertical",
          legend.box = "vertical",
          legend.key.size = unit(0.7, "cm"),
          legend.text = element_text(size = 10),
          text = element_text(family = "OfficinaSanITC-Book"),
          plot.title = element_text(family = "OfficinaSanITC-Book"),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank())
}

hot.springs <- metro_trend_plot(arkansas, "Hot.Springs", "Hot Springs", "red")
hot.springs

# Little Rock
little.rock <- metro_trend_plot(arkansas, "Little.Rock", "Little Rock", "blue")
little.rock

# Searcy
searcy <- metro_trend_plot(arkansas, "Searcy", "Searcy", "green")
searcy

# Fayetteville
fayetteville <- metro_trend_plot(arkansas, "Fayetteville", "Fayetteville",
                                 "orange")
fayetteville

# Mean for each city 
# Average value per metro area over the whole period.
mean(arkansas[["Hot.Springs"]]) # 107042.4
## [1] 107042.4
mean(arkansas[["Little.Rock"]]) # 134420.6
## [1] 134420.6
mean(arkansas[["Fayetteville"]]) # 144649.7
## [1] 144649.7
mean(arkansas[["Searcy"]]) # 100102.9
## [1] 100102.9
# Question 5.
# . Using data from Zillow and other sources (think Bureau of Labor Statistics and Census data):
#  o    Develop model for forecasting average median housing value for 2014 
# o Use the average of 2014-01, 2014-02, 2014-03 as your test set
# o Consolidate monthly data into an annual average

# I created in Excel means for each month for forecasting average median housing value for 2014.
# I suppose that means of three months are not enough to make precise forecasting,
# so, I use the whole 2014 year (10 months)
forecast.mean <- read.csv("forecast.mean.csv", header = TRUE)
colnames(forecast.mean)
## [1] "ï..Month"               "Mean.of.median.housing"
# The first column name looks like a byte-order-mark artefact and how it is
# rendered depends on the locale, so the month column is taken by position
# rather than by that name. forecast.mean itself is left untouched.
# Month levels follow row order so the x axis is not sorted alphabetically
# (assumes the rows are already in chronological order - TODO confirm).
forecast.mean.trend <- data.frame(
  Month = factor(forecast.mean[[1]], levels = unique(forecast.mean[[1]])),
  Mean.of.median.housing = forecast.mean$Mean.of.median.housing
)

# Plot the trend: the monthly line plus a linear fit (geom_smooth, "lm").
# Columns are referenced by bare name inside aes().
forecast.mean.plot <- ggplot(forecast.mean.trend,
                             aes(x = Month,
                                 y = Mean.of.median.housing,
                                 group = 1,
                                 color = Mean.of.median.housing)) +
  geom_line() +
  geom_smooth(method = "lm") +
  labs(title = "Trend in means of median housing prices in 2014",
       subtitle = "Source: zillowstatic.com",
       x = "Month",
       y = "Median housing price") +
  # theme_fivethirtyeight() is a complete theme, so tweaks come after it.
  theme_fivethirtyeight() +
  theme(axis.line.x = element_line(size = .5, colour = "black"),
        axis.title = element_text(size = 14),
        legend.direction = "vertical",
        legend.box = "vertical",
        legend.key.size = unit(0.7, "cm"),
        legend.text = element_text(size = 10),
        text = element_text(family = "OfficinaSanITC-Book"),
        plot.title = element_text(family = "OfficinaSanITC-Book"),
        legend.position = "none")
forecast.mean.plot

# What can we see here? Blue line is a general positive trend, it is increasing. 
# Another line shows us changes in average price per month. Fluctuation is within $10,000.

# Besides I want to use all months for more precise estimation. 
forecast.all.data <- read.csv("forecast.all.data.csv", header = TRUE)
colnames(forecast.all.data) #"Mean.of.median.housing" "Month"   
## [1] "Mean.of.median.housing" "Month"
# Plotting copy whose Month levels follow row order, so the discrete x axis
# is not sorted alphabetically (assumes the rows are already in
# chronological order - TODO confirm). forecast.all.data is left untouched.
forecast.all.trend <- forecast.all.data
forecast.all.trend$Month <- factor(forecast.all.trend$Month,
                                   levels = unique(forecast.all.trend$Month))

# Plot the trend: the monthly line plus a linear fit (geom_smooth, "lm").
# Columns are referenced by bare name inside aes().
forecast.all.data.plot <- ggplot(forecast.all.trend,
                                 aes(x = Month,
                                     y = Mean.of.median.housing,
                                     group = 1,
                                     color = Mean.of.median.housing)) +
  geom_line() +
  geom_smooth(method = "lm") +
  labs(title = "Trend in the US (1996-2014)",
       subtitle = "Source: zillowstatic.com",
       x = "Month",
       y = "Median housing price") +
  # theme_fivethirtyeight() is a complete theme, so tweaks come after it.
  theme_fivethirtyeight() +
  theme(axis.line.x = element_line(size = .5, colour = "black"),
        axis.title = element_text(size = 14),
        legend.direction = "vertical",
        legend.box = "vertical",
        legend.key.size = unit(0.7, "cm"),
        legend.text = element_text(size = 10),
        text = element_text(family = "OfficinaSanITC-Book"),
        plot.title = element_text(family = "OfficinaSanITC-Book"),
        legend.position = "none",
        # One label per month would be unreadable, so hide the x axis text.
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
forecast.all.data.plot

# I deleted names of months and years. But this is actually much cooler.
# We vividly see the trend. As a result, we can predict the approximate price 
# based on this trend. 

# Trend Decomposition
# We next decompose monthly ARIMA into its underlying trends. 
# These include the observed monthly ARIMA values, a generalized trend, 
# a seasonal trend, and the remainder (noise) which is unexplained by either 
# the general or seasonal trends.
# Build the series from the housing values (not the Month labels) and use
# frequency = 12 because the observations are monthly, so that stl() looks
# for a yearly seasonal cycle.
monthly.ARIMA <- ts(na.omit(forecast.all.data$Mean.of.median.housing),
                    frequency = 12)
# STL decomposition into seasonal, trend and remainder components.
decomp <- stl(monthly.ARIMA, s.window = "periodic")
# Seasonally adjusted series (seasonal component removed).
deseasonal_cnt <- seasadj(decomp)
plot(decomp, main = 'Trends')

# We can see interesting tendencies. 
# The generalized trend serves to further smooth monthly ARIMA. 
# Again, our previous seasonal trend is confirmed with this model. 
# Seasonal trend has the similar pattern with the general trend. 


# Question 6. 
# . Answer the following questions:
# o What three zip codes provide the best investment opportunity for the SREIT?
# o Why?

# Analysis of Syracuse data
# For this I created a subset in Excel.
# Monthly values per Syracuse zip code (subset prepared in Excel).
syracuse <- read.csv("Syracuse.real.estate.csv", header = TRUE)
colnames(syracuse)
## [1] "Month"    "Zip13210" "Zip13208" "Zip13204" "Zip13205" "Zip13206"
## [7] "Zip13203" "Zip13207" "Zip13224"
# Here we have 8 zipcodes and I will show the trend for each of them. 

# Re-level Month in row order so the discrete x axis of the line plots below
# is not sorted alphabetically by label.
# Assumes the CSV rows are already in chronological order - TODO confirm.
syracuse$Month <- factor(syracuse$Month, levels = unique(syracuse$Month))

# Zip 13210 (I lived with this zip two years on Westcott area :))
# Line plot of one zip-code column of the Syracuse data against Month.
#
# data: data frame with a `Month` column and one `Zip<code>` column per
#       zip code (as in syracuse).
# zip:  zip code to plot, e.g. 13210; selects column "Zip13210" and is used
#       in the plot title.
# Returns the ggplot object. The x axis text and ticks are hidden because
# there is one label per month.
zip_trend_plot <- function(data, zip) {
  zip_col <- paste0("Zip", zip)
  stopifnot(zip_col %in% names(data), "Month" %in% names(data))
  # A small plotting frame lets aes() refer to plain column names.
  trend <- data.frame(Month = data$Month, Price = data[[zip_col]])
  ggplot(trend, aes(Month, Price, group = 1)) +
    geom_line(color = "darkorange") +
    labs(title = paste0("Zip ", zip, " trend (1996-2014) in Syracuse, NY"),
         subtitle = "Source: zillowstatic.com",
         x = "Year",
         y = "Median housing price") +
    # theme_fivethirtyeight() is a complete theme, so every tweak must come
    # after it to take effect.
    theme_fivethirtyeight() +
    theme(axis.line.x = element_line(size = .5, colour = "black"),
          axis.title = element_text(size = 14),
          legend.position = "right",
          legend.direction = "vertical",
          legend.box = "vertical",
          legend.key.size = unit(0.7, "cm"),
          legend.text = element_text(size = 10),
          text = element_text(family = "OfficinaSanITC-Book"),
          plot.title = element_text(family = "OfficinaSanITC-Book"),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank())
}

zip13210 <- zip_trend_plot(syracuse, 13210)
zip13210 # We can see that current price is about $70,000. 

# Zip 13208
zip13208 <- zip_trend_plot(syracuse, 13208)
zip13208 # We can see that the current price is about $55,000.

# Zip 13204
zip13204 <- zip_trend_plot(syracuse, 13204)
zip13204 # We can see that the current price is about $45,000.

# Zip 13205
zip13205 <- zip_trend_plot(syracuse, 13205)
zip13205 # We can see that the current price is about $50,000.

# Zip 13206
zip13206 <- zip_trend_plot(syracuse, 13206)
zip13206 # We can see that the current price is about $60,000+. 

# Zip 13203
zip13203 <- zip_trend_plot(syracuse, 13203)
zip13203 # We can see that the current price is about $55,000.

# Zip 13207
zip13207 <- ggplot(syracuse, aes(Month, Zip13207, group = 1)) + geom_line(color = "darkorange") +
  labs(title = "Zip 13207 trend (1996-2014) in Syracuse, NY",
       subtitle = "Source: zillowstatic.com") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(x = "Year", 
       y = "Median housing price") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  theme(axis.line.x = element_line(size = .5, colour = "black"),
        axis.title = element_text(size = 14),
        legend.position = "right",
        legend.direction = "vertical",
        legend.box = "vertical",
        legend.key.size = unit(0.7, "cm"),
        legend.text = element_text(size = 10),
        text = element_text(family = "OfficinaSanITC-Book"),
        plot.title = element_text(family = "OfficinaSanITC-Book")) +
  theme(
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank()) 
zip13207 # We can see that the current price is about $55,000.

# Zip 13224
zip13224 <- ggplot(syracuse, aes(Month, Zip13224, group = 1)) + geom_line(color = "darkorange") +
  labs(title = "Zip 13224 trend (1996-2014) in Syracuse, NY",
       subtitle = "Source: zillowstatic.com") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(x = "Year", 
       y = "Median housing price") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  theme(axis.line.x = element_line(size = .5, colour = "black"),
        axis.title = element_text(size = 14),
        legend.position = "right",
        legend.direction = "vertical",
        legend.box = "vertical",
        legend.key.size = unit(0.7, "cm"),
        legend.text = element_text(size = 10),
        text = element_text(family = "OfficinaSanITC-Book"),
        plot.title = element_text(family = "OfficinaSanITC-Book")) +
  theme(
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank()) 
zip13224 # We can see that the current price is about $85,000.

# Well, I firmly believe that investing based on charts is not a good idea, but...
# 13210 (especially Westcott area), 13224, and 13206 are my three choices. 


# Additional sources for mapping data in US states
# http://eriqande.github.io/rep-res-web/lectures/making-maps-with-R.html
# http://rpubs.com/jfbratt/basic-mapping
# https://rpubs.com/alyssafahringer/165330
# https://github.com/wmurphyrd/fiftystater
# http://api.rpubs.com/jbrnbrg/project2_607
# http://eriqande.github.io/rep-res-web/lectures/making-maps-with-R.html#maps-package-and-ggplot
# https://rpubs.com/jfbratt/basic-mapping
# https://uchicagoconsulting.wordpress.com/
# http://www.kevjohnson.org/making-maps-in-r/
# https://stackoverflow.com/questions/29614972/ggplot-us-state-map-colors-are-fine-polygons-jagged-r
# http://adamolson.org/2015/07/15/post_about_maps/
# http://rforpublichealth.blogspot.com/2015/10/mapping-with-ggplot-create-nice.html
# http://stat405.had.co.nz/ggmap.pdf