---
title: "Gainesville Housing Trends"
output:
  html_notebook: default
  pdf_document: default
---
## Trends in 3- and 4-bedroom houses sold in Gainesville, FL since 2010

The data are from the county property appraiser. The cleanup steps are listed here:
```{r}
library("ggplot2")
library("viridis")
library("viridisLite")

# Load the 3- and 4-bedroom sales exports from the Property Appraiser's
# website and tag every record with its bedroom count before stacking them.
data <- read.csv("~/Downloads/3bed.csv")
data$beds <- rep(3, times = nrow(data))
data2 <- read.csv("~/Downloads/4bed.csv")
data2$beds <- rep(4, times = nrow(data2))
data <- rbind(data, data2)

# Parse sale dates (month/day/year); unparseable values become NA
data$Sale_Date <- as.Date(data$Sale_Date, format = "%m/%d/%Y")

# Drop records with an NA in any field used below. Land_Value is included
# because it feeds `appraised`; an NA there would otherwise propagate into
# later logical subsets and create all-NA rows.
data <- data[complete.cases(
  data[, c("HtdSqFt", "Land_Value", "Bldg_Value", "Sale_Price", "Sale_Date")]
), ]

# Keep only homes with a positive heated area that sold for under $800,000
data <- data[data$HtdSqFt > 0 & data$Sale_Price < 800000, ]

# Calculate cost per heated square foot
data$costPerSqFt <- data$Sale_Price / data$HtdSqFt

# Add land and building value together
data$appraised <- data$Land_Value + data$Bldg_Value

# Filter out sales before 2010 (a sale dated 2010-01-01 is kept)
data <- data[data$Sale_Date >= as.Date("2010-01-01"), ]

# Filter out extremely low value sales that are likely to be
# intra-family transfers etc.
data <- data[data$Sale_Price > 50000, ]
```

```{r}
# Plot the distribution of log2(sale price / appraised value) to find outliers
ggplot(data = data, aes(x = log2(Sale_Price / appraised))) +
  geom_histogram(bins = 50) +
  xlab("log2 odds ratio of sale prise vs appraised price ")

# Keep only sales within a factor of two of the appraised value
# (|log2 ratio| < 1). which() is used so that rows whose ratio is NA or NaN
# are dropped; a bare logical index containing NA would instead insert
# all-NA rows into the result.
data <- data[which(abs(log2(data$Sale_Price / data$appraised)) < 1), ]
```

### Add data for house of interest
```{r}
# One-row data frame describing the house of interest, using the same column
# names as `data` so it can be overlaid on plots and passed to predict().
# NOTE(review): appraised (239600) differs from Land_Value + Bldg_Value
# (36000 + 193600 = 229600) -- confirm which figure is intended.
hoi <- data.frame(
  Sale_Date   = as.Date("2017-08-01"),
  appraised   = 239600,
  HtdSqFt     = 2629,
  costPerSqFt = 310000 / 2629,  # Sale_Price / HtdSqFt
  Sale_Price  = 310000,
  Land_Value  = 36000,
  Bldg_Value  = 193600
)
```
### Plot relationship between appraised value and sales price
```{r}
# Scatter of appraised value against sale price, coloured by log10 heated
# area, with a linear fit; the house of interest is overlaid in red.
ggplot(data, aes(x = Sale_Price, y = appraised)) +
  geom_point(aes(color = log10(HtdSqFt)), size = 0.5) +
  scale_color_viridis() +
  geom_smooth(method = "lm") +
  labs(x = "Sale price", y = "Appraised value") +
  geom_point(data = hoi, colour = "red")
```
### The distribution of sale prices for houses appraised within +/- $5000 of the house of interest
```{r}
# Half-width, in dollars, of the appraised-value window around the house
# of interest
del <- 5000
housevalue <- hoi$appraised

# Sales whose appraised value lies strictly inside the window. which() drops
# rows where `appraised` is NA instead of turning them into all-NA rows.
slice <- data[which(data$appraised > housevalue - del &
                      data$appraised < housevalue + del), ]

ggplot(data = slice, aes(x = Sale_Price)) +
  geom_histogram(bins = 20)
```


### Plot of the price per square foot over time. Data is color-coded by sales price.
```{r}
# Price per heated square foot against sale date, coloured by log10 sale
# price, with a loess trend; the house of interest is overlaid in red.
# Optional restriction to a sale-price band (currently disabled):
# datasubset <- data[(data$Sale_Price > 180000 & data$Sale_Price < 350000), ]

ggplot(data, aes(x = Sale_Date, y = costPerSqFt)) +
  geom_point(aes(color = log10(Sale_Price)), size = 0.5) +
  scale_color_viridis() +
  geom_smooth(method = "loess") +
  labs(x = "Year", y = "Dollars per Square Foot") +
  geom_point(data = hoi, colour = "red")


```

### A plot of the cost per square foot vs the sale price on a log scale. Points are color-coded by the size of the home. 
```{r}
# Price per heated square foot against sale price on a log10 x axis,
# coloured by log10 heated area, with a linear fit; the house of interest
# is overlaid in red.
ggplot(data = data, aes(x = Sale_Price, y = costPerSqFt)) +
  # Spell out `minor_breaks` rather than relying on partial matching of `minor`
  scale_x_log10(minor_breaks = c(1e5, 2e5, 3e5, 4e5, 6e5, 7e5)) +
  geom_point(aes(color = log10(HtdSqFt)), size = 0.5) +
  scale_color_viridis() +
  geom_smooth(method = "lm") +
  ylab("Dollars per Square Foot") +
  xlab("Sale price on a log scale") +
  geom_point(data = hoi, colour = "red")
```
### Plot of the relationship between sale price and square footage
```{r}

# Heated square footage against sale price, coloured by price per square
# foot, with a linear fit; the house of interest is overlaid in red.
ggplot(data, aes(x = Sale_Price, y = HtdSqFt)) +
  geom_point(aes(color = costPerSqFt), size = 0.5) +
  scale_color_viridis() +
  geom_smooth(method = "lm") +
  labs(x = "Sale Price", y = "Square Footage") +
  geom_point(data = hoi, colour = "red")
```

### Predict the sale price of the house of interest with a 90% prediction interval
```{r}
# Simple linear regression of sale price on total appraised value
mod0 <- lm(Sale_Price ~ appraised, data = data)
# Summarise the model that was just fitted (the original referenced an
# undefined `mod1`, which stops the chunk with an error)
summary(mod0)

# Predict the sale price at the appraised value of the house of interest,
# with standard errors and a 90% prediction interval
p.df <- data.frame(appraised = housevalue)
predict(mod0, p.df, se.fit = TRUE, level = 0.9, interval = "prediction")
```

### Fitting a different linear model of sale price
```{r}
# Model sale price from heated area, land value and building value.
# No `family` is given, so glm() uses its default (gaussian).
mod2 <- glm(Sale_Price ~ HtdSqFt + Land_Value + Bldg_Value, data = data)
summary(mod2)
BIC(mod2)

# Predict for the house of interest on the link scale, with standard errors.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
preds <- predict(mod2, hoi, se.fit = TRUE, type = "link")
preds


```





