library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Directory holding the Zillow CSV exports. Set the ZILLOW_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# location. Paths are built with file.path() rather than setwd(), so the
# session's working directory is left untouched.
data_dir <- Sys.getenv(
  "ZILLOW_DATA_DIR",
  unset = "C:/Users/StarKid/Desktop/Data_Science/Data_101/final_project/zillow_data"
)

# Read one CSV file from `data_dir` into a base data.frame.
read_zillow <- function(file_name) {
  read.csv(file.path(data_dir, file_name))
}

cities_crosswalk <- read_zillow("cities_crosswalk.csv")
city_time_series <- read_zillow("City_time_series.csv")
county_time_series <- read_zillow("County_time_series.csv")
countycrosswalk_zillow <- read_zillow("CountyCrossWalk_Zillow.csv")
datadictionary <- read_zillow("DataDictionary.csv")
metro_time_series <- read_zillow("Metro_time_series.csv")
neighborhood_time_series <- read_zillow("Neighborhood_time_series.csv")
state_time_series <- read_zillow("State_time_series.csv")
zip_time_series <- read_zillow("Zip_time_series.csv")
#dim(cities_crosswalk)
#str(cities_crosswalk)
#dim(city_time_series)
#str(city_time_series)
#glimpse(city_time_series)
#head(city_time_series)
#tail(city_time_series)
#city_time_series$RegionName
# Size of the state-level table as (rows, columns).
c(nrow(state_time_series), ncol(state_time_series))
## [1] 13212 82
#str(state_time_series)
# One line per column: name, type, and the first few values.
dplyr::glimpse(state_time_series)
## Rows: 13,212
## Columns: 82
## $ Date <chr> "1996-04…
## $ RegionName <chr> "Alabama…
## $ DaysOnZillow_AllHomes <dbl> NA, NA, …
## $ InventorySeasonallyAdjusted_AllHomes <int> NA, NA, …
## $ InventoryRaw_AllHomes <int> NA, NA, …
## $ MedianListingPricePerSqft_1Bedroom <dbl> NA, NA, …
## $ MedianListingPricePerSqft_2Bedroom <dbl> NA, NA, …
## $ MedianListingPricePerSqft_3Bedroom <dbl> NA, NA, …
## $ MedianListingPricePerSqft_4Bedroom <dbl> NA, NA, …
## $ MedianListingPricePerSqft_5BedroomOrMore <dbl> NA, NA, …
## $ MedianListingPricePerSqft_AllHomes <dbl> NA, NA, …
## $ MedianListingPricePerSqft_CondoCoop <dbl> NA, NA, …
## $ MedianListingPricePerSqft_DuplexTriplex <dbl> NA, NA, …
## $ MedianListingPricePerSqft_SingleFamilyResidence <dbl> NA, NA, …
## $ MedianListingPrice_1Bedroom <dbl> NA, NA, …
## $ MedianListingPrice_2Bedroom <dbl> NA, NA, …
## $ MedianListingPrice_3Bedroom <dbl> NA, NA, …
## $ MedianListingPrice_4Bedroom <dbl> NA, NA, …
## $ MedianListingPrice_5BedroomOrMore <dbl> NA, NA, …
## $ MedianListingPrice_AllHomes <dbl> NA, NA, …
## $ MedianListingPrice_CondoCoop <dbl> NA, NA, …
## $ MedianListingPrice_DuplexTriplex <dbl> NA, NA, …
## $ MedianListingPrice_SingleFamilyResidence <dbl> NA, NA, …
## $ MedianPctOfPriceReduction_AllHomes <dbl> NA, NA, …
## $ MedianPctOfPriceReduction_CondoCoop <dbl> NA, NA, …
## $ MedianPctOfPriceReduction_SingleFamilyResidence <dbl> NA, NA, …
## $ MedianPriceCutDollar_AllHomes <dbl> NA, NA, …
## $ MedianPriceCutDollar_CondoCoop <dbl> NA, NA, …
## $ MedianPriceCutDollar_SingleFamilyResidence <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_1Bedroom <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_2Bedroom <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_3Bedroom <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_4Bedroom <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_5BedroomOrMore <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_AllHomes <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_CondoCoop <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_DuplexTriplex <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_MultiFamilyResidence5PlusUnits <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_SingleFamilyResidence <dbl> NA, NA, …
## $ MedianRentalPricePerSqft_Studio <dbl> NA, NA, …
## $ MedianRentalPrice_1Bedroom <dbl> NA, NA, …
## $ MedianRentalPrice_2Bedroom <dbl> NA, NA, …
## $ MedianRentalPrice_3Bedroom <dbl> NA, NA, …
## $ MedianRentalPrice_4Bedroom <dbl> NA, NA, …
## $ MedianRentalPrice_5BedroomOrMore <dbl> NA, NA, …
## $ MedianRentalPrice_AllHomes <dbl> NA, NA, …
## $ MedianRentalPrice_CondoCoop <dbl> NA, NA, …
## $ MedianRentalPrice_DuplexTriplex <dbl> NA, NA, …
## $ MedianRentalPrice_MultiFamilyResidence5PlusUnits <dbl> NA, NA, …
## $ MedianRentalPrice_SingleFamilyResidence <dbl> NA, NA, …
## $ MedianRentalPrice_Studio <dbl> NA, NA, …
## $ ZHVIPerSqft_AllHomes <int> 50, 62, …
## $ PctOfHomesDecreasingInValues_AllHomes <dbl> NA, NA, …
## $ PctOfHomesIncreasingInValues_AllHomes <dbl> NA, NA, …
## $ PctOfHomesSellingForGain_AllHomes <dbl> NA, NA, …
## $ PctOfHomesSellingForLoss_AllHomes <dbl> NA, NA, …
## $ PctOfListingsWithPriceReductionsSeasAdj_AllHomes <dbl> NA, NA, …
## $ PctOfListingsWithPriceReductionsSeasAdj_CondoCoop <dbl> NA, NA, …
## $ PctOfListingsWithPriceReductionsSeasAdj_SingleFamilyResidence <dbl> NA, NA, …
## $ PctOfListingsWithPriceReductions_AllHomes <dbl> NA, NA, …
## $ PctOfListingsWithPriceReductions_CondoCoop <dbl> NA, NA, …
## $ PctOfListingsWithPriceReductions_SingleFamilyResidence <dbl> NA, NA, …
## $ PriceToRentRatio_AllHomes <dbl> NA, NA, …
## $ Sale_Counts <dbl> NA, NA, …
## $ Sale_Counts_Seas_Adj <dbl> NA, NA, …
## $ Sale_Prices <dbl> NA, NA, …
## $ ZHVI_1bedroom <int> 61500, 5…
## $ ZHVI_2bedroom <int> 48900, 8…
## $ ZHVI_3bedroom <int> 78200, 9…
## $ ZHVI_4bedroom <int> 146500, …
## $ ZHVI_5BedroomOrMore <int> 206300, …
## $ ZHVI_AllHomes <int> 79500, 1…
## $ ZHVI_BottomTier <int> 45600, 6…
## $ ZHVI_CondoCoop <int> 99500, 7…
## $ ZHVI_MiddleTier <int> 79500, 1…
## $ ZHVI_SingleFamilyResidence <int> 79000, 1…
## $ ZHVI_TopTier <int> 140200, …
## $ ZRI_AllHomes <int> NA, NA, …
## $ ZRI_AllHomesPlusMultifamily <int> NA, NA, …
## $ ZriPerSqft_AllHomes <dbl> NA, NA, …
## $ Zri_MultiFamilyResidenceRental <int> NA, NA, …
## $ Zri_SingleFamilyResidenceRental <int> NA, NA, …
# Base-R structure dump of the same table (class, dimensions, column types).
utils::str(state_time_series)
## 'data.frame': 13212 obs. of 82 variables:
## $ Date : chr "1996-04-30" "1996-04-30" "1996-04-30" "1996-04-30" ...
## $ RegionName : chr "Alabama" "Arizona" "Arkansas" "California" ...
## $ DaysOnZillow_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ InventorySeasonallyAdjusted_AllHomes : int NA NA NA NA NA NA NA NA NA NA ...
## $ InventoryRaw_AllHomes : int NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_1Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_2Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_3Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_4Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_5BedroomOrMore : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_DuplexTriplex : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPricePerSqft_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_1Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_2Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_3Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_4Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_5BedroomOrMore : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_DuplexTriplex : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianListingPrice_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianPctOfPriceReduction_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianPctOfPriceReduction_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianPctOfPriceReduction_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianPriceCutDollar_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianPriceCutDollar_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianPriceCutDollar_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_1Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_2Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_3Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_4Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_5BedroomOrMore : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_DuplexTriplex : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_MultiFamilyResidence5PlusUnits : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPricePerSqft_Studio : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_1Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_2Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_3Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_4Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_5BedroomOrMore : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_DuplexTriplex : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_MultiFamilyResidence5PlusUnits : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_Studio : num NA NA NA NA NA NA NA NA NA NA ...
## $ ZHVIPerSqft_AllHomes : int 50 62 42 102 82 85 71 56 55 185 ...
## $ PctOfHomesDecreasingInValues_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfHomesIncreasingInValues_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfHomesSellingForGain_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfHomesSellingForLoss_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfListingsWithPriceReductionsSeasAdj_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfListingsWithPriceReductionsSeasAdj_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfListingsWithPriceReductionsSeasAdj_SingleFamilyResidence: num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfListingsWithPriceReductions_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfListingsWithPriceReductions_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ PctOfListingsWithPriceReductions_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ PriceToRentRatio_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sale_Counts : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sale_Counts_Seas_Adj : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sale_Prices : num NA NA NA NA NA NA NA NA NA NA ...
## $ ZHVI_1bedroom : int 61500 59200 53000 93700 77800 64700 90100 45400 74900 152300 ...
## $ ZHVI_2bedroom : int 48900 86400 54500 123400 97500 97000 88200 65400 64700 186600 ...
## $ ZHVI_3bedroom : int 78200 96100 76800 150900 129000 130400 103500 89100 88000 231800 ...
## $ ZHVI_4bedroom : int 146500 128400 135100 196100 176100 194800 157800 133600 149700 303400 ...
## $ ZHVI_5BedroomOrMore : int 206300 190500 186000 265300 212900 299800 176100 199900 212800 345500 ...
## $ ZHVI_AllHomes : int 79500 103600 64400 157900 128100 132000 106800 86300 92000 227400 ...
## $ ZHVI_BottomTier : int 45600 67100 38400 95100 82700 83700 77200 52500 57200 144500 ...
## $ ZHVI_CondoCoop : int 99500 78900 70300 136100 99400 85000 NA 70600 89300 177000 ...
## $ ZHVI_MiddleTier : int 79500 103600 64400 157900 128100 132000 106800 86300 92000 227400 ...
## $ ZHVI_SingleFamilyResidence : int 79000 107500 64500 162000 133600 141000 107400 92100 92400 262600 ...
## $ ZHVI_TopTier : int 140200 168700 115200 270600 209300 231600 161600 155300 163900 374700 ...
## $ ZRI_AllHomes : int NA NA NA NA NA NA NA NA NA NA ...
## $ ZRI_AllHomesPlusMultifamily : int NA NA NA NA NA NA NA NA NA NA ...
## $ ZriPerSqft_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
## $ Zri_MultiFamilyResidenceRental : int NA NA NA NA NA NA NA NA NA NA ...
## $ Zri_SingleFamilyResidenceRental : int NA NA NA NA NA NA NA NA NA NA ...
#summary(state_time_series)
# Quick distribution checks on the state-level table.
# Box plot of the 1-bedroom median listing price per sqft; outline = FALSE
# suppresses the outlier points.
boxplot(state_time_series$MedianListingPricePerSqft_1Bedroom, outline = FALSE)
# Histogram of sale prices, asking for roughly 100 breaks.
hist(state_time_series$Sale_Prices, col = "green", breaks= 100)
# Index plots: each value is drawn against its row position in the table,
# so states and dates are interleaved along the x axis.
plot(state_time_series$PctOfHomesSellingForLoss_AllHomes)
plot(state_time_series$PctOfHomesIncreasingInValues_AllHomes)
#colSums(is.na(state_time_series))
# Extremes of the median listing price across all states and months.
# na.rm = TRUE is spelled out because T can be reassigned.
max(state_time_series$MedianListingPrice_AllHomes, na.rm = TRUE)
## [1] 610000
min(state_time_series$MedianListingPrice_AllHomes, na.rm = TRUE)
## [1] 112944
# Extremes of the median rental price (a single max() is enough).
max(state_time_series$MedianRentalPrice_AllHomes, na.rm = TRUE)
## [1] 3600
min(state_time_series$MedianRentalPrice_AllHomes, na.rm = TRUE)
## [1] 750
#head(state_time_series, 10)
#tail(state_time_series, 10)
#colSums(is.na(state_time_series))
# How many missing values does ZHVI_AllHomes have? There are 774 (13,212 rows in total, 12,438 with a value).
#sum(is.na(state_time_series$ZHVI_AllHomes))
#### Let's view the missing values and see if we need to clean them.
# which(is.na(state_time_series$ZHVI_AllHomes))
# view(state_time_series$ZHVI_AllHomes)
## When you're looking at the data set, you will notice that some of this data is missing.
### Find missing data from the DMV
# source: https://levelup.gitconnected.com/structural-vector-autoregression-in-r-5d6dbfc56499
# Make a new variable to see the time series
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(TSstudio)
library(tibble)
# Keep the date, state, and all-homes ZHVI; discard rows with no ZHVI reading;
# parse the date strings into Date objects.
zillow_state <- state_time_series %>%
  drop_na(ZHVI_AllHomes) %>%
  select(Date, RegionName, ZHVI_AllHomes) %>%
  mutate(Date = as.Date(Date, format = "%Y-%m-%d"))
# Print a compact tibble preview; the result is not assigned, so zillow_state
# itself is unchanged.
zillow_state %>% as_tibble()
## # A tibble: 12,438 × 3
## Date RegionName ZHVI_AllHomes
## <date> <chr> <int>
## 1 1996-04-30 Alabama 79500
## 2 1996-04-30 Arizona 103600
## 3 1996-04-30 Arkansas 64400
## 4 1996-04-30 California 157900
## 5 1996-04-30 Colorado 128100
## 6 1996-04-30 Connecticut 132000
## 7 1996-04-30 Delaware 106800
## 8 1996-04-30 Florida 86300
## 9 1996-04-30 Georgia 92000
## 10 1996-04-30 Hawaii 227400
## # ℹ 12,428 more rows
#x= state_zillow$ZHVI_AllHomes
#y= state_zillow$Date
# Check whether it is a data frame.
#view(zillow_state)
# Confirm the container type of the cleaned table.
zillow_state %>% class()
## [1] "data.frame"
# Every region name that still has at least one ZHVI observation.
zillow_state %>%
  pull(RegionName) %>%
  unique()
## [1] "Alabama" "Arizona" "Arkansas"
## [4] "California" "Colorado" "Connecticut"
## [7] "Delaware" "Florida" "Georgia"
## [10] "Hawaii" "Idaho" "Illinois"
## [13] "Indiana" "Iowa" "Kentucky"
## [16] "Maine" "Maryland" "Massachusetts"
## [19] "Michigan" "Minnesota" "Mississippi"
## [22] "Missouri" "Nebraska" "Nevada"
## [25] "NewHampshire" "NewJersey" "NewMexico"
## [28] "NorthCarolina" "Ohio" "Oklahoma"
## [31] "Oregon" "Pennsylvania" "RhodeIsland"
## [34] "SouthCarolina" "Tennessee" "Utah"
## [37] "Virginia" "Washington" "WestVirginia"
## [40] "SouthDakota" "Texas" "Wisconsin"
## [43] "Montana" "Wyoming" "Alaska"
## [46] "DistrictofColumbia" "Vermont" "NorthDakota"
## [49] "NewYork" "Kansas"
# ZHVI rows for the DMV region (Maryland, Virginia, District of Columbia).
dmv <- filter(
  zillow_state,
  RegionName %in% c("Maryland", "Virginia", "DistrictofColumbia")
)
# One data frame per jurisdiction, split out of the DMV subset.
md <- filter(dmv, RegionName == "Maryland")
va <- filter(dmv, RegionName == "Virginia")
dc <- filter(dmv, RegionName == "DistrictofColumbia")
# now convert it to time series object each variable
# Convert the ZHVI readings into monthly `ts` objects (frequency = 12 because
# one value is reported per month).

# First (year, month) of a Date vector, in the form ts(start = ) expects.
# ts() documents `start` as a single number or a (year, period) pair, so the
# day of the month is not part of it.
ts_start <- function(dates) {
  first_date <- min(dates)
  c(
    as.integer(format(first_date, "%Y")),
    as.integer(format(first_date, "%m"))
  )
}

# zillow_state holds one row per state per month. Passing that column to ts()
# directly would treat every state row as a separate month, so the readings
# are first averaged across states to give one value per month.
zhvi_monthly <- zillow_state %>%
  group_by(Date) %>%
  summarise(ZHVI_AllHomes = mean(ZHVI_AllHomes)) %>%
  arrange(Date)
zill_home_value_index <- ts(
  zhvi_monthly$ZHVI_AllHomes,
  start = ts_start(zhvi_monthly$Date),
  frequency = 12
)

# Per-jurisdiction series for Maryland, Virginia, and DC. Each start is taken
# from the series' own first observation rather than hard-coded (the original
# comments put DC's first observation in 1999, later than the other two).
md <- ts(md$ZHVI_AllHomes, start = ts_start(md$Date), frequency = 12)
va <- ts(va$ZHVI_AllHomes, start = ts_start(va$Date), frequency = 12)
dc <- ts(dc$ZHVI_AllHomes, start = ts_start(dc$Date), frequency = 12)
# Plot the variables
# Plot the all-states series with ts_plot() (presumably TSstudio's plotting
# helper, loaded above), using its default titles.
ts_plot(zill_home_value_index)
# One titled ts_plot per jurisdiction; the axis titles are shared.
ts_plot(md,
title = "Zillow Home Value Index Maryland",
Xtitle = "Years",
Ytitle = "Estimated Home Prices")
ts_plot(va,
title = "Zillow Home Value Index Virginia",
Xtitle = "Years",
Ytitle = "Estimated Home Prices")
ts_plot(dc,
title = "Zillow Home Value Index D.C",
Xtitle = "Years",
Ytitle = "Estimated Home Prices")
# The same three series drawn with plot() and default labels.
plot(md)
plot(va)
plot(dc)
# Median rental price per housing type for every state and month.
# The duplex/triplex column is the rental-price one, matching the other
# columns (the per-sqft variant is a different measure).
# No NA filtering is applied here, so rows with missing prices are kept.
zillow_state_rental <- state_time_series %>%
  select(
    Date,
    RegionName,
    MedianRentalPrice_1Bedroom,
    MedianRentalPrice_2Bedroom,
    MedianRentalPrice_3Bedroom,
    MedianRentalPrice_4Bedroom,
    MedianRentalPrice_5BedroomOrMore,
    MedianRentalPrice_CondoCoop,
    MedianRentalPrice_DuplexTriplex,
    MedianRentalPrice_MultiFamilyResidence5PlusUnits,
    MedianRentalPrice_SingleFamilyResidence,
    MedianRentalPrice_Studio,
    MedianRentalPrice_AllHomes
  )
str(zillow_state_rental)
## 'data.frame': 13212 obs. of 13 variables:
## $ Date : chr "1996-04-30" "1996-04-30" "1996-04-30" "1996-04-30" ...
## $ RegionName : chr "Alabama" "Arizona" "Arkansas" "California" ...
## $ MedianRentalPrice_1Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_2Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_3Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_4Bedroom : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_5BedroomOrMore : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_CondoCoop : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_DuplexTriplex : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_MultiFamilyResidence5PlusUnits: num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_SingleFamilyResidence : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_Studio : num NA NA NA NA NA NA NA NA NA NA ...
## $ MedianRentalPrice_AllHomes : num NA NA NA NA NA NA NA NA NA NA ...
# Zillow started collecting rental data in 2010.
#view(zillow_state_rental)
#label for Axis
# Axis labels, in the same order as the columns passed to boxplot() below.
label <- c(
  "1 Bed", "2 Bed", "3 Bed", "4 Bed", "5 Bed", "Condo", "Family", "Studio"
)
# Side-by-side box plots of median rental price by housing type across all
# states; outline = FALSE hides the outlier points.
with(
  zillow_state_rental,
  boxplot(
    MedianRentalPrice_1Bedroom,
    MedianRentalPrice_2Bedroom,
    MedianRentalPrice_3Bedroom,
    MedianRentalPrice_4Bedroom,
    MedianRentalPrice_5BedroomOrMore,
    MedianRentalPrice_CondoCoop,
    MedianRentalPrice_SingleFamilyResidence,
    MedianRentalPrice_Studio,
    outline = FALSE,
    names = label
  )
)
#zillow_state_rental
# Every region name in the raw table, in order of first appearance.
unique(state_time_series[["RegionName"]])
## [1] "Alabama" "Arizona" "Arkansas"
## [4] "California" "Colorado" "Connecticut"
## [7] "Delaware" "Florida" "Georgia"
## [10] "Hawaii" "Idaho" "Illinois"
## [13] "Indiana" "Iowa" "Kansas"
## [16] "Kentucky" "Louisiana" "Maine"
## [19] "Maryland" "Massachusetts" "Michigan"
## [22] "Minnesota" "Mississippi" "Missouri"
## [25] "Nebraska" "Nevada" "NewHampshire"
## [28] "NewJersey" "NewMexico" "NewYork"
## [31] "NorthCarolina" "Ohio" "Oklahoma"
## [34] "Oregon" "Pennsylvania" "RhodeIsland"
## [37] "SouthCarolina" "Tennessee" "Texas"
## [40] "Utah" "Virginia" "Washington"
## [43] "WestVirginia" "Wisconsin" "SouthDakota"
## [46] "Vermont" "Alaska" "Montana"
## [49] "Wyoming" "DistrictofColumbia" "NorthDakota"
## [52] "UnitedStates"
# Rental prices for the DMV jurisdictions plus the national series, keeping
# only the rows where every selected price column has a value.
dmv_rental <- state_time_series %>%
  filter(
    RegionName %in%
      c("Maryland", "Virginia", "DistrictofColumbia", "UnitedStates")
  ) %>%
  select(
    Date,
    RegionName,
    MedianRentalPrice_1Bedroom,
    MedianRentalPrice_2Bedroom,
    MedianRentalPrice_3Bedroom,
    MedianRentalPrice_4Bedroom,
    MedianRentalPrice_5BedroomOrMore,
    MedianRentalPrice_CondoCoop,
    MedianRentalPrice_SingleFamilyResidence,
    MedianRentalPrice_Studio
  ) %>%
  na.omit()
#view(dmv_rental)
# VA bedroom prices
# Virginia rows of the complete-case rental table.
va_rental <- filter(dmv_rental, RegionName == "Virginia")
#view(va_rental)
# Box plots of Virginia median rent by bedroom count (1 through 5+).
with(
  va_rental,
  boxplot(
    MedianRentalPrice_1Bedroom,
    MedianRentalPrice_2Bedroom,
    MedianRentalPrice_3Bedroom,
    MedianRentalPrice_4Bedroom,
    MedianRentalPrice_5BedroomOrMore
  )
)
# Maryland bedroom prices
# Maryland rows of the complete-case rental table.
md_rental <- filter(dmv_rental, RegionName == "Maryland")
# Box plots of Maryland median rent: bedroom counts first, then condo/co-op,
# single-family residence, and studio.
with(
  md_rental,
  boxplot(
    MedianRentalPrice_1Bedroom,
    MedianRentalPrice_2Bedroom,
    MedianRentalPrice_3Bedroom,
    MedianRentalPrice_4Bedroom,
    MedianRentalPrice_5BedroomOrMore,
    MedianRentalPrice_CondoCoop,
    MedianRentalPrice_SingleFamilyResidence,
    MedianRentalPrice_Studio
  )
)
#dc_rental <- dmv_rental %>%
# filter(RegionName %in% c("DistrictofColumbia"))
#view(dc_rental)
#no dc rental information
# DC rows taken from the unfiltered rental table (missing values included).
dc_zillow <- filter(zillow_state_rental, RegionName == "DistrictofColumbia")
#view(dc_zillow)
# Box plots of DC median rent by housing type.
# MedianRentalPrice_5BedroomOrMore is not plotted here.
with(
  dc_zillow,
  boxplot(
    MedianRentalPrice_1Bedroom,
    MedianRentalPrice_2Bedroom,
    MedianRentalPrice_3Bedroom,
    MedianRentalPrice_4Bedroom,
    MedianRentalPrice_CondoCoop,
    MedianRentalPrice_MultiFamilyResidence5PlusUnits,
    MedianRentalPrice_SingleFamilyResidence,
    MedianRentalPrice_Studio
  )
)
#hist(dc_zillow$ZHVI_AllHomes, breaks = 100)
#plot(dc_zillow$ZHVI_AllHomes)
#t.test(md_rental$MedianRentalPrice_1Bedroom, md_rental$MedianRentalPrice_Studio, conf.level = .95, alternative = "greater")
#t.test(md_rental$MedianRentalPrice_1Bedroom, md_rental$MedianRentalPrice_5BedroomOrMore, paired = TRUE)
#t.test(md_rental$MedianRentalPrice_1Bedroom, md_rental$MedianRentalPrice_SingleFamilyResidence, paired = TRUE)
#t.test(md_rental$MedianRentalPrice_CondoCoop, md_rental$MedianRentalPrice_Studio, paired = TRUE)
#t.test(x, y = NULL, alternative = c("two.sided", "less", "greater"), mu = 0,
# paired = FALSE, var.equal = FALSE, conf.level = 0.95)
#results <- t.test(MedianRentalPrice_1Bedroom ~ MedianRentalPrice_5BedroomOrMore, data = md_rental)
# Overall mean of the median rental price, for all homes and for 1-bedroom
# units, skipping missing values.
mean_rental_All <- with(
  state_time_series,
  mean(MedianRentalPrice_AllHomes, na.rm = TRUE)
)
mean_rental_1Bed <- with(
  state_time_series,
  mean(MedianRentalPrice_1Bedroom, na.rm = TRUE)
)
# Store the all-homes mean as a constant column (same value in every row).
state_time_series[["Means"]] <- mean_rental_All
#check new column
#glimpse(state_time_series)
#two sided test for difference of means
# check if there is a difference between median rental price in california and maryland
# Two-sided Welch t-test: does the mean of the median rental price differ
# between California and Maryland?
t.test(
  MedianRentalPrice_AllHomes ~ RegionName,
  data = filter(state_time_series, RegionName %in% c("Maryland", "California")),
  alternative = "two.sided"
)
##
## Welch Two Sample t-test
##
## data: MedianRentalPrice_AllHomes by RegionName
## t = 15.936, df = 101.15, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group California and group Maryland is not equal to 0
## 95 percent confidence interval:
## 435.9071 559.8594
## sample estimates:
## mean in group California mean in group Maryland
## 2147.404 1649.521
# Scatter plot of median rental price against sale price, with the
# least-squares line overlaid in red.
x <- state_time_series$Sale_Prices
y <- state_time_series$MedianRentalPrice_AllHomes
# plot() is given the vectors directly; it has no `data` argument, and passing
# one only produces "not a graphical parameter" warnings.
plot(
  x, y,
  main = "Median Rental Price vs Sales prices 1996-2017",
  xlab = "Sale Price of Homes",
  ylab = "Median Rental Price"
)
# Base graphics layers are added by separate calls, not with `+`.
abline(lm(y ~ x), col = "red")
# Simple linear regression of median rental price on sale price; rows with a
# missing value in either column are dropped by lm().
regression <- lm(
  formula = MedianRentalPrice_AllHomes ~ Sale_Prices,
  data = state_time_series
)
# Coefficients, residual summary, and fit statistics.
summary(regression)
##
## Call:
## lm(formula = MedianRentalPrice_AllHomes ~ Sale_Prices, data = state_time_series)
##
## Residuals:
## Min 1Q Median 3Q Max
## -677.25 -172.02 -46.48 93.42 1803.93
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.943e+02 1.513e+01 32.66 <2e-16 ***
## Sale_Prices 4.549e-03 7.043e-05 64.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 306.1 on 3049 degrees of freedom
## (10161 observations deleted due to missingness)
## Multiple R-squared: 0.5777, Adjusted R-squared: 0.5776
## F-statistic: 4171 on 1 and 3049 DF, p-value: < 2.2e-16
# Therefore, as the sale price of a house increases, the median rental price tends to increase as well.
# Pearson correlation between sale price and median rental price, computed on
# the rows where both values are present.
cor(state_time_series$Sale_Prices, state_time_series$MedianRentalPrice_AllHomes, use = "complete.obs")
## [1] 0.7600696
# One-sided test of whether that correlation is greater than zero.
cor.test(state_time_series$Sale_Prices, state_time_series$MedianRentalPrice_AllHomes, alternative = "greater")
##
## Pearson's product-moment correlation
##
## data: state_time_series$Sale_Prices and state_time_series$MedianRentalPrice_AllHomes
## t = 64.584, df = 3049, p-value < 2.2e-16
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
## 0.7472004 1.0000000
## sample estimates:
## cor
## 0.7600696
# This shows that rent tends to increase as the sale price increases.
# observed correlation coefficient of two vectors x and y