Dataset:Zillow data on 2018 Median Rental Price/sqft by State by Sunny Mehta

load the required library to tidy and modify data for further analysis

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

load csv dataset by into R

url <- "https://raw.githubusercontent.com/Weicaidata/607/master/State_MedianRentalPricePerSqft_AllHomes.csv"
df1 <- read.csv(url,header=TRUE)
head(df1)
##     RegionName SizeRank  X2018.01  X2018.02  X2018.03  X2018.04  X2018.05
## 1   California        1 1.8674136 1.8846154 1.9001086 1.9235033 1.9478528
## 2        Texas        2 0.9056244 0.9134432 0.9237706 0.9344262 0.9387974
## 3     New York        3 4.2682584 4.2000000 3.9166667 4.1900000 4.3078412
## 4      Florida        4 1.3180646 1.3242778 1.3303769 1.3429752 1.3420245
## 5     Illinois        5 1.2173913 1.2288786 1.2594270 1.2901892 1.2925563
## 6 Pennsylvania        6 1.0256410 1.0351064 1.0568107 1.0817308 1.0890381
##   X2018.06  X2018.07  X2018.08  X2018.09  X2018.10  X2018.11  X2018.12
## 1 1.956392 1.9519520 1.9389239 1.9215785 1.9117647 1.9245725 1.9344438
## 2 0.940000 0.9373203 0.9320449 0.9276438 0.9252537 0.9281437 0.9318022
## 3 4.042553 3.8974345 3.4742328 3.2467532 3.1003876 3.1424668 3.5211268
## 4 1.333333 1.3353116 1.3493800 1.3536748 1.3489209 1.3542038 1.3584906
## 5 1.299667 1.2840895 1.2759585 1.2681159 1.2653354 1.2568210 1.2536585
## 6 1.080456 1.0574495 1.0512691 1.0416667 1.0341616 1.0380623 1.0416667

rename the columns for better understanding the data

colnames(df1) <- c("state","size","aJan","bFeb","cMar","dApr","eMay","fJun","gJul","hAug","iSep","jOct","kNov","lDec")
head(df1)
##          state size      aJan      bFeb      cMar      dApr      eMay
## 1   California    1 1.8674136 1.8846154 1.9001086 1.9235033 1.9478528
## 2        Texas    2 0.9056244 0.9134432 0.9237706 0.9344262 0.9387974
## 3     New York    3 4.2682584 4.2000000 3.9166667 4.1900000 4.3078412
## 4      Florida    4 1.3180646 1.3242778 1.3303769 1.3429752 1.3420245
## 5     Illinois    5 1.2173913 1.2288786 1.2594270 1.2901892 1.2925563
## 6 Pennsylvania    6 1.0256410 1.0351064 1.0568107 1.0817308 1.0890381
##       fJun      gJul      hAug      iSep      jOct      kNov      lDec
## 1 1.956392 1.9519520 1.9389239 1.9215785 1.9117647 1.9245725 1.9344438
## 2 0.940000 0.9373203 0.9320449 0.9276438 0.9252537 0.9281437 0.9318022
## 3 4.042553 3.8974345 3.4742328 3.2467532 3.1003876 3.1424668 3.5211268
## 4 1.333333 1.3353116 1.3493800 1.3536748 1.3489209 1.3542038 1.3584906
## 5 1.299667 1.2840895 1.2759585 1.2681159 1.2653354 1.2568210 1.2536585
## 6 1.080456 1.0574495 1.0512691 1.0416667 1.0341616 1.0380623 1.0416667

use glimpse function in dplyr to examine the dataset, and use summary function to see if there is missing value

glimpse(df1)
## Observations: 51
## Variables: 14
## $ state <fct> California, Texas, New York, Florida, Illinois, Pennsylv...
## $ size  <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ aJan  <dbl> 1.8674136, 0.9056244, 4.2682584, 1.3180646, 1.2173913, 1...
## $ bFeb  <dbl> 1.8846154, 0.9134432, 4.2000000, 1.3242778, 1.2288786, 1...
## $ cMar  <dbl> 1.9001086, 0.9237706, 3.9166667, 1.3303769, 1.2594270, 1...
## $ dApr  <dbl> 1.9235033, 0.9344262, 4.1900000, 1.3429752, 1.2901892, 1...
## $ eMay  <dbl> 1.9478528, 0.9387974, 4.3078412, 1.3420245, 1.2925563, 1...
## $ fJun  <dbl> 1.9563919, 0.9400000, 4.0425532, 1.3333333, 1.2996666, 1...
## $ gJul  <dbl> 1.9519520, 0.9373203, 3.8974345, 1.3353116, 1.2840895, 1...
## $ hAug  <dbl> 1.9389239, 0.9320449, 3.4742328, 1.3493800, 1.2759585, 1...
## $ iSep  <dbl> 1.9215785, 0.9276438, 3.2467532, 1.3536748, 1.2681159, 1...
## $ jOct  <dbl> 1.9117647, 0.9252537, 3.1003876, 1.3489209, 1.2653354, 1...
## $ kNov  <dbl> 1.9245725, 0.9281437, 3.1424668, 1.3542038, 1.2568210, 1...
## $ lDec  <dbl> 1.9344438, 0.9318022, 3.5211268, 1.3584906, 1.2536585, 1...
summary(df1)
##         state         size           aJan             bFeb       
##  Alabama   : 1   Min.   : 1.0   Min.   :0.7045   Min.   :0.7082  
##  Alaska    : 1   1st Qu.:13.5   1st Qu.:0.8461   1st Qu.:0.8577  
##  Arizona   : 1   Median :26.0   Median :0.9665   Median :0.9722  
##  Arkansas  : 1   Mean   :26.0   Mean   :1.1827   Mean   :1.1938  
##  California: 1   3rd Qu.:38.5   3rd Qu.:1.3010   3rd Qu.:1.2932  
##  Colorado  : 1   Max.   :51.0   Max.   :4.2683   Max.   :4.2000  
##  (Other)   :45                                                   
##       cMar             dApr             eMay             fJun       
##  Min.   :0.7186   Min.   :0.7277   Min.   :0.7246   Min.   :0.7292  
##  1st Qu.:0.8702   1st Qu.:0.8852   1st Qu.:0.8964   1st Qu.:0.8856  
##  Median :0.9900   Median :0.9958   Median :1.0000   Median :0.9997  
##  Mean   :1.2019   Mean   :1.2258   Mean   :1.2326   Mean   :1.2268  
##  3rd Qu.:1.3188   3rd Qu.:1.3363   3rd Qu.:1.3373   3rd Qu.:1.3333  
##  Max.   :3.9167   Max.   :4.1900   Max.   :4.3078   Max.   :4.0426  
##                                                                     
##       gJul             hAug             iSep             jOct       
##  Min.   :0.7283   Min.   :0.7246   Min.   :0.7252   Min.   :0.7253  
##  1st Qu.:0.8832   1st Qu.:0.8786   1st Qu.:0.8760   1st Qu.:0.8799  
##  Median :0.9908   Median :0.9793   Median :0.9759   Median :0.9912  
##  Mean   :1.2148   Mean   :1.1995   Mean   :1.1896   Mean   :1.1794  
##  3rd Qu.:1.3189   3rd Qu.:1.3193   3rd Qu.:1.3106   3rd Qu.:1.3078  
##  Max.   :3.8974   Max.   :3.4742   Max.   :3.2468   Max.   :3.1004  
##                                                                     
##       kNov             lDec       
##  Min.   :0.7333   Min.   :0.7323  
##  1st Qu.:0.8739   1st Qu.:0.8731  
##  Median :0.9866   Median :0.9927  
##  Mean   :1.1789   Mean   :1.1905  
##  3rd Qu.:1.3087   3rd Qu.:1.3090  
##  Max.   :3.1425   Max.   :3.5211  
## 

use gather function from tidyr to make the wide dataset into long, and sort by “size”

df2 <- gather(df1,"month","sales",3:14) %>% 
  arrange(size)
head(df2)
##        state size month    sales
## 1 California    1  aJan 1.867414
## 2 California    1  bFeb 1.884615
## 3 California    1  cMar 1.900109
## 4 California    1  dApr 1.923503
## 5 California    1  eMay 1.947853
## 6 California    1  fJun 1.956392

answer question “overall current distribution”

we can use histgram to see that it is a right skewed distributed, and use summary function to find out the mean, min,max,1st quartille and 3rd quartille.

hist(df2$sales,main="sales",xlab="Averge")

summary(df2$sales)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7045  0.8753  0.9875  1.2013  1.3169  4.3078

answer question “sales trend by months”

use summarise funtion,grouped by“months” to find the average price for each months.

df3 <- df2 %>% 
  arrange(month) %>% 
  group_by(month) %>% 
  summarise(avg=mean(sales))
df3
## # A tibble: 12 x 2
##    month   avg
##    <chr> <dbl>
##  1 aJan   1.18
##  2 bFeb   1.19
##  3 cMar   1.20
##  4 dApr   1.23
##  5 eMay   1.23
##  6 fJun   1.23
##  7 gJul   1.21
##  8 hAug   1.20
##  9 iSep   1.19
## 10 jOct   1.18
## 11 kNov   1.18
## 12 lDec   1.19

from the graph below, we can see that summer is hot season for sales of house, winter is the low season, which is within our expectation.

ggplot(data=df3,aes(x=month,y=avg,group=1))+
  geom_line()+
  geom_point()+
  xlab("Month")+
  ylab("Total")+
  labs(title="Sales trend by month")

answer question" trend by states"

we want to see top 5 largest state’s sales over year.

df6 <- df2 %>% 
  filter(size<=5)
head(df6)
##        state size month    sales
## 1 California    1  aJan 1.867414
## 2 California    1  bFeb 1.884615
## 3 California    1  cMar 1.900109
## 4 California    1  dApr 1.923503
## 5 California    1  eMay 1.947853
## 6 California    1  fJun 1.956392

NY has the hightst sales price per sqf compare to other states, buth the price for other states are more steady.

ggplot(df6,aes(x=month,y=sales,color=state,group=state))+
   geom_line() +
  xlab('Month')+labs(title = "Top 5 states sales trend")