Load the required libraries to tidy and transform the data for further analysis.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Load the CSV dataset into R from its GitHub URL.
url <- "https://raw.githubusercontent.com/Weicaidata/607/master/State_MedianRentalPricePerSqft_AllHomes.csv"
df1 <- read.csv(url,header=TRUE)
head(df1)
## RegionName SizeRank X2018.01 X2018.02 X2018.03 X2018.04 X2018.05
## 1 California 1 1.8674136 1.8846154 1.9001086 1.9235033 1.9478528
## 2 Texas 2 0.9056244 0.9134432 0.9237706 0.9344262 0.9387974
## 3 New York 3 4.2682584 4.2000000 3.9166667 4.1900000 4.3078412
## 4 Florida 4 1.3180646 1.3242778 1.3303769 1.3429752 1.3420245
## 5 Illinois 5 1.2173913 1.2288786 1.2594270 1.2901892 1.2925563
## 6 Pennsylvania 6 1.0256410 1.0351064 1.0568107 1.0817308 1.0890381
## X2018.06 X2018.07 X2018.08 X2018.09 X2018.10 X2018.11 X2018.12
## 1 1.956392 1.9519520 1.9389239 1.9215785 1.9117647 1.9245725 1.9344438
## 2 0.940000 0.9373203 0.9320449 0.9276438 0.9252537 0.9281437 0.9318022
## 3 4.042553 3.8974345 3.4742328 3.2467532 3.1003876 3.1424668 3.5211268
## 4 1.333333 1.3353116 1.3493800 1.3536748 1.3489209 1.3542038 1.3584906
## 5 1.299667 1.2840895 1.2759585 1.2681159 1.2653354 1.2568210 1.2536585
## 6 1.080456 1.0574495 1.0512691 1.0416667 1.0341616 1.0380623 1.0416667
Rename the columns for readability. Each month name is given a letter prefix (a, b, c, ...) so that alphabetical ordering matches calendar order.
colnames(df1) <- c("state","size","aJan","bFeb","cMar","dApr","eMay","fJun","gJul","hAug","iSep","jOct","kNov","lDec")
head(df1)
## state size aJan bFeb cMar dApr eMay
## 1 California 1 1.8674136 1.8846154 1.9001086 1.9235033 1.9478528
## 2 Texas 2 0.9056244 0.9134432 0.9237706 0.9344262 0.9387974
## 3 New York 3 4.2682584 4.2000000 3.9166667 4.1900000 4.3078412
## 4 Florida 4 1.3180646 1.3242778 1.3303769 1.3429752 1.3420245
## 5 Illinois 5 1.2173913 1.2288786 1.2594270 1.2901892 1.2925563
## 6 Pennsylvania 6 1.0256410 1.0351064 1.0568107 1.0817308 1.0890381
## fJun gJul hAug iSep jOct kNov lDec
## 1 1.956392 1.9519520 1.9389239 1.9215785 1.9117647 1.9245725 1.9344438
## 2 0.940000 0.9373203 0.9320449 0.9276438 0.9252537 0.9281437 0.9318022
## 3 4.042553 3.8974345 3.4742328 3.2467532 3.1003876 3.1424668 3.5211268
## 4 1.333333 1.3353116 1.3493800 1.3536748 1.3489209 1.3542038 1.3584906
## 5 1.299667 1.2840895 1.2759585 1.2681159 1.2653354 1.2568210 1.2536585
## 6 1.080456 1.0574495 1.0512691 1.0416667 1.0341616 1.0380623 1.0416667
Use the glimpse() function from dplyr to examine the structure of the dataset, and the summary() function to check for missing values.
glimpse(df1)
## Observations: 51
## Variables: 14
## $ state <fct> California, Texas, New York, Florida, Illinois, Pennsylv...
## $ size <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ aJan <dbl> 1.8674136, 0.9056244, 4.2682584, 1.3180646, 1.2173913, 1...
## $ bFeb <dbl> 1.8846154, 0.9134432, 4.2000000, 1.3242778, 1.2288786, 1...
## $ cMar <dbl> 1.9001086, 0.9237706, 3.9166667, 1.3303769, 1.2594270, 1...
## $ dApr <dbl> 1.9235033, 0.9344262, 4.1900000, 1.3429752, 1.2901892, 1...
## $ eMay <dbl> 1.9478528, 0.9387974, 4.3078412, 1.3420245, 1.2925563, 1...
## $ fJun <dbl> 1.9563919, 0.9400000, 4.0425532, 1.3333333, 1.2996666, 1...
## $ gJul <dbl> 1.9519520, 0.9373203, 3.8974345, 1.3353116, 1.2840895, 1...
## $ hAug <dbl> 1.9389239, 0.9320449, 3.4742328, 1.3493800, 1.2759585, 1...
## $ iSep <dbl> 1.9215785, 0.9276438, 3.2467532, 1.3536748, 1.2681159, 1...
## $ jOct <dbl> 1.9117647, 0.9252537, 3.1003876, 1.3489209, 1.2653354, 1...
## $ kNov <dbl> 1.9245725, 0.9281437, 3.1424668, 1.3542038, 1.2568210, 1...
## $ lDec <dbl> 1.9344438, 0.9318022, 3.5211268, 1.3584906, 1.2536585, 1...
summary(df1)
## state size aJan bFeb
## Alabama : 1 Min. : 1.0 Min. :0.7045 Min. :0.7082
## Alaska : 1 1st Qu.:13.5 1st Qu.:0.8461 1st Qu.:0.8577
## Arizona : 1 Median :26.0 Median :0.9665 Median :0.9722
## Arkansas : 1 Mean :26.0 Mean :1.1827 Mean :1.1938
## California: 1 3rd Qu.:38.5 3rd Qu.:1.3010 3rd Qu.:1.2932
## Colorado : 1 Max. :51.0 Max. :4.2683 Max. :4.2000
## (Other) :45
## cMar dApr eMay fJun
## Min. :0.7186 Min. :0.7277 Min. :0.7246 Min. :0.7292
## 1st Qu.:0.8702 1st Qu.:0.8852 1st Qu.:0.8964 1st Qu.:0.8856
## Median :0.9900 Median :0.9958 Median :1.0000 Median :0.9997
## Mean :1.2019 Mean :1.2258 Mean :1.2326 Mean :1.2268
## 3rd Qu.:1.3188 3rd Qu.:1.3363 3rd Qu.:1.3373 3rd Qu.:1.3333
## Max. :3.9167 Max. :4.1900 Max. :4.3078 Max. :4.0426
##
## gJul hAug iSep jOct
## Min. :0.7283 Min. :0.7246 Min. :0.7252 Min. :0.7253
## 1st Qu.:0.8832 1st Qu.:0.8786 1st Qu.:0.8760 1st Qu.:0.8799
## Median :0.9908 Median :0.9793 Median :0.9759 Median :0.9912
## Mean :1.2148 Mean :1.1995 Mean :1.1896 Mean :1.1794
## 3rd Qu.:1.3189 3rd Qu.:1.3193 3rd Qu.:1.3106 3rd Qu.:1.3078
## Max. :3.8974 Max. :3.4742 Max. :3.2468 Max. :3.1004
##
## kNov lDec
## Min. :0.7333 Min. :0.7323
## 1st Qu.:0.8739 1st Qu.:0.8731
## Median :0.9866 Median :0.9927
## Mean :1.1789 Mean :1.1905
## 3rd Qu.:1.3087 3rd Qu.:1.3090
## Max. :3.1425 Max. :3.5211
##
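The summary output above shows no NA counts in any column, but an explicit check makes this easier to confirm. A minimal sketch, assuming df1 is loaded as above:
colSums(is.na(df1))  # number of missing values per column; all zeros here
any(is.na(df1))      # overall check; FALSE means no missing values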
Use the gather() function from tidyr to reshape the wide dataset into long format, then sort by "size".
df2 <- gather(df1,"month","sales",3:14) %>%
arrange(size)
head(df2)
## state size month sales
## 1 California 1 aJan 1.867414
## 2 California 1 bFeb 1.884615
## 3 California 1 cMar 1.900109
## 4 California 1 dApr 1.923503
## 5 California 1 eMay 1.947853
## 6 California 1 fJun 1.956392
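gather() works, but it has been superseded in more recent versions of tidyr; a minimal sketch of the same reshaping with pivot_longer(), assuming tidyr 1.0 or later is available (df2_alt is just an illustrative name):
df2_alt <- pivot_longer(df1, cols = 3:14, names_to = "month", values_to = "sales") %>%
  arrange(size)   # same result as df2 above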
A histogram shows that the distribution of prices is right-skewed, and the summary() function reports the minimum, first quartile, median, mean, third quartile, and maximum.
hist(df2$sales, main = "sales", xlab = "Price per sqft")
summary(df2$sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7045 0.8753 0.9875 1.2013 1.3169 4.3078
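The mean (about 1.20) sitting well above the median (about 0.99) is consistent with the right skew seen in the histogram; a quick sketch of that comparison:
mean(df2$sales) > median(df2$sales)  # TRUE: a mean above the median is a common sign of right skew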
Use the summarise() function, grouped by "month", to find the average price for each month.
df3 <- df2 %>%
arrange(month) %>%
group_by(month) %>%
summarise(avg=mean(sales))
df3
## # A tibble: 12 x 2
## month avg
## <chr> <dbl>
## 1 aJan 1.18
## 2 bFeb 1.19
## 3 cMar 1.20
## 4 dApr 1.23
## 5 eMay 1.23
## 6 fJun 1.23
## 7 gJul 1.21
## 8 hAug 1.20
## 9 iSep 1.19
## 10 jOct 1.18
## 11 kNov 1.18
## 12 lDec 1.19
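The letter prefixes keep the months in calendar order when sorted alphabetically, but they clutter the axis labels in the plots. A minimal sketch of an alternative, assuming df3 as built above (df3_labeled and month_labels are illustrative names; the plots below keep the original prefixed names):
month_labels <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
df3_labeled <- df3 %>%
  mutate(month = factor(month, levels = sort(unique(month)), labels = month_labels))  # clean, ordered labels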
From the graph below, we can see that summer is the peak season and winter is the low season for prices, which is in line with our expectation.
ggplot(data=df3,aes(x=month,y=avg,group=1))+
geom_line()+
geom_point()+
xlab("Month")+
ylab("Average price")+
labs(title="Sales trend by month")
Next, we look at the price trends over the year for the five largest states (SizeRank 1 through 5).
df6 <- df2 %>%
filter(size<=5)
head(df6)
## state size month sales
## 1 California 1 aJan 1.867414
## 2 California 1 bFeb 1.884615
## 3 California 1 cMar 1.900109
## 4 California 1 dApr 1.923503
## 5 California 1 eMay 1.947853
## 6 California 1 fJun 1.956392
New York has the highest price per sqft compared to the other states, but the prices in the other states are more stable over the year.
ggplot(df6,aes(x=month,y=sales,color=state,group=state))+
geom_line() +
xlab('Month')+labs(title = "Top 5 states sales trend")