ggplot2 is a very robust, useful and flexible package for descriptive statistics. Advantages of ggplot2:
1- Consistent plot specification at a high level of abstraction.
2- Very flexible.
3- Theme system for polishing plot appearance.
4- Mature and complete graphics system.
5- Great community support and an active mailing list.
6- Easy to draw inferences and highlight any desired metric.
I have noted my inferences alongside the plots as well.
NOTE: The data for this exercise can be downloaded from: http://tutorials.iq.harvard.edu/R/Rgraphics.zip
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.3.3
# Load the state-level housing data (landdata-states.csv).
# The file location defaults to the original author's path but can be
# overridden through the LANDDATA_CSV environment variable, so the script can
# run on another machine without editing the code.
housing_csv <- Sys.getenv("LANDDATA_CSV", unset = "")
if (!nzchar(housing_csv)) {
  housing_csv <- "C:/Users/awasthi/Desktop/landdata-states.csv"
}
# "NA", "#N/A" and empty cells are all read as missing values.
# stringsAsFactors = TRUE keeps State/region as factors on R >= 4.0 too
# (the default changed there), matching the factor output recorded in this
# document ("51 Levels: ...").
housing <- read.csv(housing_csv, header = TRUE, sep = ",",
                    na.strings = c("NA", "#N/A", ""),
                    stringsAsFactors = TRUE)
# Count the missing values in each column
colSums(is.na(housing), na.rm = FALSE)
## State region Date Home.Value
## 0 153 0 0
## Structure.Cost Land.Value Land.Share..Pct. Home.Price.Index
## 0 0 0 0
## Land.Price.Index Year Qrtr
## 0 0 0
# Keep the rows whose region is missing in their own data frame
nullhousing <- housing[is.na(housing$region), ]
head(nullhousing, n = 6L)
## State region Date Home.Value Structure.Cost Land.Value
## 7651 DC <NA> 2003.00 384443 93922 290522
## 7652 DC <NA> 2003.25 399633 93961 305673
## 7653 DC <NA> 2003.50 417110 94032 323078
## 7654 DC <NA> 2003.75 436496 94486 342010
## 7655 DC <NA> 2004.00 457806 95807 361999
## 7656 DC <NA> 2004.25 481171 98379 382792
## Land.Share..Pct. Home.Price.Index Land.Price.Index Year Qrtr
## 7651 75.6 1.469 1.654 2002 4
## 7652 76.5 1.527 1.740 2003 1
## 7653 77.5 1.594 1.839 2003 2
## 7654 78.4 1.668 1.947 2003 3
## 7655 79.1 1.749 2.062 2003 4
## 7656 79.6 1.839 2.182 2004 1
# Which State(s) do the rows with a missing region belong to?
unique(nullhousing[["State"]])
## [1] DC
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA ... WY
# All the NAs come from State = DC; show the region column for the DC rows
housing[which(housing$State == "DC"), 2]
## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [15] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [29] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [43] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [57] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [71] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [85] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [99] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [113] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [127] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [141] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Midwest N. East South West
# Assign DC to the South region, as per the Census Bureau regions map:
# https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
# The column is addressed by name instead of position 2, so the fix keeps
# working if the column order changes. %in% never returns NA, so the logical
# index is always valid for a subscripted assignment.
housing$region[housing$State %in% "DC"] <- "South"
# Total Home.Value per state (sum over all rows of that state)
Hvalue_by_state <- aggregate(Home.Value ~ State, data = housing, FUN = sum)
# The same table ordered from the highest to the lowest total, reused below
sortedd <- Hvalue_by_state[
  with(Hvalue_by_state, order(Home.Value, decreasing = TRUE)),
]
# Top 5 states by total home value (first column of the sorted table)
top5 <- as.list(sortedd[[1]][1:5])
# Top 10 states by total home value
top10states <- as.list(sortedd[[1]][1:10])
# Scatter plot: home value over time for the top 5 states
ggplot(
  housing[housing$State %in% unlist(top5), ],
  aes(x = Date, y = Home.Value, color = State)
) +
  geom_point()
# Inference: Hawaii maintains its high home value after a drop; DC exceeds its
# earlier level after a drop and comes closest to Hawaii. CA has the largest
# drop in its home values.
# Box plot of State vs Home.Value for the top 10 states.
# Box plots provide a graphic display of the five-number summary.
ggplot(
  housing[housing$State %in% unlist(top10states), ],
  aes(x = State, y = Home.Value, color = State)
) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 3) +
  geom_jitter(alpha = 0.3, color = "tomato")
# Inference: Nevada, DC and Hawaii have the highest range. NV has a few
# outliers and Hawaii has a lot of points very close to the end of the whisker.
# The box plot for NV is interesting - it also has a lot of points between the
# 1st and 3rd quartiles.
# Box plot of home value for our 4 regions with outliers highlighted.
# Only one geom_boxplot() layer is used: the original stacked a default
# boxplot underneath the styled one, so every outlier was drawn twice (a
# region-coloured dot beneath the red cross) and the boxes were overdrawn.
# The jittered raw observations are overlaid in a translucent colour.
ggplot(housing, aes(x = region, y = Home.Value, color = region)) +
  geom_boxplot(outlier.colour = 'red', outlier.shape = 3) +
  geom_jitter(alpha = 0.3, color = 'tomato')
# Inferences: Clearly, Midwest is the most affordable region and West is the
# costliest, followed by South. South and West regions are in demand!!
# Create a data frame restricted to a single date/time-frame.
# Date is compared with the number 2007.25 rather than the string '2007.25':
# the string form only matched through implicit number-to-text coercion of
# the whole column, which is fragile (e.g. '2007.50' would never match).
# NOTE(review): assumes Date is read as numeric, as the printed rows suggest.
hp.2007Q1 <- subset(housing, Date == 2007.25)
# Closer look at a scatter plot for 2007 1st Quarter: Land value vs
# Structure Cost
ggplot(hp.2007Q1, aes(x = Land.Value, y = Structure.Cost)) +
  geom_point()
# The x axis has a high variance, so plot against a log transformation of
# Land Value instead
ggplot(hp.2007Q1, aes(x = log(Land.Value), y = Structure.Cost)) +
  geom_point()
# Same plot, with the points colour coded by log(Land.Value)
ggplot(hp.2007Q1,
       aes(x = log(Land.Value), y = Structure.Cost,
           color = log(Land.Value))) +
  geom_point()
# Fit a straight line of Structure.Cost on log(Land.Value) and store the
# model's predictions as a new column so the line can be added to the plot
hp.2007Q1[["pred.SC"]] <- predict(
  lm(formula = Structure.Cost ~ log(Land.Value), data = hp.2007Q1)
)
# Base plot reused by the following examples
p1 <- ggplot(
  data = hp.2007Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
)
# Points (colour and size both follow Home.Value) plus the fitted line
p1 +
  geom_point(aes(color = Home.Value, size = Home.Value)) +
  geom_line(aes(y = pred.SC))
# Inference: Home Value seems to be more correlated with land value and less
# with structure cost. There are points with a high structure cost but a lower
# home value, whereas all the points with a high home value have a high land
# value as well.
# Label every observation with its state instead of drawing a point
p1 +
  geom_text(aes(label = State), size = 2, color = "red")
# install.packages("ggrepel") and library("ggrepel") are required from here on!
# Points together with non-overlapping state labels
p1 +
  geom_point(color = "red") +
  geom_text_repel(aes(label = State), size = 3)
# Inference: Top right corner - you have to be filthy rich to get into those
# states. The states in the bottom left are the least expensive.
# Interesting to see Texas low on both axes.
# Map variables to other aesthetics: colour by Home.Value, shape by region
p1 +
  geom_point(aes(color = Home.Value, shape = region))
# Add one more dimension (size) to the plot; size follows the home value, as
# shown in the legends
p1 +
  geom_point(aes(size = Home.Value, shape = region, color = Home.Value))
# Inference: A lot of big squares and crosses towards the right
# Statistical transformations
# Histogram of Home.Value: the 'bin' stat counts observations in bins of
# width 4000.
p2 <- ggplot(housing, aes(x = Home.Value)) +
  geom_histogram(stat = 'bin', binwidth = 4000)
# The plot was only assigned and never displayed; show it like the other
# plots in this document.
p2
# Bar chart of the pre-computed totals for the ten highest-ranked states;
# stat = "identity" plots the values as they are instead of counting rows
top10 <- sortedd[1:10, ]
ggplot(data = top10, mapping = aes(x = State, y = Home.Value)) +
  geom_bar(stat = "identity")
# The same bar chart with all states included
ggplot(data = Hvalue_by_state, mapping = aes(x = State, y = Home.Value)) +
  geom_bar(stat = "identity")
# Home price index per state, coloured by date; legend on top, small axis text
p3 <- ggplot(data = housing, mapping = aes(x = State, y = Home.Price.Index)) +
  theme(
    legend.position = "top",
    axis.text = element_text(size = 6)
  )
p3 +
  geom_point(aes(color = Date), alpha = 0.5, size = 1.5)
# Jitter the points horizontally (not vertically) to avoid overlapping
p4 <- p3 +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1.5,
    position = position_jitter(width = 0.25, height = 0)
  )
# Colour scale from green (low Date) to red (high Date) to indicate HIGHs
# and LOWs
p4 +
  scale_color_continuous(low = "green", high = "red")
# Inference: The short towers with thick/jittery edges are the states getting
# a lot of investment at less cost.
# Facets
# Home value over time, one line per state
p5 <- ggplot(data = housing, mapping = aes(x = Date, y = Home.Value))
# With every state in a single panel the lines are difficult to distinguish
p5 +
  geom_line(aes(color = State))
# Facets can help here: one small panel per state, 10 panels per row.
# These can be used to see the correlation as well!
p5 +
  geom_line() +
  facet_wrap(facets = ~State, ncol = 10)
# Inference: CA, DC, HI, MA, MD, NJ, NY, OR, VA, WA - are a few to watch for
# the rise in cost
End of document.
Feedback and suggestions are welcome. Thank you!