include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks.
echo = FALSE prevents code, but not the results, from appearing in the finished file. This is a useful way to embed figures.
message = FALSE prevents messages that are generated by code from appearing in the finished file.
warning = FALSE prevents warnings that are generated by code from appearing in the finished file.
fig.cap = "..." adds a caption to graphical results.
setwd("C:/Users/12267/Desktop/UWindsor/Winter 2021/MSCI 3230 Data Science Tools & Methods/RSTUDIO Work") # Always set the working directory at the beginning
#############List all libraries used
library(ggplot2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(reshape)
##########################
# ---- Load the data ----
# Read the WestRoxbury CSV; the path is relative to the working directory.
df <- read.csv(file = "data/WestRoxbury (2).csv")
# Show the column names as a one-column character matrix so that each name
# is printed on its own numbered row.
as.matrix(names(df))
## [,1]
## [1,] "TOTAL.VALUE"
## [2,] "TAX"
## [3,] "LOT.SQFT"
## [4,] "YR.BUILT"
## [5,] "GROSS.AREA"
## [6,] "LIVING.AREA"
## [7,] "FLOORS"
## [8,] "ROOMS"
## [9,] "BEDROOMS"
## [10,] "FULL.BATH"
## [11,] "HALF.BATH"
## [12,] "KITCHEN"
## [13,] "FIREPLACE"
## [14,] "REMODEL"
# Rename LOT.SQFT to LOT.SIZE. The column is located by name rather than by
# position, so the rename cannot land on the wrong column if the CSV column
# order changes; stop early with a clear error if the column is absent.
stopifnot("LOT.SQFT" %in% names(df))
names(df)[names(df) == "LOT.SQFT"] <- "LOT.SIZE"
# Lower-case every column name (e.g. TOTAL.VALUE becomes total.value).
names(df) <- tolower(names(df))
# Boxplots for quantitative columns.
# NOTE: base-graphics par(mfrow = ...) does not affect ggplot2 output, so this
# plot is drawn as its own figure.
ggplot(df, aes(x = "", y = total.value)) +
  geom_boxplot(
    fill = "green", color = "blue",
    outlier.color = "red", outlier.shape = "o", width = 0.2
  ) +
  # x = "" suppresses the placeholder axis title produced by the constant
  # x mapping; the y label previously read "Value in (in 000s)".
  labs(
    title = "Distribution of Total Property Value",
    x = "", y = "Value (in 000s)"
  ) +
  # coord_cartesian() zooms the y axis without dropping observations, so the
  # box statistics still use every row; points outside 100-1200 are not drawn.
  coord_cartesian(ylim = c(100, 1200))
Figure 1.1
# Boxplot of total.value for each remodel category, one fill colour per
# category; outliers are drawn as red "o" symbols.
ggplot(df, aes(x = remodel, y = total.value, fill = remodel)) +
  geom_boxplot(outlier.color = "red", outlier.shape = "o") +
  labs(
    title = "Distribution of Total value",
    x = "Remodel types",
    y = "Value (in 000s)" # previously "Value in (in 000s)" (duplicated "in")
  )
Figure 1.2
# Single boxplot of lot.size; the constant x mapping places all rows in one
# box, and the empty x label hides the axis title for that placeholder.
ggplot(data = df, mapping = aes(y = lot.size, x = "")) +
  geom_boxplot(
    width = 0.2,
    outlier.shape = "o",
    outlier.color = "red",
    color = "blue",
    fill = "green"
  ) +
  labs(y = "Lot size (Sq Ft)", x = "", title = "Distribution of Lot Size")
# ---- Scatter plots ----
# total.value against lot.size.
ggplot(data = df, mapping = aes(x = lot.size, y = total.value)) +
  geom_point()
# total.value against gross.area; point colour follows floors as a factor.
ggplot(data = df, mapping = aes(x = gross.area, y = total.value)) +
  geom_point(mapping = aes(colour = factor(floors)))
# total.value against living.area; point colour follows floors as a factor.
ggplot(data = df, mapping = aes(x = living.area, y = total.value)) +
  geom_point(mapping = aes(colour = factor(floors)))
# living.area against gross.area.
ggplot(data = df, mapping = aes(x = gross.area, y = living.area)) +
  geom_point()
## Bar charts
# Number of rows in each remodel category.
ggplot(data = df, mapping = aes(x = remodel, fill = remodel)) +
  geom_bar()
# Bar height is the maximum total.value within each remodel category.
ggplot(
  data = df,
  mapping = aes(x = remodel, y = total.value, fill = remodel)
) +
  geom_bar(fun = "max", stat = "summary")
# The same per-category maxima, printed as a table.
aggregate(total.value ~ remodel, data = df, FUN = max)
## remodel total.value
## 1 None 1217.8
## 2 Old 815.3
## 3 Recent 935.1
## Histogram of total.value with bins of width 50 (red bars, blue outlines).
ggplot(data = df, mapping = aes(x = total.value)) +
  geom_histogram(binwidth = 50, colour = "blue", fill = "red")
## heatmap with values
# Print the current column names, one per numbered row, before selecting the
# heatmap columns.
as.matrix(names(df))
## [,1]
## [1,] "total.value"
## [2,] "tax"
## [3,] "lot.size"
## [4,] "yr.built"
## [5,] "gross.area"
## [6,] "living.area"
## [7,] "floors"
## [8,] "rooms"
## [9,] "bedrooms"
## [10,] "full.bath"
## [11,] "half.bath"
## [12,] "kitchen"
## [13,] "fireplace"
## [14,] "remodel"
# Columns to correlate (cor() requires them to be numeric).
df1 <- df[, c(
  "total.value", "lot.size", "gross.area",
  "living.area", "floors", "rooms"
)]
# Compute the correlation matrix once and reuse it for both the cell colours
# and the printed cell values (it was previously computed twice).
cor_mat <- cor(df1)
heatmap.2(
  cor_mat,
  # No row/column reordering and no dendrogram: cells stay in df1 column order.
  Rowv = FALSE, Colv = FALSE, dendrogram = "none",
  # Print each correlation, rounded to two decimals, inside its cell.
  cellnote = round(cor_mat, 2),
  notecol = "black", key = FALSE, trace = "none", margins = c(10, 10)
)