# Plotting and reporting packages used further down the script.
library(ggplot2, quietly = TRUE)
library(knitr, quietly = TRUE)

# Load the data set; data.csv is resolved relative to the working directory.
CCPP <- read.csv(file = "data.csv")

# Number of rows and columns that were read.
dim(CCPP)
## [1] 9568 5
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Per-column summary statistics (n, mean, sd, median, skew, kurtosis, ...).
# The call is namespace-qualified so it is clear which describe() is meant.
psych::describe(CCPP)
## vars n mean sd median trimmed mad min max range
## AT 1 9568 19.65 7.45 20.34 19.77 9.08 1.81 37.11 35.30
## V 2 9568 54.31 12.71 52.08 53.85 16.71 25.36 81.56 56.20
## AP 3 9568 1013.26 5.94 1012.94 1013.11 6.03 992.89 1033.30 40.41
## RH 4 9568 73.31 14.60 74.97 74.07 15.65 25.56 100.16 74.60
## PE 5 9568 454.37 17.07 451.55 453.64 20.56 420.26 495.76 75.50
## skew kurtosis se
## AT -0.14 -1.04 0.08
## V 0.20 -1.44 0.13
## AP 0.27 0.09 0.06
## RH -0.43 -0.45 0.15
## PE 0.31 -1.05 0.17
Features consist of hourly average ambient variables
1. Temperature (AT) in the range 1.81°C to 37.11°C,
2. Ambient Pressure (AP) in the range 992.89-1033.30 millibar,
3. Relative Humidity (RH) in the range 25.56% to 100.16%
4. Exhaust Vacuum (V) in the range 25.36-81.56 cm Hg
5. Net hourly electrical energy output (PE) in the range 420.26-495.76 MW
The averages are taken from various sensors located around the plant that record the ambient variables every second. The variables are given without normalization. Almost all of the values of each variable are distinct, and none of the variables is categorical. (There is no point in making contingency tables.)
# Put the columns of CCPP on the search path. This is kept because later
# statements in this script refer to AT, V, AP, RH and PE by bare name.
attach(CCPP)

# Lay the five box plots out on a 2 x 3 grid.
par(mfrow = c(2, 3))

# Column name -> plot title, in drawing order.
boxplot_titles <- c(
  AT = "Temperature",
  V = "Exhaust Vacuum",
  AP = "Ambient Pressure",
  RH = "Relative Humidity",
  PE = "Electric Energy Output of the Plant"
)

# One horizontal box plot per variable.
for (column in names(boxplot_titles)) {
  boxplot(
    CCPP[[column]],
    main = boxplot_titles[[column]],
    horizontal = TRUE,
    col = "tomato3"
  )
}
# Lay the five histograms out on a 2 x 3 grid.
par(mfrow = c(2, 3))

# Column name -> plot title, in drawing order.
hist_titles <- c(
  AT = "Ambient Temperature",
  V = "Exhaust Vacuum",
  AP = "Ambient Pressure",
  RH = "Relative Humidity",
  PE = "Electric Energy Output of the Plant"
)

# Density-scaled histogram of each variable with a kernel density estimate
# drawn on top.
for (column in names(hist_titles)) {
  values <- CCPP[[column]]
  hist(
    values,
    main = hist_titles[[column]],
    xlab = column,
    breaks = 20,
    col = "yellow",
    freq = FALSE
  )
  # Let density() choose a data-driven bandwidth. A single fixed bandwidth of
  # 10 (in each variable's own units) is far too wide for these columns and
  # flattens the curve so that it no longer follows the histogram.
  lines(density(values), col = "red", lwd = 2)
}
# NOTE: par() only affects base graphics; ggplot2 draws with grid, so this
# setting does not arrange the four plots below on one page.
par(mfrow = c(2, 2))

# Scatter plot of each predictor against the energy output (PE) with a fitted
# straight line. geom_smooth() takes the fitting function through `method`;
# the formula is spelled out so that no "using formula" message is printed.
ggplot(CCPP, aes(x = AT, y = PE)) +
  geom_point(col = "coral") +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Ambient Temperature", y = "Electric Energy Output of the Plant")

ggplot(CCPP, aes(x = V, y = PE)) +
  geom_point(col = "coral") +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Exhaust Vacuum", y = "Electric Energy Output of the Plant")

ggplot(CCPP, aes(x = AP, y = PE)) +
  geom_point(col = "coral") +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Ambient Pressure", y = "Electric Energy Output of the Plant")

ggplot(CCPP, aes(x = RH, y = PE)) +
  geom_point(col = "coral") +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Relative Humidity", y = "Electric Energy Output of the Plant")
# Pairwise correlations between all columns of CCPP, rounded to two decimals.
# The object name is referenced by later statements, so it is left unchanged.
matrix <- round(cor(CCPP), digits = 2)
# Keep the lower triangle (and diagonal) of a matrix.
#
# matrix: the matrix to mask.
# Returns a copy of `matrix` in which every entry above the diagonal is NA.
get_lower_tri <- function(matrix) {
  above_diagonal <- upper.tri(matrix)
  matrix[above_diagonal] <- NA
  matrix
}
# Keep the upper triangle (and diagonal) of a matrix.
#
# matrix: the matrix to mask.
# Returns a copy of `matrix` in which every entry below the diagonal is NA.
get_upper_tri <- function(matrix) {
  below_diagonal <- lower.tri(matrix)
  matrix[below_diagonal] <- NA
  matrix
}
# reshape2 provides melt(), used here and again further down.
library(reshape2)

# Blank out the entries below the diagonal, then reshape the matrix to long
# form, dropping the blanked (NA) cells.
upper_tri <- get_upper_tri(matrix)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap
library(ggplot2)
# Reordering the correlation matrix according to the correlation coefficients
# is useful for revealing hidden patterns in the matrix. The function below
# uses hclust (hierarchical clustering) to obtain that order.
#
# cormat: a square correlation matrix.
# Returns `cormat` with its rows and columns permuted into the clustering order.
reorder_cormat <- function(cormat) {
  # Use correlation between variables as distance:
  # (1 - r) / 2 is 0 for r = 1 and 1 for r = -1.
  dd <- as.dist((1 - cormat) / 2)
  hc <- hclust(dd)
  # Return the permuted matrix as a plain expression so the result is visible
  # (ending on an assignment would return it invisibly).
  cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(matrix)
# Take the upper triangle of the REORDERED matrix; using the original matrix
# here would silently throw the reordering away.
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap: one tile per variable pair, filled by the correlation
# value on a diverging blue-white-red scale fixed to the range [-1, 1].
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0,
    # Spelled out in full; `limit` only worked through partial matching.
    limits = c(-1, 1),
    space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() + # minimal theme
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  ) +
  # Equal axis scaling so the tiles are drawn as squares.
  coord_fixed()
# Print each correlation coefficient on top of its tile.
coefficient_labels <- geom_text(
  aes(Var2, Var1, label = value),
  color = "black",
  size = 4
)

# Blank theme: no axis titles, grid lines, border, background or ticks; the
# legend is laid out horizontally and placed inside the plotting area.
bare_theme <- theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 0),
  legend.position = c(0.6, 0.7),
  legend.direction = "horizontal"
)

# Colour bar with its title centred above it.
legend_bar <- guides(
  fill = guide_colorbar(
    barwidth = 7,
    barheight = 1,
    title.position = "top",
    title.hjust = 0.5
  )
)

# Assemble and draw the annotated heatmap.
ggheatmap + coefficient_labels + bare_theme + legend_bar
library(hexbin)
library(RColorBrewer)

# Hexagonal binning of each predictor against the energy output (PE), with 40
# bins along the x axis. Columns are taken from CCPP explicitly so this part
# does not depend on the data frame being attached.
bin1 <- hexbin(CCPP$AT, CCPP$PE, xbins = 40)
bin2 <- hexbin(CCPP$V, CCPP$PE, xbins = 40)
bin3 <- hexbin(CCPP$AP, CCPP$PE, xbins = 40)
bin4 <- hexbin(CCPP$RH, CCPP$PE, xbins = 40)

# Colour ramp built from the reversed 11-colour "Spectral" palette.
my_colors <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

plot(bin1, main = "", xlab = "Ambient Temperature", ylab = "",
     colramp = my_colors)
plot(bin2, main = "", xlab = "Exhaust Vacuum", ylab = "",
     colramp = my_colors)
plot(bin3, main = "", xlab = "Ambient Pressure", ylab = "",
     colramp = my_colors)
plot(bin4, main = "", xlab = "Relative Humidity", ylab = "",
     colramp = my_colors)
# A chi-squared test of independence needs a contingency table of categories
# with reasonably large expected counts. Tabulating the raw continuous
# readings yields a table with millions of nearly empty cells, for which
# chisq.test() warns that the approximation may be incorrect. Each variable is
# therefore cut into quantile-based bins before tabulating.
n_bins <- 5L

# Cut a numeric vector into (at most) `n` bins holding roughly equal numbers
# of observations. Duplicate quantiles are collapsed so the breaks are unique.
bin_by_quantile <- function(x, n = n_bins) {
  breaks <- unique(quantile(x, probs = seq(0, 1, length.out = n + 1L)))
  cut(x, breaks = breaks, include.lowest = TRUE)
}

# Binned energy output, shared by all four tables.
pe_bins <- bin_by_quantile(CCPP$PE)

tbl1 <- table(AT = bin_by_quantile(CCPP$AT), PE = pe_bins)
tbl2 <- table(V = bin_by_quantile(CCPP$V), PE = pe_bins)
tbl3 <- table(AP = bin_by_quantile(CCPP$AP), PE = pe_bins)
tbl4 <- table(RH = bin_by_quantile(CCPP$RH), PE = pe_bins)

# Test each binned predictor for independence from the binned energy output.
chisq.test(tbl1)
chisq.test(tbl2)
chisq.test(tbl3)
chisq.test(tbl4)